From 1e04951206d9edc4965540e34185643022b8c67f Mon Sep 17 00:00:00 2001 From: otaj <6065855+otaj@users.noreply.github.com> Date: Sun, 28 Aug 2022 12:56:37 +0200 Subject: [PATCH 001/193] Remove deprecated `TrainerCallbackHookMixin` (#14401) * remove deprecated callback hook * changelog --- pyproject.toml | 1 - src/pytorch_lightning/CHANGELOG.md | 3 + .../trainer/callback_hook.py | 670 ------------------ src/pytorch_lightning/trainer/trainer.py | 2 - .../deprecated_api/test_remove_1-8.py | 97 --- 5 files changed, 3 insertions(+), 770 deletions(-) delete mode 100644 src/pytorch_lightning/trainer/callback_hook.py diff --git a/pyproject.toml b/pyproject.toml index 1f704e7aa20ad..19702524bc62e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,7 +52,6 @@ module = [ "pytorch_lightning.callbacks.progress.rich_progress", "pytorch_lightning.profilers.base", "pytorch_lightning.profilers.pytorch", - "pytorch_lightning.trainer.callback_hook", "pytorch_lightning.trainer.supporters", "pytorch_lightning.trainer.trainer", "pytorch_lightning.tuner.batch_size_scaling", diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index d6c233e4a17ca..845c7fd88bc61 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -101,6 +101,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Removed deprecated support for old torchtext versions ([#14375](https://github.com/Lightning-AI/lightning/pull/14375)) +- Removed the deprecated class `TrainerCallbackHookMixin` ([#14401](https://github.com/Lightning-AI/lightning/14401)) + + ### Fixed - Fixed `LightningDataModule` hparams parsing ([#12806](https://github.com/PyTorchLightning/pytorch-lightning/pull/12806)) diff --git a/src/pytorch_lightning/trainer/callback_hook.py b/src/pytorch_lightning/trainer/callback_hook.py deleted file mode 100644 index 1e455b3424606..0000000000000 --- a/src/pytorch_lightning/trainer/callback_hook.py +++ /dev/null @@ -1,670 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from abc import ABC -from copy import deepcopy -from typing import Any, Dict, List, Optional, Type, Union - -from packaging.version import Version -from torch import Tensor - -import pytorch_lightning as pl -from pytorch_lightning.callbacks import Callback -from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_warn -from pytorch_lightning.utilities.types import STEP_OUTPUT - - -class TrainerCallbackHookMixin(ABC): - r""" - .. deprecated:: v1.6 - The `TrainerCallbackHookMixin` class was deprecated in v1.6 and will be removed in v1.8. - """ - - # this is just a summary on variables used in this abstract class, - # the proper values/initialisation should be done in child class - callbacks: List[Callback] = [] - lightning_module: "pl.LightningModule" - - def on_before_accelerator_backend_setup(self) -> None: - r""" - .. 
deprecated:: v1.6 - `TrainerCallbackHookMixin.on_before_accelerator_backend_setup` was deprecated in v1.6 - and will be removed in v1.8. - - Called at the beginning of fit (train + validate), validate, test, or predict, or tune. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_before_accelerator_backend_setup` was deprecated in v1.6 " - "and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_before_accelerator_backend_setup(self, self.lightning_module) - - def on_configure_sharded_model(self) -> None: - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_configure_sharded_model` was deprecated in v1.6 and will be removed in v1.8. - - Called at the beginning of fit (train + validate), validate, test, or predict, or tune. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_configure_sharded_model` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_configure_sharded_model(self, self.lightning_module) - - def setup(self, stage: Optional[str]) -> None: - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.setup` was deprecated in v1.6 and will be removed in v1.8. - - Called at the beginning of fit (train + validate), validate, test, or predict, or tune. - """ - rank_zero_deprecation("`TrainerCallbackHookMixin.setup` was deprecated in v1.6 and will be removed in v1.8.") - for callback in self.callbacks: - callback.setup(self, self.lightning_module, stage=stage) - - def teardown(self, stage: Optional[str] = None) -> None: - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.teardown` was deprecated in v1.6 and will be removed in v1.8. - - Called at the end of fit (train + validate), validate, test, or predict, or tune. - """ - rank_zero_deprecation("`TrainerCallbackHookMixin.teardown` was deprecated in v1.6 and will be removed in v1.8.") - for callback in self.callbacks: - callback.teardown(self, self.lightning_module, stage=stage) - - def on_init_start(self): - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_init_start` was deprecated in v1.6 and will be removed in v1.8. - - Called when the trainer initialization begins, model has not yet been set. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_init_start` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_init_start(self) - - def on_init_end(self): - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_init_end` was deprecated in v1.6 and will be removed in v1.8. - - Called when the trainer initialization ends, model has not yet been set. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_init_end` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_init_end(self) - - def on_fit_start(self): - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_fit_start` was deprecated in v1.6 and will be removed in v1.8. - - Called when fit begins. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_fit_start` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_fit_start(self, self.lightning_module) - - def on_fit_end(self): - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_fit_end` was deprecated in v1.6 and will be removed in v1.8. - - Called when fit ends. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_fit_end` was deprecated in v1.6 and will be removed in v1.8." 
- ) - for callback in self.callbacks: - callback.on_fit_end(self, self.lightning_module) - - def on_sanity_check_start(self): - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_sanity_check_start` was deprecated in v1.6 and will be removed in v1.8. - - Called when the validation sanity check starts. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_sanity_check_start` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_sanity_check_start(self, self.lightning_module) - - def on_sanity_check_end(self): - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_sanity_check_end` was deprecated in v1.6 and will be removed in v1.8. - - Called when the validation sanity check ends. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_sanity_check_end` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_sanity_check_end(self, self.lightning_module) - - def on_train_epoch_start(self): - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_train_epoch_start` was deprecated in v1.6 and will be removed in v1.8. - - Called when the epoch begins. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_train_epoch_start` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_train_epoch_start(self, self.lightning_module) - - def on_train_epoch_end(self): - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_train_epoch_end` was deprecated in v1.6 and will be removed in v1.8. - - Called when the epoch ends. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_train_epoch_end` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_train_epoch_end(self, self.lightning_module) - - def on_validation_epoch_start(self): - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_validation_epoch_start` was deprecated in v1.6 and will be removed in v1.8. - - Called when the epoch begins. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_validation_epoch_start` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_validation_epoch_start(self, self.lightning_module) - - def on_validation_epoch_end(self): - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_validation_epoch_end` was deprecated in v1.6 and will be removed in v1.8. - - Called when the validation epoch ends. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_validation_epoch_end` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_validation_epoch_end(self, self.lightning_module) - - def on_test_epoch_start(self): - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_test_epoch_start` was deprecated in v1.6 and will be removed in v1.8. - - Called when the epoch begins. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_test_epoch_start` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_test_epoch_start(self, self.lightning_module) - - def on_test_epoch_end(self): - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_test_epoch_end` was deprecated in v1.6 and will be removed in v1.8. - - Called when the test epoch ends. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_test_epoch_end` was deprecated in v1.6 and will be removed in v1.8." 
- ) - for callback in self.callbacks: - callback.on_test_epoch_end(self, self.lightning_module) - - def on_predict_epoch_start(self) -> None: - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_predict_epoch_start` was deprecated in v1.6 and will be removed in v1.8. - - Called when the epoch begins. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_predict_epoch_start` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_predict_epoch_start(self, self.lightning_module) - - def on_predict_epoch_end(self, outputs: List[Any]) -> None: - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_predict_epoch_end` was deprecated in v1.6 and will be removed in v1.8. - - Called when the epoch ends. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_predict_epoch_end` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_predict_epoch_end(self, self.lightning_module, outputs) - - def on_epoch_start(self): - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_epoch_start` was deprecated in v1.6 and will be removed in v1.8. - - Called when either of train/val/test epoch begins. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_epoch_start` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_epoch_start(self, self.lightning_module) - - def on_epoch_end(self): - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_epoch_end` was deprecated in v1.6 and will be removed in v1.8. - - Called when either of train/val/test epoch ends. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_epoch_end` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_epoch_end(self, self.lightning_module) - - def on_train_start(self): - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_train_start` was deprecated in v1.6 and will be removed in v1.8. - - Called when the train begins. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_train_start` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_train_start(self, self.lightning_module) - - def on_train_end(self): - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_train_end` was deprecated in v1.6 and will be removed in v1.8. - - Called when the train ends. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_train_end` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_train_end(self, self.lightning_module) - - def on_pretrain_routine_start(self) -> None: - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_pretrain_routine_start` was deprecated in v1.6 and will be removed in v1.8. - - Called when the pre-train routine begins. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_pretrain_routine_start` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_pretrain_routine_start(self, self.lightning_module) - - def on_pretrain_routine_end(self) -> None: - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_pretrain_routine_end` was deprecated in v1.6 and will be removed in v1.8. - - Called when the pre-train routine ends. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_pretrain_routine_end` was deprecated in v1.6 and will be removed in v1.8." 
- ) - for callback in self.callbacks: - callback.on_pretrain_routine_end(self, self.lightning_module) - - def on_batch_start(self): - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_batch_start` was deprecated in v1.6 and will be removed in v1.8. - - Called when the training batch begins. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_batch_start` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_batch_start(self, self.lightning_module) - - def on_batch_end(self): - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_batch_end` was deprecated in v1.6 and will be removed in v1.8. - - Called when the training batch ends. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_batch_end` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_batch_end(self, self.lightning_module) - - def on_train_batch_start(self, batch, batch_idx): - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_train_batch_start` was deprecated in v1.6 and will be removed in v1.8. - - Called when the training batch begins. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_train_batch_start` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_train_batch_start(self, self.lightning_module, batch, batch_idx) - - def on_train_batch_end(self, outputs: STEP_OUTPUT, batch, batch_idx): - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_train_batch_end` was deprecated in v1.6 and will be removed in v1.8. - - Called when the training batch ends. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_train_batch_end` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_train_batch_end(self, self.lightning_module, outputs, batch, batch_idx) - - def on_validation_batch_start(self, batch, batch_idx, dataloader_idx): - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_validation_batch_start` was deprecated in v1.6 and will be removed in v1.8. - - Called when the validation batch begins. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_validation_batch_start` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_validation_batch_start(self, self.lightning_module, batch, batch_idx, dataloader_idx) - - def on_validation_batch_end(self, outputs: STEP_OUTPUT, batch, batch_idx, dataloader_idx): - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_validation_batch_end` was deprecated in v1.6 and will be removed in v1.8. - - Called when the validation batch ends. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_validation_batch_end` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_validation_batch_end(self, self.lightning_module, outputs, batch, batch_idx, dataloader_idx) - - def on_test_batch_start(self, batch, batch_idx, dataloader_idx): - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_test_batch_start` was deprecated in v1.6 and will be removed in v1.8. - - Called when the test batch begins. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_test_batch_start` was deprecated in v1.6 and will be removed in v1.8." 
- ) - for callback in self.callbacks: - callback.on_test_batch_start(self, self.lightning_module, batch, batch_idx, dataloader_idx) - - def on_test_batch_end(self, outputs: STEP_OUTPUT, batch, batch_idx, dataloader_idx): - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_test_batch_end` was deprecated in v1.6 and will be removed in v1.8. - - Called when the test batch ends. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_test_batch_end` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_test_batch_end(self, self.lightning_module, outputs, batch, batch_idx, dataloader_idx) - - def on_predict_batch_start(self, batch: Any, batch_idx: int, dataloader_idx: int) -> None: - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_predict_batch_start` was deprecated in v1.6 and will be removed in v1.8. - - Called when the predict batch begins. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_predict_batch_start` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_predict_batch_start(self, self.lightning_module, batch, batch_idx, dataloader_idx) - - def on_predict_batch_end(self, outputs: STEP_OUTPUT, batch: Any, batch_idx: int, dataloader_idx: int) -> None: - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_predict_batch_end` was deprecated in v1.6 and will be removed in v1.8. - - Called when the predict batch ends. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_predict_batch_end` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_predict_batch_end(self, self.lightning_module, outputs, batch, batch_idx, dataloader_idx) - - def on_validation_start(self): - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_validation_start` was deprecated in v1.6 and will be removed in v1.8. - - Called when the validation loop begins. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_validation_start` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_validation_start(self, self.lightning_module) - - def on_validation_end(self): - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_validation_end` was deprecated in v1.6 and will be removed in v1.8. - - Called when the validation loop ends. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_validation_end` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_validation_end(self, self.lightning_module) - - def on_test_start(self): - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_test_start` was deprecated in v1.6 and will be removed in v1.8. - - Called when the test begins. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_test_start` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_test_start(self, self.lightning_module) - - def on_test_end(self): - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_test_end` was deprecated in v1.6 and will be removed in v1.8. - - Called when the test ends. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_test_end` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_test_end(self, self.lightning_module) - - def on_predict_start(self) -> None: - r""" - .. 
deprecated:: v1.6 - `TrainerCallbackHookMixin.on_predict_start` was deprecated in v1.6 and will be removed in v1.8. - - Called when predict begins. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_predict_start` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_predict_start(self, self.lightning_module) - - def on_predict_end(self) -> None: - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_predict_end` was deprecated in v1.6 and will be removed in v1.8. - - Called when predict ends. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_predict_end` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_predict_end(self, self.lightning_module) - - def on_exception(self, exception: BaseException) -> None: - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_exception` was deprecated in v1.6 and will be removed in v1.8. - - Called when any trainer execution is interrupted by an exception. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_exception` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_exception(self, self.lightning_module, exception) - - def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> Dict[str, dict]: - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_save_checkpoint` was deprecated in v1.6 and will be removed in v1.8. - - Called when saving a model checkpoint. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_save_checkpoint` was deprecated in v1.6 and will be removed in v1.8." - ) - callback_states = {} - for callback in self.callbacks: - state = callback.on_save_checkpoint(self, self.lightning_module, checkpoint) - if state: - callback_states[callback.state_key] = state - return callback_states - - def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None: - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_load_checkpoint` was deprecated in v1.6 and will be removed in v1.8. - - Called when loading a model checkpoint. - """ - # Todo: the `callback_states` are dropped with TPUSpawn as they - # can't be saved using `xm.save` - # https://github.com/pytorch/xla/issues/2773 - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_load_checkpoint` was deprecated in v1.6 and will be removed in v1.8." - ) - callback_states: Dict[Union[Type, str], Dict] = checkpoint.get("callbacks") - - if callback_states is None: - return - - is_legacy_ckpt = Version(checkpoint["pytorch-lightning_version"]) < Version("1.5.0dev") - current_callbacks_keys = {cb._legacy_state_key if is_legacy_ckpt else cb.state_key for cb in self.callbacks} - difference = callback_states.keys() - current_callbacks_keys - if difference: - rank_zero_warn( - "Be aware that when using `ckpt_path`," - " callbacks used to create the checkpoint need to be provided during `Trainer` instantiation." - f" Please add the following callbacks: {list(difference)}.", - ) - - for callback in self.callbacks: - state = callback_states.get(callback.state_key, callback_states.get(callback._legacy_state_key)) - if state: - state = deepcopy(state) - callback.on_load_checkpoint(self, self.lightning_module, state) - - def on_before_backward(self, loss: Tensor) -> None: - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_before_backward` was deprecated in v1.6 and will be removed in v1.8. - - Called before ``loss.backward()``. 
- """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_before_backward` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_before_backward(self, self.lightning_module, loss) - - def on_after_backward(self): - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_after_backward` was deprecated in v1.6 and will be removed in v1.8. - - Called after loss.backward() and before optimizers do anything. - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_after_backward` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_after_backward(self, self.lightning_module) - - def on_before_optimizer_step(self, optimizer, optimizer_idx): - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_before_optimizer_step` was deprecated in v1.6 and will be removed in v1.8. - - Called after on_after_backward() once the gradient is accumulated and before optimizer.step(). - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_before_optimizer_step` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_before_optimizer_step(self, self.lightning_module, optimizer, optimizer_idx) - - def on_before_zero_grad(self, optimizer): - r""" - .. deprecated:: v1.6 - `TrainerCallbackHookMixin.on_before_zero_grad` was deprecated in v1.6 and will be removed in v1.8. - - Called after optimizer.step() and before optimizer.zero_grad(). - """ - rank_zero_deprecation( - "`TrainerCallbackHookMixin.on_before_zero_grad` was deprecated in v1.6 and will be removed in v1.8." - ) - for callback in self.callbacks: - callback.on_before_zero_grad(self, self.lightning_module, optimizer) diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index 4c9456e8ead37..13817deaf64d1 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -70,7 +70,6 @@ XLAProfiler, ) from pytorch_lightning.strategies import ParallelStrategy, Strategy -from pytorch_lightning.trainer.callback_hook import TrainerCallbackHookMixin from pytorch_lightning.trainer.configuration_validator import verify_loop_configurations from pytorch_lightning.trainer.connectors.accelerator_connector import _LITERAL_WARN, AcceleratorConnector from pytorch_lightning.trainer.connectors.callback_connector import CallbackConnector @@ -127,7 +126,6 @@ class Trainer( - TrainerCallbackHookMixin, # TODO: Remove in v1.8 TrainerOptimizersMixin, # TODO: Remove in v1.8 TrainerDataLoadingMixin, # TODO: Remove in v1.8 ): diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py index 07223b88b7710..d3a014177dabc 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py @@ -20,7 +20,6 @@ import numpy as np import pytest import torch -from torch import optim import pytorch_lightning from pytorch_lightning import Callback, Trainer @@ -142,102 +141,6 @@ def test_v1_8_0_trainer_optimizers_mixin(): trainer.convert_to_lightning_optimizers() -def test_v1_8_0_deprecate_trainer_callback_hook_mixin(): - methods_with_self = [ - "on_before_accelerator_backend_setup", - "on_configure_sharded_model", - "on_init_start", - "on_init_end", - "on_fit_start", - "on_fit_end", - "on_sanity_check_start", - "on_sanity_check_end", - "on_train_epoch_start", - "on_train_epoch_end", - "on_validation_epoch_start", - 
"on_validation_epoch_end", - "on_test_epoch_start", - "on_test_epoch_end", - "on_predict_epoch_start", - "on_epoch_start", - "on_epoch_end", - "on_train_start", - "on_train_end", - "on_pretrain_routine_start", - "on_pretrain_routine_end", - "on_batch_start", - "on_batch_end", - "on_validation_start", - "on_validation_end", - "on_test_start", - "on_test_end", - "on_predict_start", - "on_predict_end", - "on_after_backward", - ] - methods_with_stage = [ - "setup", - "teardown", - ] - methods_with_batch_batch_idx_dataloader_idx = [ - "on_validation_batch_start", - "on_test_batch_start", - "on_predict_batch_start", - ] - methods_with_outputs_batch_batch_idx_dataloader_idx = [ - "on_validation_batch_end", - "on_test_batch_end", - "on_predict_batch_end", - ] - methods_with_checkpoint = ["on_save_checkpoint", "on_load_checkpoint"] - trainer = Trainer( - max_epochs=1, - limit_val_batches=0.1, - limit_train_batches=0.2, - enable_progress_bar=False, - logger=False, - ) - model = BoringModel() - # need to attach model to trainer for testing of `on_pretrain_routine_start` - trainer.strategy.connect(model) - for method_name in methods_with_self: - fn = getattr(trainer, method_name, None) - with pytest.deprecated_call(match="was deprecated in v1.6 and will be removed in v1.8"): - fn() - for method_name in methods_with_stage: - fn = getattr(trainer, method_name) - with pytest.deprecated_call(match="was deprecated in v1.6 and will be removed in v1.8"): - fn(stage="test") - for method_name in methods_with_batch_batch_idx_dataloader_idx: - fn = getattr(trainer, method_name) - with pytest.deprecated_call(match="was deprecated in v1.6 and will be removed in v1.8"): - fn(batch={}, batch_idx=0, dataloader_idx=0) - for method_name in methods_with_outputs_batch_batch_idx_dataloader_idx: - fn = getattr(trainer, method_name) - with pytest.deprecated_call(match="was deprecated in v1.6 and will be removed in v1.8"): - fn(outputs=torch.tensor([[1.0, -1.0], [1.0, -1.0]]), batch={}, batch_idx=0, dataloader_idx=0) - for method_name in methods_with_checkpoint: - fn = getattr(trainer, method_name) - with pytest.deprecated_call(match="was deprecated in v1.6 and will be removed in v1.8"): - fn(checkpoint={}) - with pytest.deprecated_call(match="was deprecated in v1.6 and will be removed in v1.8"): - trainer.on_train_batch_start(batch={}, batch_idx=0) - with pytest.deprecated_call(match="was deprecated in v1.6 and will be removed in v1.8"): - trainer.on_train_batch_end(outputs=torch.tensor([[1.0, -1.0], [1.0, -1.0]]), batch={}, batch_idx=0) - with pytest.deprecated_call(match="was deprecated in v1.6 and will be removed in v1.8"): - trainer.on_predict_epoch_end(outputs=torch.tensor([[1.0, -1.0], [1.0, -1.0]])) - with pytest.deprecated_call(match="was deprecated in v1.6 and will be removed in v1.8"): - trainer.on_exception(exception=Exception) - with pytest.deprecated_call(match="was deprecated in v1.6 and will be removed in v1.8"): - trainer.on_before_backward(loss=torch.tensor([[1.0, -1.0], [1.0, -1.0]])) - with pytest.deprecated_call(match="was deprecated in v1.6 and will be removed in v1.8"): - trainer.on_before_optimizer_step( - optimizer=optim.SGD(model.parameters(), lr=0.01, momentum=0.9), optimizer_idx=0 - ) - with pytest.deprecated_call(match="was deprecated in v1.6 and will be removed in v1.8"): - trainer.on_before_zero_grad(optimizer=optim.SGD(model.parameters(), lr=0.01, momentum=0.9)) - - def test_v1_8_0_deprecate_trainer_data_loading_mixin(): trainer = Trainer(max_epochs=1) model = BoringModel() From 
cea9a72d9da1fd0b6af384afe413390a72f08d01 Mon Sep 17 00:00:00 2001 From: Krishna Kalyan Date: Sun, 28 Aug 2022 19:06:09 +0100 Subject: [PATCH 002/193] Removed the deprecated the `trainer.lr_schedulers` (#14408) Co-authored-by: Rohit Gupta --- src/pytorch_lightning/CHANGELOG.md | 3 +++ src/pytorch_lightning/trainer/trainer.py | 11 ----------- tests/tests_pytorch/deprecated_api/test_remove_1-8.py | 6 ------ 3 files changed, 3 insertions(+), 17 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 845c7fd88bc61..1fad936dced81 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -95,6 +95,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Removed the experimental `pytorch_lightning.utiltiies.meta` functions in favor of built-in https://github.com/pytorch/torchdistx support ([#13868](https://github.com/Lightning-AI/lightning/pull/13868)) +- Removed the deprecated the `trainer.lr_schedulers` ([#14408](https://github.com/Lightning-AI/lightning/pull/14408)) + + - Removed the deprecated `LightningModule.{on_hpc_load,on_hpc_save}` hooks in favor of the general purpose hooks `LightningModule.{on_load_checkpoint,on_save_checkpoint}` ([#14315](https://github.com/Lightning-AI/lightning/pull/14315)) diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index 13817deaf64d1..846682a7faa9d 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -2138,17 +2138,6 @@ def lightning_optimizers(self) -> Dict[int, LightningOptimizer]: def lr_scheduler_configs(self) -> List[LRSchedulerConfig]: return self.strategy.lr_scheduler_configs - @property - def lr_schedulers(self) -> List[Dict[str, Any]]: - rank_zero_deprecation( - "`Trainer.lr_schedulers` is deprecated in v1.6 and will be removed in v1.8." 
- " You can use `trainer.lr_scheduler_configs` instead which contains dataclasses instead of dictionaries.", - stacklevel=5, - ) - from dataclasses import asdict - - return [asdict(config) for config in self.strategy.lr_scheduler_configs] - @property def optimizer_frequencies(self) -> List[int]: return self.strategy.optimizer_frequencies diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py index d3a014177dabc..ab6f9d86b2b36 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py @@ -117,12 +117,6 @@ def test_v1_8_0_deprecated_trainer_should_rank_save_checkpoint(tmpdir): _ = trainer.should_rank_save_checkpoint -def test_v1_8_0_deprecated_lr_scheduler(): - trainer = Trainer() - with pytest.deprecated_call(match=r"`Trainer.lr_schedulers` is deprecated in v1.6 and will be removed in v1.8."): - assert trainer.lr_schedulers == [] - - def test_v1_8_0_trainer_optimizers_mixin(): trainer = Trainer() model = BoringModel() From 5cbe1f48d26ca3c2f8b1915be4d374b9929d159b Mon Sep 17 00:00:00 2001 From: Krishna Kalyan Date: Sun, 28 Aug 2022 19:07:00 +0100 Subject: [PATCH 003/193] Removed the deprecated `Trainer.data_parallel_device_ids` function in favour of `Trainer.device_ids` (#14422) Co-authored-by: Rohit Gupta --- src/pytorch_lightning/CHANGELOG.md | 3 +++ src/pytorch_lightning/trainer/trainer.py | 8 ------ .../deprecated_api/test_remove_1-8.py | 26 ------------------- 3 files changed, 3 insertions(+), 34 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 1fad936dced81..d1b1224eafb2a 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -104,6 +104,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Removed deprecated support for old torchtext versions ([#14375](https://github.com/Lightning-AI/lightning/pull/14375)) +- Removed the deprecated `Trainer.data_parallel_device_ids` hook in favour of `Trainer.device_ids` ([#14422](https://github.com/Lightning-AI/lightning/pull/14422)) + + - Removed the deprecated class `TrainerCallbackHookMixin` ([#14401](https://github.com/Lightning-AI/lightning/14401)) diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index 846682a7faa9d..e43dba9201356 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -2106,14 +2106,6 @@ def devices(self) -> int: ) return self.num_devices - @property - def data_parallel_device_ids(self) -> Optional[List[int]]: - rank_zero_deprecation( - "`Trainer.data_parallel_device_ids` was deprecated in v1.6 and will be removed in v1.8." - " Please use `Trainer.device_ids` instead." 
- ) - return self.device_ids if isinstance(self.accelerator, CUDAAccelerator) else None - @property def lightning_module(self) -> "pl.LightningModule": # TODO: this is actually an optional return diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py index ab6f9d86b2b36..12a2d9ec22b2a 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py @@ -780,32 +780,6 @@ def test_trainer_num_processes(monkeypatch, trainer_kwargs, expected_num_process trainer.num_processes == expected_num_processes -@pytest.mark.parametrize( - ["trainer_kwargs", "expected_data_parallel_device_ids"], - [ - ({}, None), - ({"devices": 1}, None), - ({"devices": "1"}, None), - ({"accelerator": "gpu", "devices": 1}, [0]), - ({"accelerator": "gpu", "devices": 2}, [0, 1]), - ({"accelerator": "gpu", "devices": [1]}, [1]), - ({"accelerator": "gpu", "devices": "0,"}, [0]), - ], -) -def test_trainer_data_parallel_device_ids(monkeypatch, trainer_kwargs, expected_data_parallel_device_ids): - """Test multi type argument with bool.""" - if trainer_kwargs.get("accelerator") == "gpu": - monkeypatch.setattr(device_parser, "is_cuda_available", lambda: True) - monkeypatch.setattr(device_parser, "num_cuda_devices", lambda: 2) - - trainer = Trainer(**trainer_kwargs) - with pytest.deprecated_call( - match="`Trainer.data_parallel_device_ids` was deprecated in v1.6 and will be removed in v1.8." - " Please use `Trainer.device_ids` instead." - ): - assert trainer.data_parallel_device_ids == expected_data_parallel_device_ids - - def test_deprecated_mc_save_checkpoint(): mc = ModelCheckpoint() trainer = Trainer() From 1a3fe395715b2b9eb3c80647b5e0eb490047be8d Mon Sep 17 00:00:00 2001 From: Krishna Kalyan Date: Sun, 28 Aug 2022 22:59:24 +0100 Subject: [PATCH 004/193] Removed deprecated `Trainer.num_processes` property in favour of `Trainer.num_devices` (#14423) Co-authored-by: awaelchli Co-authored-by: Rohit Gupta --- src/pytorch_lightning/CHANGELOG.md | 3 +++ src/pytorch_lightning/trainer/trainer.py | 8 ------- .../deprecated_api/test_remove_1-8.py | 22 ------------------- 3 files changed, 3 insertions(+), 30 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index d1b1224eafb2a..b976464c6e5c6 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -104,6 +104,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Removed deprecated support for old torchtext versions ([#14375](https://github.com/Lightning-AI/lightning/pull/14375)) +- Removed deprecated `Trainer.num_processes` attribute in favour of `Trainer.num_devices` ([#14423](https://github.com/Lightning-AI/lightning/pull/14423)) + + - Removed the deprecated `Trainer.data_parallel_device_ids` hook in favour of `Trainer.device_ids` ([#14422](https://github.com/Lightning-AI/lightning/pull/14422)) diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index e43dba9201356..b1bc2576d2fa8 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -2058,14 +2058,6 @@ def num_devices(self) -> int: """Number of devices the trainer uses per node.""" return len(self.device_ids) - @property - def num_processes(self) -> int: - rank_zero_deprecation( - "`Trainer.num_processes` is deprecated in v1.6 and will be removed in v1.8. " - "Please use `Trainer.num_devices` instead." 
- ) - return self.num_devices - @property def root_gpu(self) -> Optional[int]: rank_zero_deprecation( diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py index 12a2d9ec22b2a..38178c5c8e9e9 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py @@ -758,28 +758,6 @@ def test_v1_8_0_deprecated_lightning_ipu_module(): _ = LightningIPUModule(BoringModel(), 32) -@pytest.mark.parametrize( - ["trainer_kwargs", "expected_num_processes"], - [ - ({}, 1), - ({"devices": 1}, 1), - ({"devices": 4}, 4), - ({"accelerator": "cpu", "devices": 1}, 0), - ({"accelerator": "gpu", "devices": 4}, 4), - ], -) -def test_trainer_num_processes(monkeypatch, trainer_kwargs, expected_num_processes): - if trainer_kwargs.get("accelerator") == "gpu": - monkeypatch.setattr(device_parser, "is_cuda_available", lambda: True) - monkeypatch.setattr(device_parser, "num_cuda_devices", lambda: 16) - trainer = Trainer(**trainer_kwargs) - with pytest.deprecated_call( - match="`Trainer.num_processes` is deprecated in v1.6 and will be removed in v1.8. " - "Please use `Trainer.num_devices` instead." - ): - trainer.num_processes == expected_num_processes - - def test_deprecated_mc_save_checkpoint(): mc = ModelCheckpoint() trainer = Trainer() From 2b7cd58e06702e7fc96e65001a3853ed2aabc1bf Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 29 Aug 2022 17:08:37 +0900 Subject: [PATCH 005/193] Bump tj-actions/changed-files from 28 to 29.0.1 (#14430) Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 28 to 29.0.1. - [Release notes](https://github.com/tj-actions/changed-files/releases) - [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md) - [Commits](https://github.com/tj-actions/changed-files/compare/v28...v29.0.1) --- updated-dependencies: - dependency-name: tj-actions/changed-files dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/ci-pr-gatekeeper.yml | 2 +- .github/workflows/ci-pytorch-test-conda.yml | 2 +- .github/workflows/ci-pytorch-test-full.yml | 2 +- .github/workflows/ci-pytorch-test-slow.yml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci-pr-gatekeeper.yml b/.github/workflows/ci-pr-gatekeeper.yml index 93f55689165e5..8714bec926c23 100644 --- a/.github/workflows/ci-pr-gatekeeper.yml +++ b/.github/workflows/ci-pr-gatekeeper.yml @@ -20,7 +20,7 @@ jobs: fetch-depth: "2" # To retrieve the preceding commit. 
- name: Get changed files using defaults id: changed-files - uses: tj-actions/changed-files@v28 + uses: tj-actions/changed-files@v29.0.1 - name: Determine changes id: touched run: | diff --git a/.github/workflows/ci-pytorch-test-conda.yml b/.github/workflows/ci-pytorch-test-conda.yml index 2a63189caa019..c25c12df67b66 100644 --- a/.github/workflows/ci-pytorch-test-conda.yml +++ b/.github/workflows/ci-pytorch-test-conda.yml @@ -37,7 +37,7 @@ jobs: - name: Get changed files id: changed-files - uses: tj-actions/changed-files@v28 + uses: tj-actions/changed-files@v29.0.1 - name: Decide if the test should be skipped id: skip diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 6f246e62e35fb..770dd4727f056 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -39,7 +39,7 @@ jobs: - name: Get changed files id: changed-files - uses: tj-actions/changed-files@v28 + uses: tj-actions/changed-files@v29.0.1 - name: Decide if the test should be skipped id: skip diff --git a/.github/workflows/ci-pytorch-test-slow.yml b/.github/workflows/ci-pytorch-test-slow.yml index 194a0bd910010..9096d2765043a 100644 --- a/.github/workflows/ci-pytorch-test-slow.yml +++ b/.github/workflows/ci-pytorch-test-slow.yml @@ -30,7 +30,7 @@ jobs: - name: Get changed files id: changed-files - uses: tj-actions/changed-files@v28 + uses: tj-actions/changed-files@v29.0.1 - name: Decide if the test should be skipped id: skip From f202e84f4ba70c615bdbc70ca2bc19bface95f18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Mon, 29 Aug 2022 14:53:57 +0200 Subject: [PATCH 006/193] Remove the legacy `get_deprecated_arg_names` (#14415) --- src/pytorch_lightning/CHANGELOG.md | 7 +++++-- src/pytorch_lightning/core/datamodule.py | 9 --------- src/pytorch_lightning/trainer/trainer.py | 9 --------- src/pytorch_lightning/utilities/argparse.py | 2 -- tests/tests_pytorch/utilities/test_argparse.py | 6 +----- 5 files changed, 6 insertions(+), 27 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index b976464c6e5c6..bad937458c36f 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -86,10 +86,13 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Removed the deprecated `DistributedType` and `DeviceType` enum classes ([#14045](https://github.com/Lightning-AI/lightning/pull/14045)) -- Remove the deprecated `on_train_batch_end(outputs)` format when multiple optimizers are used and TBPTT is enabled ([#14373](https://github.com/PyTorchLightning/pytorch-lightning/pull/14373)) +- Removed the legacy and unused `Trainer.get_deprecated_arg_names()` ([#14415](https://github.com/Lightning-AI/lightning/pull/14415)) -- Remove the deprecated `training_epoch_end(outputs)` format when multiple optimizers are used and TBPTT is enabled ([#14373](https://github.com/PyTorchLightning/pytorch-lightning/pull/14373)) +- Removed the deprecated `on_train_batch_end(outputs)` format when multiple optimizers are used and TBPTT is enabled ([#14373](https://github.com/PyTorchLightning/pytorch-lightning/pull/14373)) + + +- Removed the deprecated `training_epoch_end(outputs)` format when multiple optimizers are used and TBPTT is enabled ([#14373](https://github.com/PyTorchLightning/pytorch-lightning/pull/14373)) - Removed the experimental `pytorch_lightning.utiltiies.meta` functions in favor of built-in https://github.com/pytorch/torchdistx support ([#13868](https://github.com/Lightning-AI/lightning/pull/13868)) diff --git a/src/pytorch_lightning/core/datamodule.py b/src/pytorch_lightning/core/datamodule.py index b7a25badf420f..e4adf9b1ca928 100644 --- a/src/pytorch_lightning/core/datamodule.py +++ b/src/pytorch_lightning/core/datamodule.py @@ -113,15 +113,6 @@ def get_init_arguments_and_types(cls) -> List[Tuple[str, Tuple, Any]]: """ return get_init_arguments_and_types(cls) - @classmethod - def get_deprecated_arg_names(cls) -> List: - """Returns a list with deprecated DataModule arguments.""" - depr_arg_names: List[str] = [] - for name, val in cls.__dict__.items(): - if name.startswith("DEPRECATED") and isinstance(val, (tuple, list)): - depr_arg_names.extend(val) - return depr_arg_names - @classmethod def from_datasets( cls, diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index b1bc2576d2fa8..08fade4021a8b 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -2404,15 +2404,6 @@ def default_attributes(cls) -> dict: init_signature = inspect.signature(cls) return {k: v.default for k, v in init_signature.parameters.items()} - @classmethod - def get_deprecated_arg_names(cls) -> List: - """Returns a list with deprecated Trainer arguments.""" - depr_arg_names = [] - for name, val in cls.__dict__.items(): - if name.startswith("DEPRECATED") and isinstance(val, (tuple, list)): - depr_arg_names.extend(val) - return depr_arg_names - @classmethod def from_argparse_args(cls: Any, args: Union[Namespace, ArgumentParser], **kwargs) -> Any: return from_argparse_args(cls, args, **kwargs) diff --git a/src/pytorch_lightning/utilities/argparse.py b/src/pytorch_lightning/utilities/argparse.py index 26277db183410..8b1872ee7b643 100644 --- a/src/pytorch_lightning/utilities/argparse.py +++ b/src/pytorch_lightning/utilities/argparse.py @@ -212,8 +212,6 @@ def add_argparse_args( parser = ArgumentParser(parents=[parent_parser], add_help=False) ignore_arg_names = ["self", "args", "kwargs"] - if hasattr(cls, "get_deprecated_arg_names"): - ignore_arg_names += cls.get_deprecated_arg_names() allowed_types = (str, int, float, bool) diff --git a/tests/tests_pytorch/utilities/test_argparse.py b/tests/tests_pytorch/utilities/test_argparse.py index 2a88e8db531f9..ba5d51c2a2095 100644 --- 
a/tests/tests_pytorch/utilities/test_argparse.py +++ b/tests/tests_pytorch/utilities/test_argparse.py @@ -1,6 +1,6 @@ import io from argparse import ArgumentParser, Namespace -from typing import Generic, List, TypeVar +from typing import Generic, TypeVar from unittest.mock import MagicMock import pytest @@ -118,10 +118,6 @@ class AddArgparseArgsExampleClass: def __init__(self, my_parameter: int = 0): pass - @staticmethod - def get_deprecated_arg_names() -> List[str]: - return [] - class AddArgparseArgsExampleClassViaInit: def __init__(self, my_parameter: int = 0): From 66ca0933324aa78e1345685b7563bed6486fd5b7 Mon Sep 17 00:00:00 2001 From: Mansy Date: Mon, 29 Aug 2022 15:16:22 +0200 Subject: [PATCH 007/193] Fix cloud e2e, artifacts and cleanup (#14392) --- .azure/app-cloud-e2e.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.azure/app-cloud-e2e.yml b/.azure/app-cloud-e2e.yml index aed2cd07b987b..a8c66f420d7f4 100644 --- a/.azure/app-cloud-e2e.yml +++ b/.azure/app-cloud-e2e.yml @@ -122,11 +122,13 @@ jobs: displayName: 'Run the tests' - publish: '$(Build.ArtifactStagingDirectory)/videos' + condition: failed() displayName: 'Publish videos' artifact: $(name) - bash: | time python -c "from lightning.app import testing; testing.delete_cloud_lightning_apps()" + condition: always() env: # LAI_USER: $(LAI_USER) # LAI_PASS: $(LAI_PASS) From 80b1987ad9ee25249ea0aca62e194acebe1ecd05 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 29 Aug 2022 20:03:37 +0200 Subject: [PATCH 008/193] Update GHA job names (#14400) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * update CJ job names * groups * filter * Apply suggestions from code review Co-authored-by: Akihiro Nitta Co-authored-by: Carlos Mocholí --- .github/checkgroup.yml | 66 +++++++++++---------- .github/workflows/ci-app-examples.yml | 9 ++- .github/workflows/ci-app-tests.yml | 3 +- .github/workflows/ci-pytorch-test-conda.yml | 2 +- .github/workflows/ci-pytorch-test-full.yml | 2 +- .github/workflows/ci-pytorch-test-slow.yml | 2 +- .github/workflows/docs-checks.yml | 4 +- 7 files changed, 51 insertions(+), 37 deletions(-) diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index ace786f8ac40a..531df1ebeaea8 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -28,29 +28,29 @@ subprojects: - ".github/workflows/ci-pytorch*.yml" - ".github/workflows/docs-*.yml" checks: - - "conda (3.8, 1.10)" - - "conda (3.8, 1.9)" - - "conda (3.9, 1.11)" - - "conda (3.9, 1.12)" - - "cpu (macOS-11, 3.10, latest, stable)" - - "cpu (macOS-11, 3.7, latest, stable)" - - "cpu (macOS-11, 3.7, oldest, stable)" - - "cpu (ubuntu-20.04, 3.10, latest, stable)" - - "cpu (ubuntu-20.04, 3.7, latest, stable)" - - "cpu (ubuntu-20.04, 3.7, oldest, stable)" - - "cpu (windows-2022, 3.10, latest, stable)" - - "cpu (windows-2022, 3.7, latest, stable)" - - "cpu (windows-2022, 3.7, oldest, stable)" - - "doctest (pytorch)" - - "make-docs (pytorch)" + - "pl-conda (3.8, 1.10)" + - "pl-conda (3.8, 1.9)" + - "pl-conda (3.9, 1.11)" + - "pl-conda (3.9, 1.12)" + - "pl-cpu (macOS-11, 3.10, latest, stable)" + - "pl-cpu (macOS-11, 3.7, latest, stable)" + - "pl-cpu (macOS-11, 3.7, oldest, stable)" + - "pl-cpu (ubuntu-20.04, 3.10, latest, stable)" + - "pl-cpu (ubuntu-20.04, 3.7, latest, stable)" + - "pl-cpu (ubuntu-20.04, 3.7, oldest, stable)" + - "pl-cpu (windows-2022, 3.10, latest, stable)" + - "pl-cpu (windows-2022, 3.7, latest, stable)" + - "pl-cpu (windows-2022, 3.7, oldest, stable)" + - "make-doctest (pytorch)" + - 
"make-html (pytorch)" - "mypy" - "PR Gatekeeper (pytorch)" - "pytorch-lightning (GPUs)" - "pytorch-lightning (HPUs)" - "pytorch-lightning (IPUs)" - - "slow (macOS-11, 3.7, 1.11)" - - "slow (ubuntu-20.04, 3.7, 1.11)" - - "slow (windows-2022, 3.7, 1.11)" + - "pl-slow (macOS-11, 3.7, 1.11)" + - "pl-slow (ubuntu-20.04, 3.7, 1.11)" + - "pl-slow (windows-2022, 3.7, 1.11)" - "test-on-tpus" - id: "pytorch_lightning: Azure GPU" @@ -79,8 +79,8 @@ subprojects: - "requirements/docs.txt" - "requirements/pytorch/**" checks: - - "doctest (pytorch)" - - "make-docs (pytorch)" + - "make-doctest (pytorch)" + - "make-html (pytorch)" - id: "pytorch_lightning: Docker" paths: @@ -127,14 +127,20 @@ subprojects: - "examples/app_*" checks: - "App.cloud-e2e" - - "doctest (app)" - - "make-docs (app)" - - "pytest (macOS-11, 3.8, latest)" - - "pytest (macOS-11, 3.8, oldest)" - - "pytest (ubuntu-20.04, 3.8, latest)" - - "pytest (ubuntu-20.04, 3.8, oldest)" - - "pytest (windows-2022, 3.8, latest)" - - "pytest (windows-2022, 3.8, oldest)" + - "make-doctest (app)" + - "make-html (app)" + - "app-examples (macOS-11, 3.8, latest)" + - "app-examples (macOS-11, 3.8, oldest)" + - "app-examples (ubuntu-20.04, 3.8, latest)" + - "app-examples (ubuntu-20.04, 3.8, oldest)" + - "app-examples (windows-2022, 3.8, latest)" + - "app-examples (windows-2022, 3.8, oldest)" + - "app-pytest (macOS-11, 3.8, latest)" + - "app-pytest (macOS-11, 3.8, oldest)" + - "app-pytest (ubuntu-20.04, 3.8, latest)" + - "app-pytest (ubuntu-20.04, 3.8, oldest)" + - "app-pytest (windows-2022, 3.8, latest)" + - "app-pytest (windows-2022, 3.8, oldest)" - id: "lightning_app: Azure" paths: @@ -149,8 +155,8 @@ subprojects: - "requirements/docs.txt" - "requirements/app/**" checks: - - "doctest (app)" - - "make-docs (app)" + - "make-doctest (app)" + - "make-html (app)" - id: "install" paths: diff --git a/.github/workflows/ci-app-examples.yml b/.github/workflows/ci-app-examples.yml index 8af5a2fc5a39e..ecd2a746412f1 100644 --- a/.github/workflows/ci-app-examples.yml +++ b/.github/workflows/ci-app-examples.yml @@ -6,13 +6,20 @@ on: # Trigger the workflow on push or pull request, but only for the master bran branches: [master, "release/*"] pull_request: branches: [master, "release/*"] + paths: + - ".github/workflows/ci-app-examples.yml" + - "src/lightning_app/**" + - "tests/tests_app_examples/**" + - "examples/app_*" + - "requirements/app/**" + - "setup.py" concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} cancel-in-progress: ${{ github.ref != 'refs/heads/master' }} jobs: - pytest: + app-examples: runs-on: ${{ matrix.os }} strategy: fail-fast: false diff --git a/.github/workflows/ci-app-tests.yml b/.github/workflows/ci-app-tests.yml index 8d9c538eb665b..91299fdc4e16b 100644 --- a/.github/workflows/ci-app-tests.yml +++ b/.github/workflows/ci-app-tests.yml @@ -6,6 +6,7 @@ on: # Trigger the workflow on push or pull request, but only for the master bran branches: [master, "release/*"] pull_request: paths: + - ".github/workflows/ci-app-tests.yml" - "src/lightning_app/**" - "tests/tests_app/**" - "requirements/app/**" @@ -16,7 +17,7 @@ concurrency: cancel-in-progress: ${{ github.ref != 'refs/heads/master' }} jobs: - pytest: + app-pytest: runs-on: ${{ matrix.os }} strategy: fail-fast: false diff --git a/.github/workflows/ci-pytorch-test-conda.yml b/.github/workflows/ci-pytorch-test-conda.yml index c25c12df67b66..82c463a54169f 100644 --- a/.github/workflows/ci-pytorch-test-conda.yml +++ b/.github/workflows/ci-pytorch-test-conda.yml @@ -16,7 +16,7 
@@ defaults: shell: bash -l {0} jobs: - conda: + pl-conda: runs-on: ubuntu-20.04 container: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python-version }}-torch${{ matrix.pytorch-version }} strategy: diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 770dd4727f056..987373b6ea2bf 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -14,7 +14,7 @@ concurrency: jobs: - cpu: + pl-cpu: runs-on: ${{ matrix.os }} if: github.event.pull_request.draft == false strategy: diff --git a/.github/workflows/ci-pytorch-test-slow.yml b/.github/workflows/ci-pytorch-test-slow.yml index 9096d2765043a..126eaaf17da1a 100644 --- a/.github/workflows/ci-pytorch-test-slow.yml +++ b/.github/workflows/ci-pytorch-test-slow.yml @@ -13,7 +13,7 @@ concurrency: cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} jobs: - slow: + pl-slow: runs-on: ${{ matrix.os }} if: github.event.pull_request.draft == false strategy: diff --git a/.github/workflows/docs-checks.yml b/.github/workflows/docs-checks.yml index 5b5a9aec778be..a91f216af963f 100644 --- a/.github/workflows/docs-checks.yml +++ b/.github/workflows/docs-checks.yml @@ -12,7 +12,7 @@ concurrency: cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} jobs: - doctest: + make-doctest: runs-on: ubuntu-20.04 strategy: fail-fast: false @@ -63,7 +63,7 @@ jobs: make doctest make coverage - make-docs: + make-html: runs-on: ubuntu-20.04 strategy: fail-fast: false From d0d1818d5000432c28789e414ed3a16fdd64aeb1 Mon Sep 17 00:00:00 2001 From: ananthsub Date: Mon, 29 Aug 2022 13:23:34 -0700 Subject: [PATCH 009/193] Update `has_len_all_ranks` to use `Strategy.root_device` (#12144) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Adrian Wälchli --- src/pytorch_lightning/utilities/data.py | 4 ++-- .../properties/test_estimated_stepping_batches.py | 10 ++++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/pytorch_lightning/utilities/data.py b/src/pytorch_lightning/utilities/data.py index c795cfc47d3a5..adb425127a81e 100644 --- a/src/pytorch_lightning/utilities/data.py +++ b/src/pytorch_lightning/utilities/data.py @@ -139,14 +139,14 @@ def has_len(dataloader: Union[DataLoader, Iterable]) -> bool: def has_len_all_ranks( dataloader: DataLoader, - training_type: "pl.Strategy", + strategy: "pl.Strategy", model: Union["pl.LightningModule", "pl.LightningDataModule"], ) -> bool: """Checks if a given Dataloader has ``__len__`` method implemented i.e. 
if it is a finite dataloader or infinite dataloader.""" try: local_length = len(dataloader) - total_length = training_type.reduce(torch.tensor(local_length).to(model.device), reduce_op="sum") + total_length = strategy.reduce(torch.tensor(local_length, device=strategy.root_device), reduce_op="sum") if total_length == 0: rank_zero_warn( diff --git a/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py b/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py index 846a39a748a60..177d2034a0273 100644 --- a/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py +++ b/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py @@ -125,9 +125,15 @@ def test_num_stepping_batches_accumulate_gradients(accumulate_grad_batches, expe ) def test_num_stepping_batches_gpu(trainer_kwargs, estimated_steps, monkeypatch): """Test stepping batches with GPU strategies.""" + num_devices_per_node = 7 monkeypatch.setattr(device_parser, "is_cuda_available", lambda: True) - monkeypatch.setattr(device_parser, "num_cuda_devices", lambda: 7) - trainer = Trainer(max_epochs=1, devices=7, accelerator="gpu", **trainer_kwargs) + monkeypatch.setattr(device_parser, "num_cuda_devices", lambda: num_devices_per_node) + trainer = Trainer(max_epochs=1, devices=num_devices_per_node, accelerator="gpu", **trainer_kwargs) + + # set the `parallel_devices` to cpu to run the test on CPU and take `num_nodes`` into consideration + # because we can't run on multi-node in ci + trainer.strategy.parallel_devices = [torch.device("cpu", index=i) for i in range(num_devices_per_node)] + model = BoringModel() trainer._data_connector.attach_data(model) trainer.strategy.connect(model) From 2374465b013a1936d720967353cf61536ee68d29 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 29 Aug 2022 22:48:40 +0200 Subject: [PATCH 010/193] Azure: local id for e2e (#14432) --- .azure/app-cloud-e2e.yml | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/.azure/app-cloud-e2e.yml b/.azure/app-cloud-e2e.yml index a8c66f420d7f4..a057b84079020 100644 --- a/.azure/app-cloud-e2e.yml +++ b/.azure/app-cloud-e2e.yml @@ -17,10 +17,12 @@ pr: - "master" - "release/*" +# variables are automatically exported as environment variables so this will override pip's default cache dir variables: - # variables are automatically exported as environment variables so this will override pip's default cache dir - name: pip_cache_dir value: $(Pipeline.Workspace)/.pip + - name: local_id + value: $(Build.BuildId) jobs: - job: App_cloud_e2e_testing @@ -57,17 +59,18 @@ jobs: workspace: clean: all steps: + + - script: echo '##vso[task.setvariable variable=local_id]$(System.PullRequest.PullRequestNumber)' + displayName: "Set id for this PR" + condition: eq(variables['Build.Reason'], 'PullRequest') + - bash: | whoami + printf "local id: $(local_id)\n" python --version pip --version displayName: 'Info' - # TODO: parse the PR number - - bash: | - ID=$(date +%s) - echo "##vso[task.setvariable variable=local_id]$ID" - - task: Cache@2 inputs: key: 'pip | "$(name)" | requirements/app/base.txt' From 18e2a8eecd3667364f3cad75bcdcc9306fd64bcd Mon Sep 17 00:00:00 2001 From: Marc Skov Madsen Date: Tue, 30 Aug 2022 02:33:21 +0200 Subject: [PATCH 011/193] PanelFrontend and Panel Web UI Intermediate docs (#13531) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: thomas chaton Co-authored-by: Adrian Wälchli Co-authored-by: Marc Skov Madsen 
Co-authored-by: Laverne Henderson Co-authored-by: Felonious-Spellfire Co-authored-by: Jirka Borovec Co-authored-by: Mansy Co-authored-by: Jirka --- .../api/lightning_app.core.LightningApp.rst | 13 + .../api/lightning_app.core.LightningFlow.rst | 13 + .../api/lightning_app.core.LightningWork.rst | 13 + docs/source-app/api_reference/frontend.rst | 1 + .../workflows/add_web_ui/index_content.rst | 8 + .../workflows/add_web_ui/panel/basic.rst | 358 ++++++++++++++++++ .../workflows/add_web_ui/panel/index.rst | 85 +++++ .../add_web_ui/panel/intermediate.rst | 210 ++++++++++ .../intermediate.rst | 5 +- requirements/app/ui.txt | 1 + src/lightning_app/CHANGELOG.md | 12 +- src/lightning_app/frontend/__init__.py | 3 +- src/lightning_app/frontend/panel/__init__.py | 6 + .../frontend/panel/app_state_comm.py | 86 +++++ .../frontend/panel/app_state_watcher.py | 106 ++++++ .../frontend/panel/panel_frontend.py | 171 +++++++++ .../frontend/panel/panel_serve_render_fn.py | 52 +++ src/lightning_app/frontend/streamlit_base.py | 17 +- src/lightning_app/frontend/utils.py | 57 +++ src/lightning_app/utilities/imports.py | 5 + src/lightning_app/utilities/state.py | 2 +- tests/tests_app/frontend/conftest.py | 73 ++++ tests/tests_app/frontend/panel/__init__.py | 0 tests/tests_app/frontend/panel/app_panel.py | 5 + .../frontend/panel/test_app_state_comm.py | 39 ++ .../frontend/panel/test_app_state_watcher.py | 85 +++++ .../frontend/panel/test_panel_frontend.py | 164 ++++++++ .../panel/test_panel_serve_render_fn.py | 79 ++++ tests/tests_app/frontend/test_utils.py | 42 ++ .../tests_app/frontend/utilities/__init__.py | 0 tests/tests_app/utilities/test_cloud.py | 2 + 31 files changed, 1695 insertions(+), 18 deletions(-) create mode 100644 docs/source-app/api_reference/api/lightning_app.core.LightningApp.rst create mode 100644 docs/source-app/api_reference/api/lightning_app.core.LightningFlow.rst create mode 100644 docs/source-app/api_reference/api/lightning_app.core.LightningWork.rst create mode 100644 docs/source-app/workflows/add_web_ui/panel/basic.rst create mode 100644 docs/source-app/workflows/add_web_ui/panel/index.rst create mode 100644 docs/source-app/workflows/add_web_ui/panel/intermediate.rst create mode 100644 src/lightning_app/frontend/panel/__init__.py create mode 100644 src/lightning_app/frontend/panel/app_state_comm.py create mode 100644 src/lightning_app/frontend/panel/app_state_watcher.py create mode 100644 src/lightning_app/frontend/panel/panel_frontend.py create mode 100644 src/lightning_app/frontend/panel/panel_serve_render_fn.py create mode 100644 src/lightning_app/frontend/utils.py create mode 100644 tests/tests_app/frontend/conftest.py create mode 100644 tests/tests_app/frontend/panel/__init__.py create mode 100644 tests/tests_app/frontend/panel/app_panel.py create mode 100644 tests/tests_app/frontend/panel/test_app_state_comm.py create mode 100644 tests/tests_app/frontend/panel/test_app_state_watcher.py create mode 100644 tests/tests_app/frontend/panel/test_panel_frontend.py create mode 100644 tests/tests_app/frontend/panel/test_panel_serve_render_fn.py create mode 100644 tests/tests_app/frontend/test_utils.py create mode 100644 tests/tests_app/frontend/utilities/__init__.py diff --git a/docs/source-app/api_reference/api/lightning_app.core.LightningApp.rst b/docs/source-app/api_reference/api/lightning_app.core.LightningApp.rst new file mode 100644 index 0000000000000..269a4e8f1143a --- /dev/null +++ b/docs/source-app/api_reference/api/lightning_app.core.LightningApp.rst @@ -0,0 +1,13 @@ +:orphan: + 
+.. role:: hidden + :class: hidden-section +.. currentmodule:: lightning_app.core + + +LightningApp +============ + +.. autoclass:: LightningApp + :members: + :noindex: diff --git a/docs/source-app/api_reference/api/lightning_app.core.LightningFlow.rst b/docs/source-app/api_reference/api/lightning_app.core.LightningFlow.rst new file mode 100644 index 0000000000000..336efd4d7165b --- /dev/null +++ b/docs/source-app/api_reference/api/lightning_app.core.LightningFlow.rst @@ -0,0 +1,13 @@ +:orphan: + +.. role:: hidden + :class: hidden-section +.. currentmodule:: lightning_app.core + + +LightningFlow +============= + +.. autoclass:: LightningFlow + :members: + :noindex: diff --git a/docs/source-app/api_reference/api/lightning_app.core.LightningWork.rst b/docs/source-app/api_reference/api/lightning_app.core.LightningWork.rst new file mode 100644 index 0000000000000..db80b79e41d8b --- /dev/null +++ b/docs/source-app/api_reference/api/lightning_app.core.LightningWork.rst @@ -0,0 +1,13 @@ +:orphan: + +.. role:: hidden + :class: hidden-section +.. currentmodule:: lightning_app.core + + +LightningWork +============= + +.. autoclass:: LightningWork + :members: + :noindex: diff --git a/docs/source-app/api_reference/frontend.rst b/docs/source-app/api_reference/frontend.rst index 4a4ba082a2905..f5e57516bcbe0 100644 --- a/docs/source-app/api_reference/frontend.rst +++ b/docs/source-app/api_reference/frontend.rst @@ -20,3 +20,4 @@ ___________________ ~frontend.Frontend ~web.StaticWebFrontend ~stream_lit.StreamlitFrontend + ~panel.PanelFrontend diff --git a/docs/source-app/workflows/add_web_ui/index_content.rst b/docs/source-app/workflows/add_web_ui/index_content.rst index 9602537a53574..ceef98b4ea0dc 100644 --- a/docs/source-app/workflows/add_web_ui/index_content.rst +++ b/docs/source-app/workflows/add_web_ui/index_content.rst @@ -25,6 +25,14 @@ Web UIs for non Javascript Developers :height: 150 :tag: basic +.. displayitem:: + :header: Panel + :description: Learn how to add a web UI built in Python with Panel. + :col_css: col-md-4 + :button_link: panel/index.html + :height: 150 + :tag: basic + .. displayitem:: :header: Jupyter Notebook :description: Learn how to enable a web UI that is a Jupyter Notebook. diff --git a/docs/source-app/workflows/add_web_ui/panel/basic.rst b/docs/source-app/workflows/add_web_ui/panel/basic.rst new file mode 100644 index 0000000000000..695e6cdee2310 --- /dev/null +++ b/docs/source-app/workflows/add_web_ui/panel/basic.rst @@ -0,0 +1,358 @@ +:orphan: + +############################### +Add a web UI with Panel (basic) +############################### + +**Audience:** Users who want to add a web UI written with Python and Panel. + +**Prereqs:** Basic Python knowledge. + +---- + +************** +What is Panel? +************** + +`Panel`_ and the `HoloViz`_ ecosystem provide unique and powerful +features such as big data visualization using `DataShader`_, easy cross filtering +using `HoloViews`_, streaming and much more. + +* Panel is highly flexible and ties into the PyData and Jupyter ecosystems as you can develop in notebooks and use ipywidgets. You can also develop in .py files. + +* Panel is one of the most popular data app frameworks in Python with `more than 400.000 downloads a month `_. It's especially popular in the scientific community. + +* Panel is used, for example, by Rapids to power `CuxFilter`_, a CuDF based big data visualization framework. + +* Panel can be deployed on your favorite server or cloud including `Lightning`_. + +.. 
figure:: https://pl-flash-data.s3.amazonaws.com/assets_lightning/docs/images/frontend/panel/panel-intro.gif + :alt: Example Panel App + + Example Panel App + +Panel is **particularly well suited for Lightning Apps** that need to display live progress. This is because the Panel server can react +to state changes and asynchronously push messages from the server to the client using web socket communication. + +.. figure:: https://pl-flash-data.s3.amazonaws.com/assets_lightning/docs/images/frontend/panel/panel-streaming-intro.gif + :alt: Example Panel Streaming App + + Example Panel Streaming App + +Install Panel with: + +.. code:: bash + + pip install panel + +---- + +********************* +Run a basic Panel App +********************* + +In the next few sections, we'll build an App step-by-step. + +First, create a file named ``app_panel.py`` with the App content: + +.. code:: python + + # app_panel.py + + import panel as pn + + pn.panel("Hello **Panel ⚡** World").servable() + +Then, create a file named ``app.py`` with the following App content: + +.. code:: python + + # app.py + + import lightning as L + from lightning.app.frontend.panel import PanelFrontend + + + class LitPanel(L.LightningFlow): + + def configure_layout(self): + return PanelFrontend("app_panel.py") + + + class LitApp(L.LightningFlow): + def __init__(self): + super().__init__() + self.lit_panel = LitPanel() + + def configure_layout(self): + return {"name": "home", "content": self.lit_panel} + + + app = L.LightningApp(LitApp()) + +Finally, add ``panel`` to your ``requirements.txt`` file: + +.. code:: bash + + echo 'panel' >> requirements.txt + +.. note:: This is a best practice to make Apps reproducible. + +---- + +*********** +Run the App +*********** + +Run the App locally: + +.. code:: bash + + lightning run app app.py + +The App should look like this: + +.. figure:: https://pl-flash-data.s3.amazonaws.com/assets_lightning/docs/images/frontend/panel/panel-lightning-basic.png + :alt: Basic Panel Lightning App + + Basic Panel Lightning App + +Now, run it on the cloud: + +.. code:: bash + + lightning run app app.py --cloud + +---- + +************************* +Step-by-step walk-through +************************* + +In this section, we explain each part of the code in detail. + +---- + +0. Define a Panel app +^^^^^^^^^^^^^^^^^^^^^ + +First, find the Panel app you want to integrate. In this example, that app looks like: + +.. code:: python + + import panel as pn + + pn.panel("Hello **Panel ⚡** World").servable() + +Refer to the `Panel documentation `_ and `awesome-panel.org `_ for more complex examples. + +---- + +1. Add Panel to a Component +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Link this app to the Lightning App by using the ``PanelFrontend`` class which needs to be returned from +the ``configure_layout`` method of the Lightning Component you want to connect to Panel. + +.. code:: python + :emphasize-lines: 7-10 + + import lightning as L + from lightning.app.frontend.panel import PanelFrontend + + + class LitPanel(L.LightningFlow): + + def configure_layout(self): + return PanelFrontend("app_panel.py") + + + class LitApp(L.LightningFlow): + def __init__(self): + super().__init__() + self.lit_panel = LitPanel() + + def configure_layout(self): + return {"name": "home", "content": self.lit_panel} + + + app = L.LightningApp(LitApp()) + +The argument of the ``PanelFrontend`` class, points to the script, notebook, or function that +runs your Panel app. + +---- + +2. 
Route the UI in the root component +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The second step, is to tell the Root component in which tab to render this component's UI. +In this case, we render the ``LitPanel`` UI in the ``home`` tab of the app. + +.. code:: python + :emphasize-lines: 16-17 + + import lightning as L + from lightning.app.frontend.panel import PanelFrontend + + + class LitPanel(L.LightningFlow): + + def configure_layout(self): + return PanelFrontend("app_panel.py") + + + class LitApp(L.LightningFlow): + def __init__(self): + super().__init__() + self.lit_panel = LitPanel() + + def configure_layout(self): + return {"name": "home", "content": self.lit_panel} + +---- + +************* +Tips & Tricks +************* + +0. Use autoreload while developing +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To speed up your development workflow, you can run your Lightning App with Panel **autoreload** by +setting the environment variable ``PANEL_AUTORELOAD`` to ``yes``. + +Try running the following: + +.. code-block:: + + PANEL_AUTORELOAD=yes lightning run app app.py + +.. figure:: https://pl-flash-data.s3.amazonaws.com/assets_lightning/docs/images/frontend/panel/panel-lightning-autoreload.gif + :alt: Basic Panel Lightning App with autoreload + + Basic Panel Lightning App with autoreload + +1. Theme your App +^^^^^^^^^^^^^^^^^ + +To theme your App you, can use the Lightning accent color ``#792EE5`` with the `FastListTemplate`_. + +Try replacing the contents of ``app_panel.py`` with the following: + +.. code:: bash + + # app_panel.py + + import panel as pn + import plotly.express as px + + ACCENT = "#792EE5" + + pn.extension("plotly", sizing_mode="stretch_width", template="fast") + pn.state.template.param.update( + title="⚡ Hello Panel + Lightning ⚡", accent_base_color=ACCENT, header_background=ACCENT + ) + + pn.config.raw_css.append( + """ + .bk-root:first-of-type { + height: calc( 100vh - 200px ) !important; + } + """ + ) + + + def get_panel_theme(): + """Returns 'default' or 'dark'""" + return pn.state.session_args.get("theme", [b"default"])[0].decode() + + + def get_plotly_template(): + if get_panel_theme() == "dark": + return "plotly_dark" + return "plotly_white" + + + def get_plot(length=5): + xseries = [index for index in range(length + 1)] + yseries = [x**2 for x in xseries] + fig = px.line( + x=xseries, + y=yseries, + template=get_plotly_template(), + color_discrete_sequence=[ACCENT], + range_x=(0, 10), + markers=True, + ) + fig.layout.autosize = True + return fig + + + length = pn.widgets.IntSlider(value=5, start=1, end=10, name="Length") + dynamic_plot = pn.panel( + pn.bind(get_plot, length=length), sizing_mode="stretch_both", config={"responsive": True} + ) + pn.Column(length, dynamic_plot).servable() + + +Install some additional libraries and remember to add the dependencies to the ``requirements.txt`` file: + + +.. code:: bash + + echo 'plotly' >> requirements.txt + echo 'pandas' >> requirements.txt + +Finally run the App + +.. code:: bash + + lightning run app app.py + +.. figure:: https://pl-flash-data.s3.amazonaws.com/assets_lightning/docs/images/frontend/panel/panel-lightning-theme.gif + :alt: Basic Panel Plotly Lightning App with theming + + Basic Panel Plotly Lightning App with theming + +.. _Panel: https://panel.holoviz.org/ +.. _FastListTemplate: https://panel.holoviz.org/reference/templates/FastListTemplate.html#templates-gallery-fastlisttemplate +.. _HoloViz: https://holoviz.org/ +.. _DataShader: https://datashader.org/ +.. _HoloViews: https://holoviews.org/ +.. 
_Lightning: https://lightning.ai/ +.. _CuxFilter: https://github.com/rapidsai/cuxfilter +.. _AwesomePanel: https://awesome-panel.org/home + + +---- + +********** +Next Steps +********** + +.. raw:: html + +
+
+ +.. displayitem:: + :header: 2: Enable two-way communication + :description: Enable two-way communication between Panel and a Lightning App. + :col_css: col-md-6 + :button_link: intermediate.html + :height: 150 + :tag: intermediate + +.. displayitem:: + :header: Add a web user interface (UI) + :description: Users who want to add a UI to their Lightning Apps + :col_css: col-md-6 + :button_link: ../index.html + :height: 150 + :tag: intermediate + +.. raw:: html + +
+
diff --git a/docs/source-app/workflows/add_web_ui/panel/index.rst b/docs/source-app/workflows/add_web_ui/panel/index.rst new file mode 100644 index 0000000000000..0d48a1dc9f7ea --- /dev/null +++ b/docs/source-app/workflows/add_web_ui/panel/index.rst @@ -0,0 +1,85 @@ +:orphan: + +.. toctree:: + :maxdepth: 1 + :hidden: + + basic + intermediate + +####################### +Add a web UI with Panel +####################### + +.. raw:: html + +
+
+ +.. Add callout items below this line + +.. displayitem:: + :header: 1: Connect Panel + :description: Learn how to connect Panel to a Lightning Component. + :col_css: col-md-6 + :button_link: basic.html + :height: 150 + :tag: basic + +.. displayitem:: + :header: 2: Enable two-way communication + :description: Enable two-way communication between Panel and a Lightning App. + :col_css: col-md-6 + :button_link: intermediate.html + :height: 150 + :tag: intermediate + +.. raw:: html + +
+
+ +---- + +******** +Examples +******** + +Here are a few example apps that use a Panel web UI. + + +.. raw:: html + +
+
+ +.. Add callout items below this line + +.. displayitem:: + :header: Example 1 + :description: Show off your work! Contribute an example. + :col_css: col-md-4 + :button_link: ../../../contribute_app.html + :height: 150 + :tag: Waiting for contributed example + +.. displayitem:: + :header: Example 2 + :description: Show off your work! Contribute an example. + :col_css: col-md-4 + :button_link: ../../../contribute_app.html + :height: 150 + :tag: Waiting for contributed example + +.. displayitem:: + :header: Example 3 + :description: Show off your work! Contribute an example. + :col_css: col-md-4 + :button_link: ../../../contribute_app.html + :height: 150 + :tag: Waiting for contributed example + +.. raw:: html + +
+
diff --git a/docs/source-app/workflows/add_web_ui/panel/intermediate.rst b/docs/source-app/workflows/add_web_ui/panel/intermediate.rst new file mode 100644 index 0000000000000..171f91d82c3b5 --- /dev/null +++ b/docs/source-app/workflows/add_web_ui/panel/intermediate.rst @@ -0,0 +1,210 @@ +:orphan: + +###################################### +Add a web UI with Panel (intermediate) +###################################### + +**Audience:** Users who want to communicate between the Lightning App and Panel. + +**Prereqs:** Must have read the `Panel basic `_ guide. + +---- + +************************************** +Interact with the Component from Panel +************************************** + +The ``PanelFrontend`` enables user interactions with the Lightning App using widgets. +You can modify the state variables of a Lightning Component using the ``AppStateWatcher``. + +For example, here we increase the ``count`` variable of the Lightning Component every time a user +presses a button: + +.. code:: python + + # app_panel.py + + import panel as pn + from lightning.app.frontend.panel import AppStateWatcher + + pn.extension(sizing_mode="stretch_width") + + app = AppStateWatcher() + + submit_button = pn.widgets.Button(name="submit") + + @pn.depends(submit_button, watch=True) + def submit(_): + app.state.count += 1 + + @pn.depends(app.param.state) + def current_count(_): + return f"current count: {app.state.count}" + + pn.Column( + submit_button, + current_count, + ).servable() + + + +.. code:: python + + # app.py + + import lightning as L + from lightning.app.frontend.panel import PanelFrontend + + class LitPanel(L.LightningFlow): + def __init__(self): + super().__init__() + self.count = 0 + self.last_count = 0 + + def run(self): + if self.count != self.last_count: + self.last_count = self.count + print("Count changed to: ", self.count) + + def configure_layout(self): + return PanelFrontend("app_panel.py") + + + class LitApp(L.LightningFlow): + def __init__(self): + super().__init__() + self.lit_panel = LitPanel() + + def run(self): + self.lit_panel.run() + + def configure_layout(self): + return {"name": "home", "content": self.lit_panel} + + + app = L.LightningApp(LitApp()) + +.. figure:: https://pl-flash-data.s3.amazonaws.com/assets_lightning/docs/images/frontend/panel/panel-lightning-counter-from-frontend.gif + :alt: Panel Lightning App updating a counter from the frontend + + Panel Lightning App updating a counter from the frontend + +---- + +************************************ +Interact with Panel from a Component +************************************ + +To update the `PanelFrontend` from any Lightning Component, update the property in the Component. +Make sure to call the ``run`` method from the parent component. + +In this example, we update the ``count`` value of the Component: + +.. code:: python + + # app_panel.py + + import panel as pn + from lightning.app.frontend.panel import AppStateWatcher + + app = AppStateWatcher() + + pn.extension(sizing_mode="stretch_width") + + def counter(state): + return f"Counter: {state.count}" + + last_update = pn.bind(counter, app.param.state) + + pn.panel(last_update).servable() + +.. 
code:: python + + # app.py + + from datetime import datetime as dt + from lightning.app.frontend.panel import PanelFrontend + + import lightning as L + + + class LitPanel(L.LightningFlow): + def __init__(self): + super().__init__() + self.count = 0 + self._last_update = dt.now() + + def run(self): + now = dt.now() + if (now - self._last_update).microseconds >= 250: + self.count += 1 + self._last_update = now + print("Counter changed to: ", self.count) + + def configure_layout(self): + return PanelFrontend("app_panel.py") + + + class LitApp(L.LightningFlow): + def __init__(self): + super().__init__() + self.lit_panel = LitPanel() + + def run(self): + self.lit_panel.run() + + def configure_layout(self): + tab1 = {"name": "home", "content": self.lit_panel} + return tab1 + + app = L.LightningApp(LitApp()) + +.. figure:: https://pl-flash-data.s3.amazonaws.com/assets_lightning/docs/images/frontend/panel/panel-lightning-counter-from-component.gif + :alt: Panel Lightning App updating a counter from the component + + Panel Lightning App updating a counter from the Component + +---- + +************* +Tips & Tricks +************* + +* Caching: Panel provides the easy to use ``pn.state.cache`` memory based, ``dict`` caching. If you are looking for something persistent try `DiskCache `_ its really powerful and simple to use. You can use it to communicate large amounts of data between the components and frontend(s). + +* Notifications: Panel provides easy to use `notifications `_. You can for example use them to provide notifications about runs starting or ending. + +* Tabulator Table: Panel provides the `Tabulator table `_ which features expandable rows. The table is useful to provide for example an overview of you runs. But you can dig into the details by clicking and expanding the row. + +* Task Scheduling: Panel provides easy to use `task scheduling `_. You can use this to for example read and display files created by your components on a scheduled basis. + +* Terminal: Panel provides the `Xterm.js terminal `_ which can be used to display live logs from your components and allow you to provide a terminal interface to your component. + +.. figure:: https://pl-flash-data.s3.amazonaws.com/assets_lightning/docs/images/frontend/panel/panel-lightning-github-runner.gif + :alt: Panel Lightning App running models on github + + Panel Lightning App running models on GitHub + +---- + +********** +Next Steps +********** + +.. raw:: html + +
+
+ +.. displayitem:: + :header: Add a web user interface (UI) + :description: Users who want to add a UI to their Lightning Apps + :col_css: col-md-6 + :button_link: ../index.html + :height: 150 + :tag: intermediate + +.. raw:: html + +
+
diff --git a/docs/source-app/workflows/build_lightning_component/intermediate.rst b/docs/source-app/workflows/build_lightning_component/intermediate.rst index 871224ba4fdee..2533cbac35c77 100644 --- a/docs/source-app/workflows/build_lightning_component/intermediate.rst +++ b/docs/source-app/workflows/build_lightning_component/intermediate.rst @@ -9,8 +9,9 @@ Develop a Lightning Component (intermediate) ***************************** Add a web user interface (UI) ***************************** -Every Lightning Component can have its own user interface (UI). Lightning Components support any kind -of UI interface such as react.js, vue.js, streamlit, gradio, dash, web urls, etc...(`full list here <../add_web_ui/index.html>`_). +Every lightning component can have its own user interface (UI). Lightning components support any kind +of UI interface such as dash, gradio, panel, react.js, streamlit, vue.js, web urls, +etc...(`full list here <../add_web_ui/index.html>`_). Let's say that we have a user interface defined in html: diff --git a/requirements/app/ui.txt b/requirements/app/ui.txt index f0e4b2cdef471..1fb2214f836c8 100644 --- a/requirements/app/ui.txt +++ b/requirements/app/ui.txt @@ -1 +1,2 @@ streamlit>=1.3.1, <=1.11.1 +panel>=0.12, <=0.13.1 diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 1e74509e23ad9..18a3e4ac8223d 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -4,7 +4,17 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). -## [0.6.0] - 2022-08-DD + +## [0.7.0] - 2022-MM-DD + +### Added + +- Adds `PanelFrontend` to easily create complex UI in Python ([#13531](https://github.com/Lightning-AI/lightning/pull/13531)) + + + + +## [0.6.0] - 2022-08-23 ### Added diff --git a/src/lightning_app/frontend/__init__.py b/src/lightning_app/frontend/__init__.py index ed8c6abb9463a..955df9d375d3f 100644 --- a/src/lightning_app/frontend/__init__.py +++ b/src/lightning_app/frontend/__init__.py @@ -1,5 +1,6 @@ from lightning_app.frontend.frontend import Frontend +from lightning_app.frontend.panel import PanelFrontend from lightning_app.frontend.stream_lit import StreamlitFrontend from lightning_app.frontend.web import StaticWebFrontend -__all__ = ["Frontend", "StaticWebFrontend", "StreamlitFrontend"] +__all__ = ["Frontend", "PanelFrontend", "StaticWebFrontend", "StreamlitFrontend"] diff --git a/src/lightning_app/frontend/panel/__init__.py b/src/lightning_app/frontend/panel/__init__.py new file mode 100644 index 0000000000000..ba76dd1dce1eb --- /dev/null +++ b/src/lightning_app/frontend/panel/__init__.py @@ -0,0 +1,6 @@ +"""The PanelFrontend and AppStateWatcher make it easy to create Lightning Apps with the Panel data app +framework.""" +from lightning_app.frontend.panel.app_state_watcher import AppStateWatcher +from lightning_app.frontend.panel.panel_frontend import PanelFrontend + +__all__ = ["PanelFrontend", "AppStateWatcher"] diff --git a/src/lightning_app/frontend/panel/app_state_comm.py b/src/lightning_app/frontend/panel/app_state_comm.py new file mode 100644 index 0000000000000..f7d9c01e7dc2d --- /dev/null +++ b/src/lightning_app/frontend/panel/app_state_comm.py @@ -0,0 +1,86 @@ +"""The watch_app_state function enables us to trigger a callback function when ever the app state changes.""" +# Todo: Refactor with Streamlit +# Note: It would be nice one day to just watch changes within the Flow scope instead of whole app +from __future__ 
import annotations + +import asyncio +import logging +import os +import threading +from typing import Callable + +import websockets + +from lightning_app.core.constants import APP_SERVER_PORT + +_logger = logging.getLogger(__name__) + +_CALLBACKS = [] +_THREAD: None | threading.Thread = None + + +def _get_ws_port(): + if "LIGHTNING_APP_STATE_URL" in os.environ: + return 8080 + return APP_SERVER_PORT + + +def _get_ws_url(): + port = _get_ws_port() + return f"ws://localhost:{port}/api/v1/ws" + + +def _run_callbacks(): + for callback in _CALLBACKS: + callback() + + +def _target_fn(): + async def update_fn(): + ws_url = _get_ws_url() + _logger.debug("connecting to web socket %s", ws_url) + async with websockets.connect(ws_url) as websocket: # pylint: disable=no-member + while True: + await websocket.recv() + # Note: I have not seen use cases where the two lines below are needed + # Changing '< 0.2' to '< 1' makes the App very sluggish to the end user + # Also the implementation can cause the App state to lag behind because only 1 update + # is received per 0.2 second (or 1 second). + # while (time.time() - last_updated) < 0.2: + # time.sleep(0.05) + + # Todo: Add some kind of throttling. If 10 messages are received within 100ms then + # there is no need to trigger the app state changed, request state and update + # 10 times. + _logger.debug("App State Changed. Running callbacks") + _run_callbacks() + + asyncio.run(update_fn()) + + +def _start_websocket(): + global _THREAD # pylint: disable=global-statement + if not _THREAD: + _logger.debug("Starting the watch_app_state thread.") + _THREAD = threading.Thread(target=_target_fn) + _THREAD.setDaemon(True) + _THREAD.start() + _logger.debug("thread started") + + +def watch_app_state(callback: Callable): + """Start the process that serves the UI at the given hostname and port number. + + Arguments: + callback: A function to run when the App state changes. Must be thread safe. + + Example: + + .. code-block:: python + + def handle_state_change(): + print("The App State changed.") + watch_app_state(handle_state_change) + """ + _CALLBACKS.append(callback) + _start_websocket() diff --git a/src/lightning_app/frontend/panel/app_state_watcher.py b/src/lightning_app/frontend/panel/app_state_watcher.py new file mode 100644 index 0000000000000..49eee09ea80fb --- /dev/null +++ b/src/lightning_app/frontend/panel/app_state_watcher.py @@ -0,0 +1,106 @@ +"""The AppStateWatcher enables a Frontend to. + +- subscribe to App state changes +- to access and change the App state. + +This is particularly useful for the PanelFrontend but can be used by other Frontends too. +""" +from __future__ import annotations + +import logging +import os + +from lightning_app.frontend.panel.app_state_comm import watch_app_state +from lightning_app.frontend.utils import _get_flow_state +from lightning_app.utilities.imports import _is_param_available, requires +from lightning_app.utilities.state import AppState + +_logger = logging.getLogger(__name__) + + +if _is_param_available(): + from param import ClassSelector, edit_constant, Parameterized +else: + Parameterized = object + ClassSelector = dict + + +class AppStateWatcher(Parameterized): + """The AppStateWatcher enables a Frontend to: + + - Subscribe to any App state changes. + - To access and change the App state from the UI. + + This is particularly useful for the PanelFrontend, but can be used by + other Frontend's too. + + Example: + + .. 
code-block:: python + + import param + + app = AppStateWatcher() + + app.state.counter = 1 + + + @param.depends(app.param.state, watch=True) + def update(state): + print(f"The counter was updated to {state.counter}") + + + app.state.counter += 1 + + This would print ``The counter was updated to 2``. + + The AppStateWatcher is built on top of Param which is a framework like dataclass, attrs and + Pydantic which additionally provides powerful and unique features for building reactive apps. + + Please note the AppStateWatcher is a singleton, i.e. only one instance is instantiated + """ + + state: AppState = ClassSelector( + class_=AppState, + constant=True, + doc="The AppState holds the state of the app reduced to the scope of the Flow", + ) + + def __new__(cls): + # This makes the AppStateWatcher a *singleton*. + # The AppStateWatcher is a singleton to minimize the number of requests etc.. + if not hasattr(cls, "_instance"): + cls._instance = super().__new__(cls) + return cls._instance + + @requires("param") + def __init__(self): + # It's critical to initialize only once + # See https://github.com/holoviz/param/issues/643 + if not hasattr(self, "_initialized"): + super().__init__(name="singleton") + self._start_watching() + self.param.state.allow_None = False + self._initialized = True + + # The below was observed when using mocks during testing + if not self.state: + raise Exception(".state has not been set.") + if not self.state._state: + raise Exception(".state._state has not been set.") + + def _start_watching(self): + # Create a thread listening to state changes. + watch_app_state(self._update_flow_state) + self._update_flow_state() + + def _get_flow_state(self) -> AppState: + flow = os.environ["LIGHTNING_FLOW_NAME"] + return _get_flow_state(flow) + + def _update_flow_state(self): + # Todo: Consider whether to only update if ._state changed + # This might be much more performant. + with edit_constant(self): + self.state = self._get_flow_state() + _logger.debug("Requested App State.") diff --git a/src/lightning_app/frontend/panel/panel_frontend.py b/src/lightning_app/frontend/panel/panel_frontend.py new file mode 100644 index 0000000000000..d89ed898751be --- /dev/null +++ b/src/lightning_app/frontend/panel/panel_frontend.py @@ -0,0 +1,171 @@ +"""The PanelFrontend wraps your Panel code in your LightningFlow.""" +from __future__ import annotations + +import inspect +import logging +import os +import pathlib +import subprocess +import sys +from typing import Callable, TextIO + +from lightning_app.frontend.frontend import Frontend +from lightning_app.frontend.utils import _get_frontend_environment +from lightning_app.utilities.cloud import is_running_in_cloud +from lightning_app.utilities.imports import requires +from lightning_app.utilities.log import get_frontend_logfile + +_logger = logging.getLogger("PanelFrontend") + + +def has_panel_autoreload() -> bool: + """Returns True if the PANEL_AUTORELOAD environment variable is set to 'yes' or 'true'. + + Please note the casing of value does not matter + """ + return os.environ.get("PANEL_AUTORELOAD", "no").lower() in ["yes", "y", "true"] + + +class PanelFrontend(Frontend): + """The PanelFrontend enables you to serve Panel code as a Frontend for your LightningFlow. + + To use this frontend, you must first install the `panel` package: + + .. code-block:: bash + + pip install panel + + Example: + + `panel_app_basic.py` + + .. code-block:: python + + import panel as pn + + pn.panel("Hello **Panel ⚡** World").servable() + + `app_basic.py` + + .. 
code-block:: python + + import lightning as L + from lightning.app.frontend.panel import PanelFrontend + + + class LitPanel(L.LightningFlow): + def configure_layout(self): + return PanelFrontend("panel_app_basic.py") + + + class LitApp(L.LightningFlow): + def __init__(self): + super().__init__() + self.lit_panel = LitPanel() + + def configure_layout(self): + return {"name": "home", "content": self.lit_panel} + + + app = L.LightningApp(LitApp()) + + You can start the Lightning server with Panel autoreload by setting the `PANEL_AUTORELOAD` + environment variable to 'yes': `AUTORELOAD=yes lightning run app app_basic.py`. + + Args: + entry_point: A pure function or the path to a .py or .ipynb file. + The function must be a pure function that contains your Panel code. + The function can optionally accept an `AppStateWatcher` argument. + + Raises: + TypeError: Raised if the entry_point is a class method + """ + + @requires("panel") + def __init__(self, entry_point: Callable | str): + super().__init__() + + if inspect.ismethod(entry_point): + raise TypeError( + "The `PanelFrontend` doesn't support `entry_point` being a method. Please, use a pure function." + ) + + self.entry_point = entry_point + self._process: None | subprocess.Popen = None + self._log_files: dict[str, TextIO] = {} + _logger.debug("PanelFrontend Frontend with %s is initialized.", entry_point) + + def start_server(self, host: str, port: int) -> None: + _logger.debug("PanelFrontend starting server on %s:%s", host, port) + + # 1: Prepare environment variables and arguments. + env = _get_frontend_environment( + self.flow.name, + self.entry_point, + port, + host, + ) + command = self._get_popen_args(host, port) + + if is_running_in_cloud(): + self._open_log_files() + + self._process = subprocess.Popen(command, env=env, **self._log_files) # pylint: disable=consider-using-with + + def stop_server(self) -> None: + if self._process is None: + raise RuntimeError("Server is not running. Call `PanelFrontend.start_server()` first.") + self._process.kill() + self._close_log_files() + + def _close_log_files(self): + for file_ in self._log_files.values(): + if not file_.closed: + file_.close() + self._log_files = {} + + def _open_log_files(self) -> None: + # Don't log to file when developing locally. Makes it harder to debug. + self._close_log_files() + + std_err_out = get_frontend_logfile("error.log") + std_out_out = get_frontend_logfile("output.log") + stderr = std_err_out.open("wb") + stdout = std_out_out.open("wb") + self._log_files = {"stdout": stderr, "stderr": stdout} + + def _get_popen_args(self, host: str, port: int) -> list: + if callable(self.entry_point): + path = str(pathlib.Path(__file__).parent / "panel_serve_render_fn.py") + else: + path = pathlib.Path(self.entry_point) + + abs_path = str(path) + # The app is served at http://localhost:{port}/{flow}/{entry_point} + # Lightning embeds http://localhost:{port}/{flow} but this redirects to the above and + # seems to work fine. 
+ command = [ + sys.executable, + "-m", + "panel", + "serve", + abs_path, + "--port", + str(port), + "--address", + host, + "--prefix", + self.flow.name, + "--allow-websocket-origin", + _get_allowed_hosts(), + ] + if has_panel_autoreload(): + command.append("--autoreload") + _logger.debug("PanelFrontend command %s", command) + return command + + +def _get_allowed_hosts() -> str: + """Returns a comma separated list of host[:port] that should be allowed to connect.""" + # TODO: Enable only lightning.ai domain in the cloud + return "*" diff --git a/src/lightning_app/frontend/panel/panel_serve_render_fn.py b/src/lightning_app/frontend/panel/panel_serve_render_fn.py new file mode 100644 index 0000000000000..7aff3d5c3e601 --- /dev/null +++ b/src/lightning_app/frontend/panel/panel_serve_render_fn.py @@ -0,0 +1,52 @@ +"""This file gets run by Python to launch a Panel Server with Lightning. + +We will call the ``render_fn`` that the user provided to the PanelFrontend. + +It requires the following environment variables to be set + + +- LIGHTNING_RENDER_FUNCTION +- LIGHTNING_RENDER_MODULE_FILE + +Example: + +.. code-block:: bash + + python panel_serve_render_fn +""" +import inspect +import os +import pydoc +from typing import Callable + +from lightning_app.frontend.panel.app_state_watcher import AppStateWatcher + + +def _get_render_fn_from_environment(render_fn_name: str, render_fn_module_file: str) -> Callable: + """Returns the render_fn function to serve in the Frontend.""" + module = pydoc.importfile(render_fn_module_file) + return getattr(module, render_fn_name) + + +def _get_render_fn(): + render_fn_name = os.environ["LIGHTNING_RENDER_FUNCTION"] + render_fn_module_file = os.environ["LIGHTNING_RENDER_MODULE_FILE"] + render_fn = _get_render_fn_from_environment(render_fn_name, render_fn_module_file) + if inspect.signature(render_fn).parameters: + + def _render_fn_wrapper(): + app = AppStateWatcher() + return render_fn(app) + + return _render_fn_wrapper + return render_fn + + +if __name__.startswith("bokeh"): + import panel as pn + + # I use caching for efficiency reasons. 
It shaves off 10ms from having + # to get_render_fn_from_environment every time + if "lightning_render_fn" not in pn.state.cache: + pn.state.cache["lightning_render_fn"] = _get_render_fn() + pn.state.cache["lightning_render_fn"]() diff --git a/src/lightning_app/frontend/streamlit_base.py b/src/lightning_app/frontend/streamlit_base.py index af2a8314e07bd..c57ad2f9f9808 100644 --- a/src/lightning_app/frontend/streamlit_base.py +++ b/src/lightning_app/frontend/streamlit_base.py @@ -4,9 +4,9 @@ """ import os import pydoc -from typing import Callable, Union +from typing import Callable -from lightning_app.core.flow import LightningFlow +from lightning_app.frontend.utils import _reduce_to_flow_scope from lightning_app.utilities.app_helpers import StreamLitStatePlugin from lightning_app.utilities.state import AppState @@ -20,19 +20,10 @@ def _get_render_fn_from_environment() -> Callable: return getattr(module, render_fn_name) -def _app_state_to_flow_scope(state: AppState, flow: Union[str, LightningFlow]) -> AppState: - """Returns a new AppState with the scope reduced to the given flow, as if the given flow as the root.""" - flow_name = flow.name if isinstance(flow, LightningFlow) else flow - flow_name_parts = flow_name.split(".")[1:] # exclude root - flow_state = state - for part in flow_name_parts: - flow_state = getattr(flow_state, part) - return flow_state - - def main(): + """Run the render_fn with the current flow_state.""" # Fetch the information of which flow attaches to this streamlit instance - flow_state = _app_state_to_flow_scope(app_state, flow=os.environ["LIGHTNING_FLOW_NAME"]) + flow_state = _reduce_to_flow_scope(app_state, flow=os.environ["LIGHTNING_FLOW_NAME"]) # Call the provided render function. # Pass it the state, scoped to the current flow. diff --git a/src/lightning_app/frontend/utils.py b/src/lightning_app/frontend/utils.py new file mode 100644 index 0000000000000..1795445ef141f --- /dev/null +++ b/src/lightning_app/frontend/utils.py @@ -0,0 +1,57 @@ +"""Utility functions for lightning Frontends.""" +from __future__ import annotations + +import inspect +import os +from typing import Callable + +from lightning_app.core.flow import LightningFlow +from lightning_app.utilities.state import AppState + + +def _reduce_to_flow_scope(state: AppState, flow: str | LightningFlow) -> AppState: + """Returns a new AppState with the scope reduced to the given flow.""" + flow_name = flow.name if isinstance(flow, LightningFlow) else flow + flow_name_parts = flow_name.split(".")[1:] # exclude root + flow_state = state + for part in flow_name_parts: + flow_state = getattr(flow_state, part) + return flow_state + + +def _get_flow_state(flow: str) -> AppState: + """Returns an AppState scoped to the current Flow. + + Returns: + AppState: An AppState scoped to the current Flow. + """ + app_state = AppState() + app_state._request_state() # pylint: disable=protected-access + flow_state = _reduce_to_flow_scope(app_state, flow) + return flow_state + + +def _get_frontend_environment(flow: str, render_fn_or_file: Callable | str, port: int, host: str) -> os._Environ: + """Returns an _Environ with the environment variables for serving a Frontend app set. 
+ + Args: + flow: The name of the flow, for example root.lit_frontend + render_fn_or_file: A function to render + port: The port number, for example 54321 + host: The host, for example 'localhost' + + Returns: + os._Environ: An environment + """ + env = os.environ.copy() + env["LIGHTNING_FLOW_NAME"] = flow + env["LIGHTNING_RENDER_PORT"] = str(port) + env["LIGHTNING_RENDER_ADDRESS"] = str(host) + + if isinstance(render_fn_or_file, str): + env["LIGHTNING_RENDER_FILE"] = render_fn_or_file + else: + env["LIGHTNING_RENDER_FUNCTION"] = render_fn_or_file.__name__ + env["LIGHTNING_RENDER_MODULE_FILE"] = inspect.getmodule(render_fn_or_file).__file__ + + return env diff --git a/src/lightning_app/utilities/imports.py b/src/lightning_app/utilities/imports.py index 90c149e551569..090af1b879340 100644 --- a/src/lightning_app/utilities/imports.py +++ b/src/lightning_app/utilities/imports.py @@ -105,6 +105,11 @@ def _is_streamlit_available() -> bool: return _module_available("streamlit") +@functools.lru_cache() +def _is_param_available() -> bool: + return _module_available("param") + + @functools.lru_cache() def _is_streamlit_tensorboard_available() -> bool: return _module_available("streamlit_tensorboard") diff --git a/src/lightning_app/utilities/state.py b/src/lightning_app/utilities/state.py index 300bca34533df..378c3e20ec14e 100644 --- a/src/lightning_app/utilities/state.py +++ b/src/lightning_app/utilities/state.py @@ -66,7 +66,7 @@ def __init__( my_affiliation: Tuple[str, ...] = None, plugin: Optional[BaseStatePlugin] = None, ) -> None: - """The AppState class enable streamlit user to interact their application state. + """The AppState class enables Frontend users to interact with their application state. When the state isn't defined, it would be pulled from the app REST API Server. If the state gets modified by the user, the new state would be sent to the API Server. 
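A minimal, illustrative sketch of how a frontend render function might read and write the scoped ``AppState`` described above (the ``count`` attribute and the ``render_fn`` name are assumptions for illustration, not part of this patch):

.. code-block:: python

    from lightning_app.utilities.state import AppState


    def render_fn(state: AppState) -> None:
        # Read a variable that the corresponding LightningFlow is assumed to define in __init__
        print(f"current count: {state.count}")

        # Writing to the state sends the change back to the app's REST API server,
        # which is how the UI communicates updates back to the Flow.
        state.count += 1
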
diff --git a/tests/tests_app/frontend/conftest.py b/tests/tests_app/frontend/conftest.py new file mode 100644 index 0000000000000..673fcf190508e --- /dev/null +++ b/tests/tests_app/frontend/conftest.py @@ -0,0 +1,73 @@ +"""Test configuration.""" +# pylint: disable=protected-access +from unittest import mock + +import pytest + +FLOW_SUB = "lit_flow" +FLOW = f"root.{FLOW_SUB}" +PORT = 61896 + +FLOW_STATE = { + "vars": { + "_paths": {}, + "_layout": {"target": f"http://localhost:{PORT}/{FLOW}"}, + }, + "calls": {}, + "flows": {}, + "works": {}, + "structures": {}, + "changes": {}, +} + +APP_STATE = { + "vars": {"_paths": {}, "_layout": [{"name": "home", "content": FLOW}]}, + "calls": {}, + "flows": { + FLOW_SUB: FLOW_STATE, + }, + "works": {}, + "structures": {}, + "changes": {}, + "app_state": {"stage": "running"}, +} + + +def _request_state(self): + _state = APP_STATE + self._store_state(_state) + + +@pytest.fixture() +def flow(): + return FLOW + + +@pytest.fixture(autouse=True, scope="module") +def mock_request_state(): + """Avoid requests to the api.""" + with mock.patch("lightning_app.utilities.state.AppState._request_state", _request_state): + yield + + +def do_nothing(): + """Be lazy!""" + + +@pytest.fixture(autouse=True, scope="module") +def mock_start_websocket(): + """Avoid starting the websocket.""" + with mock.patch("lightning_app.frontend.panel.app_state_comm._start_websocket", do_nothing): + yield + + +@pytest.fixture +def app_state_state(): + """Returns an AppState dict.""" + return APP_STATE.copy() + + +@pytest.fixture +def flow_state_state(): + """Returns an AppState dict scoped to the flow.""" + return FLOW_STATE.copy() diff --git a/tests/tests_app/frontend/panel/__init__.py b/tests/tests_app/frontend/panel/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/tests_app/frontend/panel/app_panel.py b/tests/tests_app/frontend/panel/app_panel.py new file mode 100644 index 0000000000000..6b54261e37e7d --- /dev/null +++ b/tests/tests_app/frontend/panel/app_panel.py @@ -0,0 +1,5 @@ +if __name__ == "__main__": + + import panel as pn + + pn.pane.Markdown("# Panel App").servable() diff --git a/tests/tests_app/frontend/panel/test_app_state_comm.py b/tests/tests_app/frontend/panel/test_app_state_comm.py new file mode 100644 index 0000000000000..3766a1ccce564 --- /dev/null +++ b/tests/tests_app/frontend/panel/test_app_state_comm.py @@ -0,0 +1,39 @@ +"""The watch_app_state function enables us to trigger a callback function whenever the App state changes.""" +import os +from unittest import mock + +from lightning_app.core.constants import APP_SERVER_PORT +from lightning_app.frontend.panel.app_state_comm import _get_ws_url, _run_callbacks, watch_app_state + +FLOW_SUB = "lit_flow" +FLOW = f"root.{FLOW_SUB}" + + +def do_nothing(): + """Be lazy!""" + + +def test_get_ws_url_when_local(): + """The websocket uses port APP_SERVER_PORT when local.""" + assert _get_ws_url() == f"ws://localhost:{APP_SERVER_PORT}/api/v1/ws" + + +@mock.patch.dict(os.environ, {"LIGHTNING_APP_STATE_URL": "some_url"}) +def test_get_ws_url_when_cloud(): + """The websocket uses port 8080 when LIGHTNING_APP_STATE_URL is set.""" + assert _get_ws_url() == "ws://localhost:8080/api/v1/ws" + + +@mock.patch.dict(os.environ, {"LIGHTNING_FLOW_NAME": "FLOW"}) +def test_watch_app_state(): + """We can watch the App state and a callback function will be run when it changes.""" + callback = mock.MagicMock() + # When + watch_app_state(callback) + + # Here we would like to send messages using the web 
socket + # For testing the web socket is not started. See conftest.py + # So we need to manually trigger _run_callbacks here + _run_callbacks() + # Then + callback.assert_called_once() diff --git a/tests/tests_app/frontend/panel/test_app_state_watcher.py b/tests/tests_app/frontend/panel/test_app_state_watcher.py new file mode 100644 index 0000000000000..25b99c8b25922 --- /dev/null +++ b/tests/tests_app/frontend/panel/test_app_state_watcher.py @@ -0,0 +1,85 @@ +"""The AppStateWatcher enables a Frontend to. + +- subscribe to App state changes. +- to access and change the App state. + +This is particularly useful for the PanelFrontend, but can be used by other Frontends too. +""" +# pylint: disable=protected-access +import os +from unittest import mock + +import pytest + +from lightning_app.frontend.panel.app_state_watcher import AppStateWatcher +from lightning_app.utilities.state import AppState + +FLOW_SUB = "lit_flow" +FLOW = f"root.{FLOW_SUB}" +PORT = 61896 + + +@pytest.fixture(autouse=True) +def mock_settings_env_vars(): + """Set the LIGHTNING environment variables.""" + with mock.patch.dict( + os.environ, + { + "LIGHTNING_FLOW_NAME": FLOW, + "LIGHTNING_RENDER_ADDRESS": "localhost", + "LIGHTNING_RENDER_PORT": f"{PORT}", + }, + ): + yield + + +def test_init(flow_state_state: dict): + """We can instantiate the AppStateWatcher. + + - the .state is set + - the .state is scoped to the flow state + """ + # When + app = AppStateWatcher() + # Needed as AppStateWatcher is singleton and might have been + # instantiated and the state changed in other tests + app._update_flow_state() + + # Then + assert isinstance(app.state, AppState) + assert app.state._state == flow_state_state + + +def test_update_flow_state(flow_state_state: dict): + """We can update the state. + + - the .state is scoped to the flow state + """ + app = AppStateWatcher() + org_state = app.state + app._update_flow_state() + assert app.state is not org_state + assert app.state._state == flow_state_state + + +def test_is_singleton(): + """The AppStateWatcher is a singleton for efficiency reasons. + + Its key that __new__ and __init__ of AppStateWatcher is only called once. 
See + https://github.com/holoviz/param/issues/643 + """ + # When + app1 = AppStateWatcher() + name1 = app1.name + state1 = app1.state + + app2 = AppStateWatcher() + name2 = app2.name + state2 = app2.state + + # Then + assert app1 is app2 + assert name1 == name2 + assert app1.name == name2 + assert state1 is state2 + assert app1.state is state2 diff --git a/tests/tests_app/frontend/panel/test_panel_frontend.py b/tests/tests_app/frontend/panel/test_panel_frontend.py new file mode 100644 index 0000000000000..c31018378772c --- /dev/null +++ b/tests/tests_app/frontend/panel/test_panel_frontend.py @@ -0,0 +1,164 @@ +"""The PanelFrontend wraps your Panel code in your LightningFlow.""" +# pylint: disable=protected-access, too-few-public-methods +import os +import runpy +import sys +from unittest import mock +from unittest.mock import Mock + +import pytest + +from lightning_app import LightningFlow +from lightning_app.frontend.panel import panel_serve_render_fn, PanelFrontend +from lightning_app.frontend.panel.panel_frontend import has_panel_autoreload +from lightning_app.utilities.state import AppState + + +def test_stop_server_not_running(): + """If the server is not running but stopped an Exception should be raised.""" + frontend = PanelFrontend(entry_point=Mock()) + with pytest.raises(RuntimeError, match="Server is not running."): + frontend.stop_server() + + +def _noop_render_fn(_): + pass + + +class MockFlow(LightningFlow): + """Test Flow.""" + + @property + def name(self): + """Return name.""" + return "root.my.flow" + + def run(self): # pylint: disable=arguments-differ + """Be lazy!""" + + +@mock.patch("lightning_app.frontend.panel.panel_frontend.subprocess") +def test_panel_frontend_start_stop_server(subprocess_mock): + """Test that `PanelFrontend.start_server()` invokes subprocess.Popen with the right parameters.""" + # Given + frontend = PanelFrontend(entry_point=_noop_render_fn) + frontend.flow = MockFlow() + # When + frontend.start_server(host="hostname", port=1111) + # Then + subprocess_mock.Popen.assert_called_once() + + env_variables = subprocess_mock.method_calls[0].kwargs["env"] + call_args = subprocess_mock.method_calls[0].args[0] + assert call_args == [ + sys.executable, + "-m", + "panel", + "serve", + panel_serve_render_fn.__file__, + "--port", + "1111", + "--address", + "hostname", + "--prefix", + "root.my.flow", + "--allow-websocket-origin", + "*", + ] + + assert env_variables["LIGHTNING_FLOW_NAME"] == "root.my.flow" + assert env_variables["LIGHTNING_RENDER_ADDRESS"] == "hostname" + assert env_variables["LIGHTNING_RENDER_FUNCTION"] == "_noop_render_fn" + assert env_variables["LIGHTNING_RENDER_MODULE_FILE"] == __file__ + assert env_variables["LIGHTNING_RENDER_PORT"] == "1111" + + assert "LIGHTNING_FLOW_NAME" not in os.environ + assert "LIGHTNING_RENDER_FUNCTION" not in os.environ + assert "LIGHTNING_RENDER_MODULE_FILE" not in os.environ + assert "LIGHTNING_RENDER_MODULE_PORT" not in os.environ + assert "LIGHTNING_RENDER_MODULE_ADDRESS" not in os.environ + # When + frontend.stop_server() + # Then + subprocess_mock.Popen().kill.assert_called_once() + + +def _call_me(state): + assert isinstance(state, AppState) + print(state) + + +@mock.patch.dict( + os.environ, + { + "LIGHTNING_FLOW_NAME": "root", + "LIGHTNING_RENDER_FUNCTION": "_call_me", + "LIGHTNING_RENDER_MODULE_FILE": __file__, + "LIGHTNING_RENDER_ADDRESS": "127.0.0.1", + "LIGHTNING_RENDER_PORT": "61896", + }, +) +def test_panel_wrapper_calls_entry_point(*_): + """Run the panel_serve_entry_point.""" + 
runpy.run_module("lightning_app.frontend.panel.panel_serve_render_fn") + + +def test_method_exception(): + """The PanelFrontend does not support entry_point being a method and should raise an Exception.""" + + class _DummyClass: + def _render_fn(self): + pass + + with pytest.raises(TypeError, match="being a method"): + PanelFrontend(entry_point=_DummyClass()._render_fn) + + +def test_open_close_log_files(): + """We can open and close the log files.""" + frontend = PanelFrontend(_noop_render_fn) + assert not frontend._log_files + # When + frontend._open_log_files() + # Then + stdout = frontend._log_files["stdout"] + stderr = frontend._log_files["stderr"] + assert not stdout.closed + assert not stderr.closed + + # When + frontend._close_log_files() + # Then + assert not frontend._log_files + assert stdout.closed + assert stderr.closed + + # We can close even if not open + frontend._close_log_files() + + +@pytest.mark.parametrize( + ["value", "expected"], + ( + ("Yes", True), + ("yes", True), + ("YES", True), + ("Y", True), + ("y", True), + ("True", True), + ("true", True), + ("TRUE", True), + ("No", False), + ("no", False), + ("NO", False), + ("N", False), + ("n", False), + ("False", False), + ("false", False), + ("FALSE", False), + ), +) +def test_has_panel_autoreload(value, expected): + """We can get and set autoreload using the environment variable PANEL_AUTORELOAD.""" + with mock.patch.dict(os.environ, {"PANEL_AUTORELOAD": value}): + assert has_panel_autoreload() == expected diff --git a/tests/tests_app/frontend/panel/test_panel_serve_render_fn.py b/tests/tests_app/frontend/panel/test_panel_serve_render_fn.py new file mode 100644 index 0000000000000..810367fe15934 --- /dev/null +++ b/tests/tests_app/frontend/panel/test_panel_serve_render_fn.py @@ -0,0 +1,79 @@ +"""The panel_serve_render_fn_or_file file gets run by Python to launch a Panel Server with Lightning. + +These tests are for serving a render_fn function. +""" +import inspect +import os +from unittest import mock + +import pytest + +from lightning_app.frontend.panel.app_state_watcher import AppStateWatcher +from lightning_app.frontend.panel.panel_serve_render_fn import _get_render_fn, _get_render_fn_from_environment + + +@pytest.fixture(autouse=True) +def _mock_settings_env_vars(): + with mock.patch.dict( + os.environ, + { + "LIGHTNING_FLOW_NAME": "root.lit_flow", + "LIGHTNING_RENDER_ADDRESS": "localhost", + "LIGHTNING_RENDER_MODULE_FILE": __file__, + "LIGHTNING_RENDER_PORT": "61896", + }, + ): + yield + + +def render_fn(app): + """Test render_fn function with app args.""" + return app + + +@mock.patch.dict( + os.environ, + { + "LIGHTNING_RENDER_FUNCTION": "render_fn", + }, +) +def test_get_view_fn_args(): + """We have a helper get_view_fn function that create a function for our view. + + If the render_fn provides an argument an AppStateWatcher is provided as argument + """ + result = _get_render_fn() + assert isinstance(result(), AppStateWatcher) + + +def render_fn_no_args(): + """Test function with no arguments.""" + return "no_args" + + +@mock.patch.dict( + os.environ, + { + "LIGHTNING_RENDER_FUNCTION": "render_fn_no_args", + }, +) +def test_get_view_fn_no_args(): + """We have a helper get_view_fn function that create a function for our view. 
+ + If the render_fn provides an argument an AppStateWatcher is provided as argument + """ + result = _get_render_fn() + assert result() == "no_args" + + +def render_fn_2(): + """Do nothing.""" + + +def test_get_render_fn_from_environment(): + """We have a method to get the render_fn from the environment.""" + # When + result = _get_render_fn_from_environment("render_fn_2", __file__) + # Then + assert result.__name__ == render_fn_2.__name__ + assert inspect.getmodule(result).__file__ == __file__ diff --git a/tests/tests_app/frontend/test_utils.py b/tests/tests_app/frontend/test_utils.py new file mode 100644 index 0000000000000..711eac464d830 --- /dev/null +++ b/tests/tests_app/frontend/test_utils.py @@ -0,0 +1,42 @@ +"""We have some utility functions that can be used across frontends.""" + +from lightning_app.frontend.utils import _get_flow_state, _get_frontend_environment +from lightning_app.utilities.state import AppState + + +def test_get_flow_state(flow_state_state: dict, flow): + """We have a method to get an AppState scoped to the Flow state.""" + # When + flow_state = _get_flow_state(flow) + # Then + assert isinstance(flow_state, AppState) + assert flow_state._state == flow_state_state # pylint: disable=protected-access + + +def some_fn(_): + """Be lazy!""" + + +def test_get_frontend_environment_fn(): + """We have a utility function to get the frontend render_fn environment.""" + # When + env = _get_frontend_environment(flow="root.lit_frontend", render_fn_or_file=some_fn, host="myhost", port=1234) + # Then + assert env["LIGHTNING_FLOW_NAME"] == "root.lit_frontend" + assert env["LIGHTNING_RENDER_ADDRESS"] == "myhost" + assert env["LIGHTNING_RENDER_FUNCTION"] == "some_fn" + assert env["LIGHTNING_RENDER_MODULE_FILE"] == __file__ + assert env["LIGHTNING_RENDER_PORT"] == "1234" + + +def test_get_frontend_environment_file(): + """We have a utility function to get the frontend render_fn environment.""" + # When + env = _get_frontend_environment( + flow="root.lit_frontend", render_fn_or_file="app_panel.py", host="myhost", port=1234 + ) + # Then + assert env["LIGHTNING_FLOW_NAME"] == "root.lit_frontend" + assert env["LIGHTNING_RENDER_ADDRESS"] == "myhost" + assert env["LIGHTNING_RENDER_FILE"] == "app_panel.py" + assert env["LIGHTNING_RENDER_PORT"] == "1234" diff --git a/tests/tests_app/frontend/utilities/__init__.py b/tests/tests_app/frontend/utilities/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/tests_app/utilities/test_cloud.py b/tests/tests_app/utilities/test_cloud.py index db5a3efdf13bd..573ec46106b84 100644 --- a/tests/tests_app/utilities/test_cloud.py +++ b/tests/tests_app/utilities/test_cloud.py @@ -6,9 +6,11 @@ @mock.patch.dict(os.environ, clear=True) def test_is_running_locally(): + """We can determine if Lightning is running locally.""" assert not is_running_in_cloud() @mock.patch.dict(os.environ, {"LIGHTNING_APP_STATE_URL": "127.0.0.1"}) def test_is_running_cloud(): + """We can determine if Lightning is running in the cloud.""" assert is_running_in_cloud() From 291267c3bff8054ec438960857c9f2fec1d54899 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Tue, 30 Aug 2022 11:51:30 +0200 Subject: [PATCH 012/193] Unify rank zero messaging utilities (#14116) --- .../callbacks/early_stopping.py | 19 +++--- src/pytorch_lightning/utilities/rank_zero.py | 19 ++++-- src/pytorch_lightning/utilities/seed.py | 4 +- .../callbacks/test_early_stopping.py | 5 +- .../tests_pytorch/deprecated_api/__init__.py | 8 --- 
.../tests_pytorch/utilities/test_rank_zero.py | 62 +++++++++---------- tests/tests_pytorch/utilities/test_seed.py | 18 ------ 7 files changed, 56 insertions(+), 79 deletions(-) diff --git a/src/pytorch_lightning/callbacks/early_stopping.py b/src/pytorch_lightning/callbacks/early_stopping.py index 72d8445d84407..87585bb8120d0 100644 --- a/src/pytorch_lightning/callbacks/early_stopping.py +++ b/src/pytorch_lightning/callbacks/early_stopping.py @@ -28,7 +28,7 @@ import pytorch_lightning as pl from pytorch_lightning.callbacks.callback import Callback from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.rank_zero import rank_zero_warn +from pytorch_lightning.utilities.rank_zero import _get_rank, _rank_prefixed_message, rank_zero_warn log = logging.getLogger(__name__) @@ -259,14 +259,9 @@ def _improvement_message(self, current: Tensor) -> str: @staticmethod def _log_info(trainer: Optional["pl.Trainer"], message: str, log_rank_zero_only: bool) -> None: - if trainer: - # ignore logging in non-zero ranks if log_rank_zero_only flag is enabled - if log_rank_zero_only and trainer.global_rank != 0: - return - # if world size is more than one then specify the rank of the process being logged - if trainer.world_size > 1: - log.info(f"[rank: {trainer.global_rank}] {message}") - return - - # if above conditions don't meet and we have to log - log.info(message) + rank = _get_rank(trainer) + if trainer is not None and trainer.world_size <= 1: + rank = None + message = _rank_prefixed_message(message, rank) + if rank is None or not log_rank_zero_only or rank == 0: + log.info(message) diff --git a/src/pytorch_lightning/utilities/rank_zero.py b/src/pytorch_lightning/utilities/rank_zero.py index e2292789c317c..55bdc08930905 100644 --- a/src/pytorch_lightning/utilities/rank_zero.py +++ b/src/pytorch_lightning/utilities/rank_zero.py @@ -20,6 +20,8 @@ from platform import python_version from typing import Any, Callable, Optional, Union +import pytorch_lightning as pl + log = logging.getLogger(__name__) @@ -35,8 +37,9 @@ def wrapped_fn(*args: Any, **kwargs: Any) -> Optional[Any]: return wrapped_fn -# TODO: this should be part of the cluster environment -def _get_rank() -> int: +def _get_rank(trainer: Optional["pl.Trainer"] = None) -> Optional[int]: + if trainer is not None: + return trainer.global_rank # SLURM_PROCID can be set even if SLURM is not managing the multiprocessing, # therefore LOCAL_RANK needs to be checked first rank_keys = ("RANK", "LOCAL_RANK", "SLURM_PROCID", "JSM_NAMESPACE_RANK") @@ -44,11 +47,12 @@ def _get_rank() -> int: rank = os.environ.get(key) if rank is not None: return int(rank) - return 0 + # None to differentiate whether an environment variable was set at all + return None # add the attribute to the function but don't overwrite in case Trainer has already set it -rank_zero_only.rank = getattr(rank_zero_only, "rank", _get_rank()) +rank_zero_only.rank = getattr(rank_zero_only, "rank", _get_rank() or 0) def _info(*args: Any, stacklevel: int = 2, **kwargs: Any) -> None: @@ -97,3 +101,10 @@ class LightningDeprecationWarning(DeprecationWarning): rank_zero_deprecation = partial(rank_zero_warn, category=LightningDeprecationWarning) + + +def _rank_prefixed_message(message: str, rank: Optional[int]) -> str: + if rank is not None: + # specify the rank of the process being logged + return f"[rank: {rank}] {message}" + return message diff --git a/src/pytorch_lightning/utilities/seed.py b/src/pytorch_lightning/utilities/seed.py index 
925337c7845ae..cc9ff6673ef36 100644 --- a/src/pytorch_lightning/utilities/seed.py +++ b/src/pytorch_lightning/utilities/seed.py @@ -24,7 +24,7 @@ import numpy as np import torch -from pytorch_lightning.utilities.rank_zero import _get_rank, rank_zero_only, rank_zero_warn +from pytorch_lightning.utilities.rank_zero import _get_rank, _rank_prefixed_message, rank_zero_only, rank_zero_warn log = logging.getLogger(__name__) @@ -66,7 +66,7 @@ def seed_everything(seed: Optional[int] = None, workers: bool = False) -> int: rank_zero_warn(f"{seed} is not in bounds, numpy accepts from {min_seed_value} to {max_seed_value}") seed = _select_seed_randomly(min_seed_value, max_seed_value) - log.info(f"[rank: {_get_rank()}] Global seed set to {seed}") + log.info(_rank_prefixed_message(f"Global seed set to {seed}", _get_rank())) os.environ["PL_GLOBAL_SEED"] = str(seed) random.seed(seed) np.random.seed(seed) diff --git a/tests/tests_pytorch/callbacks/test_early_stopping.py b/tests/tests_pytorch/callbacks/test_early_stopping.py index 458df2ea23b3f..a3a98027cc12b 100644 --- a/tests/tests_pytorch/callbacks/test_early_stopping.py +++ b/tests/tests_pytorch/callbacks/test_early_stopping.py @@ -472,9 +472,8 @@ def test_early_stopping_squeezes(): (True, 2, 1, None), ], ) -def test_early_stopping_log_info(tmpdir, trainer, log_rank_zero_only, world_size, global_rank, expected_log): - """checks if log.info() gets called with expected message when used within EarlyStopping.""" - +def test_early_stopping_log_info(trainer, log_rank_zero_only, world_size, global_rank, expected_log): + """Checks if log.info() gets called with expected message when used within EarlyStopping.""" # set the global_rank and world_size if trainer is not None # or else always expect the simple logging message if trainer: diff --git a/tests/tests_pytorch/deprecated_api/__init__.py b/tests/tests_pytorch/deprecated_api/__init__.py index 611637d543a7e..6e29ec8b3ab22 100644 --- a/tests/tests_pytorch/deprecated_api/__init__.py +++ b/tests/tests_pytorch/deprecated_api/__init__.py @@ -11,20 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Test deprecated functionality which will be removed in vX.Y.Z.""" -import sys from contextlib import contextmanager from typing import Optional from tests_pytorch.helpers.utils import no_warning_call -def _soft_unimport_module(str_module): - # once the module is imported e.g with parsing with pytest it lives in memory - if str_module in sys.modules: - del sys.modules[str_module] - - @contextmanager def no_deprecated_call(match: Optional[str] = None): with no_warning_call(expected_warning=DeprecationWarning, match=match): diff --git a/tests/tests_pytorch/utilities/test_rank_zero.py b/tests/tests_pytorch/utilities/test_rank_zero.py index ebc827cc46c8a..76fa27926aa39 100644 --- a/tests/tests_pytorch/utilities/test_rank_zero.py +++ b/tests/tests_pytorch/utilities/test_rank_zero.py @@ -12,49 +12,40 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import os -from typing import Mapping +import sys from unittest import mock import pytest - -@pytest.mark.parametrize("env_vars", [{"RANK": "0"}, {"SLURM_PROCID": "0"}, {"JSM_NAMESPACE_RANK": "0"}]) -def test_rank_zero_known_cluster_envs(env_vars: Mapping[str, str]): - """Test that SLURM environment variables are properly checked for rank_zero_only.""" - from pytorch_lightning.utilities.rank_zero import _get_rank, rank_zero_only - - rank_zero_only.rank = _get_rank() - - with mock.patch.dict(os.environ, env_vars): - from pytorch_lightning.utilities.rank_zero import _get_rank, rank_zero_only - - rank_zero_only.rank = _get_rank() - - @rank_zero_only - def foo(): # The return type is optional because on non-zero ranks it will not be called - return 1 - - x = foo() - assert x == 1 +from pytorch_lightning.utilities.rank_zero import _get_rank, _rank_prefixed_message @pytest.mark.parametrize( - "rank_key,rank", [("RANK", "1"), ("SLURM_PROCID", "2"), ("LOCAL_RANK", "3"), ("JSM_NAMESPACE_RANK", "4")] + "env_vars, expected", + [ + ({"RANK": "0"}, 1), + ({"SLURM_PROCID": "0"}, 1), + ({"LOCAL_RANK": "0"}, 1), + ({"JSM_NAMESPACE_RANK": "0"}, 1), + ({}, 1), + ({"RANK": "1"}, None), + ({"SLURM_PROCID": "2"}, None), + ({"LOCAL_RANK": "3"}, None), + ({"JSM_NAMESPACE_RANK": "4"}, None), + ], ) -def test_rank_zero_none_set(rank_key, rank): - """Test that function is not called when rank environment variables are not global zero.""" - - with mock.patch.dict(os.environ, {rank_key: rank}): - from pytorch_lightning.utilities.rank_zero import _get_rank, rank_zero_only - - rank_zero_only.rank = _get_rank() +def test_rank_zero_known_environment_variables(env_vars, expected): + """Test that rank environment variables are properly checked for rank_zero_only.""" + with mock.patch.dict(os.environ, env_vars): + # force module reload to re-trigger the rank_zero_only.rank global computation + sys.modules.pop("pytorch_lightning.utilities.rank_zero", None) + from pytorch_lightning.utilities.rank_zero import rank_zero_only @rank_zero_only def foo(): return 1 - x = foo() - assert x is None + assert foo() == expected @pytest.mark.parametrize( @@ -69,6 +60,13 @@ def foo(): def test_rank_zero_priority(environ, expected_rank): """Test the priority in which the rank gets determined when multiple environment variables are available.""" with mock.patch.dict(os.environ, environ): - from pytorch_lightning.utilities.rank_zero import _get_rank - assert _get_rank() == expected_rank + + +@pytest.mark.parametrize("env_vars", [{"RANK": "0"}, {"RANK": "1"}, {"RANK": "4"}]) +def test_rank_prefixed_message_with_env_vars(env_vars): + with mock.patch.dict(os.environ, env_vars, clear=True): + rank = _get_rank() + message = _rank_prefixed_message("bar", rank) + + assert message == f"[rank: {rank}] bar" diff --git a/tests/tests_pytorch/utilities/test_seed.py b/tests/tests_pytorch/utilities/test_seed.py index c8df824e93b41..2c89883e3c7a1 100644 --- a/tests/tests_pytorch/utilities/test_seed.py +++ b/tests/tests_pytorch/utilities/test_seed.py @@ -1,8 +1,6 @@ import os import random -from typing import Mapping from unittest import mock -from unittest.mock import MagicMock import numpy as np import pytest @@ -116,19 +114,3 @@ def test_backward_compatibility_rng_states_dict(): assert "torch.cuda" in states states.pop("torch.cuda") _set_rng_states(states) - - -@mock.patch("pytorch_lightning.utilities.seed.log.info") -@pytest.mark.parametrize("env_vars", [{"RANK": "0"}, {"RANK": "1"}, {"RANK": "4"}]) -def test_seed_everything_log_info(log_mock: MagicMock, 
env_vars: Mapping[str, str]): - """Test that log message prefix with correct rank info.""" - with mock.patch.dict(os.environ, env_vars, clear=True): - from pytorch_lightning.utilities.rank_zero import _get_rank - - rank = _get_rank() - - seed_utils.seed_everything(123) - - expected_log = f"[rank: {rank}] Global seed set to 123" - - log_mock.assert_called_once_with(expected_log) From ce2c1936627b54b269fda6d3b91e7e6bdca3bd40 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 30 Aug 2022 12:33:58 +0200 Subject: [PATCH 013/193] CI: add e2e cron job (#14402) * add e2e cron job * trigger * workflow_dispatch Co-authored-by: Mansy Co-authored-by: thomas chaton --- .azure/app-cloud-e2e.yml | 6 +- .github/workflows/ci-app-cloud-e2e-test.yml | 139 ++++++++++++++++++++ 2 files changed, 141 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/ci-app-cloud-e2e-test.yml diff --git a/.azure/app-cloud-e2e.yml b/.azure/app-cloud-e2e.yml index a057b84079020..b4225e59138c8 100644 --- a/.azure/app-cloud-e2e.yml +++ b/.azure/app-cloud-e2e.yml @@ -30,8 +30,6 @@ jobs: container: image: mcr.microsoft.com/playwright/python:v1.25.2-focal options: "--shm-size=2g" - timeoutInMinutes: "30" - cancelTimeoutInMinutes: "2" strategy: matrix: 'App: v0_app': @@ -56,8 +54,8 @@ jobs: name: "payload" 'App: commands_and_api': name: "commands_and_api" - workspace: - clean: all + timeoutInMinutes: "30" + cancelTimeoutInMinutes: "2" steps: - script: echo '##vso[task.setvariable variable=local_id]$(System.PullRequest.PullRequestNumber)' diff --git a/.github/workflows/ci-app-cloud-e2e-test.yml b/.github/workflows/ci-app-cloud-e2e-test.yml new file mode 100644 index 0000000000000..c85e245696c23 --- /dev/null +++ b/.github/workflows/ci-app-cloud-e2e-test.yml @@ -0,0 +1,139 @@ +name: cloud-e2e-testing + +# Used to run the e2e tests on lightning.ai +on: + push: + branches: [master, "release/*"] + pull_request: + paths: + - ".github/workflows/ci-app-cloud-e2e-test.yml" + workflow_dispatch: # TODO: add inputs for specific git_head + + schedule: + # At the end of every day + - cron: "0 0 * * *" + +jobs: + app-cloud-e2e: + name: Cloud e2e Test + runs-on: ubuntu-20.04 + strategy: + fail-fast: false + matrix: + app_name: + - v0_app + - boring_app +# - quick_start # TODO: fix this + - template_streamlit_ui + - template_react_ui + - template_jupyterlab + - idle_timeout + - collect_failures + - custom_work_dependencies + - drive + - payload + - commands_and_api + timeout-minutes: 35 + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v4 + with: + python-version: "3.8" + + - name: Get PR ID + id: running + run: echo "::set-output name=ID::$(date +%s)" + + - name: Install dependencies + shell: bash + run: | + pip --version + python -m pip install -r requirements/app/devel.txt --no-cache --quiet --find-links https://download.pytorch.org/whl/cpu/torch_stable.html + + - name: Cache Playwright dependencies + id: playwright-cache + uses: actions/cache@v3 + with: + path: ~/.cache/ms-playwright + key: ${{ runner.os }}-playwright-${{ matrix.app_name }}-${{ hashFiles('requirements/app/base.txt', 'requirements/app/*.txt', 'src/lightning_app/__version__.py') }} + restore-keys: ${{ runner.os }}-playwright-${{ matrix.app_name }}- + + - name: Install Playwright system dependencies + shell: bash + run: | + python -m pip install playwright + python -m playwright install --with-deps + + - name: Install lightning + run: | + pip install -e . 
--find-links https://download.pytorch.org/whl/cpu/torch_stable.html + shell: bash + + #- name: Lightning Install quick-start + # if: ${{ (matrix.app_name == 'quick_start') }} + # shell: bash + # run: | + # python -m lightning install app lightning/quick-start -y + + - name: Clone Template React UI Repo + if: ${{ (matrix.app_name == 'template_react_ui') }} + run: | + git clone https://github.com/Lightning-AI/lightning-template-react examples/app_template_react_ui + shell: bash + + - name: Clone Template Jupyter Lab Repo + if: ${{ (matrix.app_name == 'template_jupyterlab') }} + run: | + git clone https://github.com/Lightning-AI/LAI-lightning-template-jupyterlab-App examples/app_template_jupyterlab + cp examples/app_template_jupyterlab/tests/test_template_jupyterlab.py tests/tests_app_examples/test_template_jupyterlab.py + shell: bash + + - name: List pip dependency + shell: bash + run: pip list + + - name: Run the tests + env: + LAI_USER: ${{ secrets.LAI_USER }} + LAI_PASS: ${{ secrets.LAI_PASS }} + LIGHTNING_USER_ID: ${{ secrets.LIGHTNING_USER_ID }} + LIGHTNING_API_KEY: ${{ secrets.LIGHTNING_API_KEY }} + LIGHTNING_USERNAME: ${{ secrets.LIGHTNING_USERNAME }} + LIGHTNING_CLOUD_URL: ${{ secrets.LIGHTNING_CLOUD_URL }} + CLOUD: "1" + VIDEO_LOCATION: ./artifacts/videos + PR_NUMBER: ${{ steps.running.outputs.ID }} + TEST_APP_NAME: ${{ matrix.app_name }} + HAR_LOCATION: ./artifacts/hars + SLOW_MO: 50 + shell: bash + run: | + mkdir -p ${VIDEO_LOCATION} + HEADLESS=1 PACKAGE_LIGHTNING=1 python -m pytest tests/tests_app_examples/test_${{ matrix.app_name }}.py::test_${{ matrix.app_name }}_example_cloud --timeout=900 --capture=no -v --color=yes + # Delete the artifacts if successful + rm -r ${VIDEO_LOCATION}/${{ matrix.app_name }} + + - uses: actions/upload-artifact@v3 + if: ${{ always() }} + with: + name: test-artifacts + path: ./artifacts/videos + + - name: Clean Previous Apps + if: ${{ always() }} + env: + LAI_USER: ${{ secrets.LAI_USER }} + LAI_PASS: ${{ secrets.LAI_PASS }} + LIGHTNING_USER_ID: ${{ secrets.LIGHTNING_USER_ID }} + LIGHTNING_API_KEY: ${{ secrets.LIGHTNING_API_KEY }} + LIGHTNING_USERNAME: ${{ secrets.LIGHTNING_USERNAME }} + LIGHTNING_CLOUD_URL: ${{ secrets.LIGHTNING_CLOUD_URL }} + PR_NUMBER: ${{ steps.running.outputs.ID }} + TEST_APP_NAME: ${{ matrix.app_name }} + GRID_USER_ID: ${{ secrets.LIGHTNING_USER_ID }} + GRID_USER_KEY: ${{ secrets.LIGHTNING_API_KEY }} + GRID_URL: ${{ secrets.LIGHTNING_CLOUD_URL }} + _GRID_USERNAME: ${{ secrets.LIGHTNING_USERNAME }} + shell: bash + run: | + time python -c "from lightning.app import testing; testing.delete_cloud_lightning_apps()" From 7987a1b453150f6c9733db24ab0cdcf11e60dd9d Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 30 Aug 2022 14:12:48 +0200 Subject: [PATCH 014/193] update notebooks (#14340) --- _notebooks | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_notebooks b/_notebooks index 8a36a41548f34..6d5634b794218 160000 --- a/_notebooks +++ b/_notebooks @@ -1 +1 @@ -Subproject commit 8a36a41548f34c44ac455d515a72994487e85813 +Subproject commit 6d5634b7942180e6ba4a30bfbd74926d1c22f1eb From 208bf6faa87fca8d9015655f478c9048cba9840e Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 30 Aug 2022 15:25:05 +0200 Subject: [PATCH 015/193] prepare space for fused docs (#14160) * copy app conf * ci + req. 
* script symlink * wip * keep only App * add also PL * lightning * artifact --- .github/workflows/docs-checks.yml | 6 +- .gitignore | 3 + Makefile | 3 + docs/create-symlinks.py | 27 ++ .../code_samples/quickstart/app/app_0.py | 3 +- docs/source-lit/Makefile | 19 + docs/source-lit/_static/copybutton.js | 1 + .../images/accelerator/ipus/profiler.png | 1 + .../images/benchmarks/figure-parity-times.png | 1 + .../general/PTL101_youtube_thumbnail.jpg | 1 + .../_static/images/general/fast_2.gif | 1 + .../_static/images/general/pl_overview.gif | 1 + .../images/general/pl_overview_flat.jpg | 1 + .../pl_quick_start_full_compressed.gif | 1 + .../_static/images/general/tf_loss.jpg | 1 + .../_static/images/general/tf_tags.jpg | 1 + .../_static/images/general/tutorial_cover.jpg | 1 + docs/source-lit/_static/images/icon.svg | 1 + .../_static/images/lightning_lite/lite.gif | 1 + .../images/lightning_module/pt_to_pl.png | 1 + .../images/lightning_module/pt_trainer.png | 1 + docs/source-lit/_static/images/logo-large.svg | 1 + docs/source-lit/_static/images/logo-small.svg | 1 + docs/source-lit/_static/images/logo.png | 1 + docs/source-lit/_static/images/logo.svg | 1 + docs/source-lit/_static/images/logo_light.svg | 1 + .../images/mnist_imgs/mnist_cpu_bar.png | 1 + .../_static/images/mnist_imgs/mnist_gpu.png | 1 + .../_static/images/mnist_imgs/mnist_tb.png | 1 + .../_static/images/mnist_imgs/pt_to_pl.jpg | 1 + .../images/mnist_imgs/restart_runtime.png | 1 + .../_static/images/mnist_imgs/runtime_tpu.png | 1 + .../_static/images/mnist_imgs/tpu_fast.png | 1 + .../_static/images/mnist_imgs/tpu_start.png | 1 + .../_static/images/trainer/lr_finder.png | 1 + docs/source-lit/_static/main.css | 1 + .../_templates/autosummary/module.rst | 1 + docs/source-lit/_templates/classtemplate.rst | 1 + .../_templates/classtemplate_no_index.rst | 1 + docs/source-lit/_templates/layout.html | 1 + .../_templates/theme_variables.jinja | 1 + .../accelerators/accelerator_prepare.rst | 1 + docs/source-lit/accelerators/gpu.rst | 1 + docs/source-lit/accelerators/gpu_advanced.rst | 1 + docs/source-lit/accelerators/gpu_basic.rst | 1 + docs/source-lit/accelerators/gpu_expert.rst | 1 + docs/source-lit/accelerators/gpu_faq.rst | 1 + .../accelerators/gpu_intermediate.rst | 1 + docs/source-lit/accelerators/hpu.rst | 1 + docs/source-lit/accelerators/hpu_basic.rst | 1 + .../accelerators/hpu_intermediate.rst | 1 + docs/source-lit/accelerators/ipu.rst | 1 + docs/source-lit/accelerators/ipu_advanced.rst | 1 + docs/source-lit/accelerators/ipu_basic.rst | 1 + .../accelerators/ipu_intermediate.rst | 1 + docs/source-lit/accelerators/mps.rst | 1 + docs/source-lit/accelerators/mps_basic.rst | 1 + docs/source-lit/accelerators/tpu.rst | 1 + docs/source-lit/accelerators/tpu_advanced.rst | 1 + docs/source-lit/accelerators/tpu_basic.rst | 1 + docs/source-lit/accelerators/tpu_faq.rst | 1 + .../accelerators/tpu_intermediate.rst | 1 + docs/source-lit/advanced/finetuning.rst | 1 + docs/source-lit/advanced/model_parallel.rst | 1 + docs/source-lit/advanced/pretrained.rst | 1 + .../advanced/pruning_quantization.rst | 1 + .../source-lit/advanced/strategy_registry.rst | 1 + docs/source-lit/advanced/training_tricks.rst | 1 + .../source-lit/advanced/transfer_learning.rst | 1 + docs/source-lit/api_reference/components.rst | 1 + docs/source-lit/api_reference/core.rst | 1 + docs/source-lit/api_reference/frontend.rst | 1 + docs/source-lit/api_reference/runners.rst | 1 + docs/source-lit/api_reference/storage.rst | 1 + docs/source-lit/api_references.rst | 1 + 
docs/source-lit/basics.rst | 1 + docs/source-lit/benchmarking/benchmarks.rst | 1 + docs/source-lit/cli/lightning_cli.rst | 1 + .../source-lit/cli/lightning_cli_advanced.rst | 1 + .../cli/lightning_cli_advanced_2.rst | 1 + .../cli/lightning_cli_advanced_3.rst | 1 + docs/source-lit/cli/lightning_cli_expert.rst | 1 + docs/source-lit/cli/lightning_cli_faq.rst | 1 + .../cli/lightning_cli_intermediate.rst | 1 + .../cli/lightning_cli_intermediate_2.rst | 1 + docs/source-lit/clouds/cloud_training.rst | 1 + .../clouds/cloud_training_intermediate.rst | 1 + docs/source-lit/clouds/cluster.rst | 1 + docs/source-lit/clouds/cluster_advanced.rst | 1 + docs/source-lit/clouds/cluster_expert.rst | 1 + .../clouds/cluster_intermediate_1.rst | 1 + .../clouds/cluster_intermediate_2.rst | 1 + .../clouds/fault_tolerant_training.rst | 1 + .../clouds/fault_tolerant_training_basic.rst | 1 + .../clouds/fault_tolerant_training_expert.rst | 1 + .../clouds/fault_tolerant_training_faq.rst | 1 + docs/source-lit/clouds/grid_costs.rst | 1 + docs/source-lit/clouds/run_advanced.rst | 1 + docs/source-lit/clouds/run_basic.rst | 1 + docs/source-lit/clouds/run_expert.rst | 1 + docs/source-lit/clouds/run_intermediate.rst | 1 + docs/source-lit/clouds/session_basic.rst | 1 + .../clouds/session_intermediate.rst | 1 + docs/source-lit/code_samples/basics/0.py | 1 + docs/source-lit/code_samples/basics/1.py | 1 + .../code_samples/convert_pl_to_app/app.py | 1 + .../convert_pl_to_app/requirements.txt | 1 + .../code_samples/convert_pl_to_app/train.py | 1 + .../code_samples/quickstart/__init__.py | 1 + .../code_samples/quickstart/app/__init__.py | 1 + .../code_samples/quickstart/app/app_0.py | 1 + .../code_samples/quickstart/app/app_1.py | 1 + .../code_samples/quickstart/app_01.py | 1 + .../code_samples/quickstart/app_02.py | 1 + .../code_samples/quickstart/app_03.py | 1 + .../code_samples/quickstart/app_comp.py | 1 + .../quickstart/hello_world/app.py | 1 + .../quickstart/hello_world/app_ui.py | 1 + .../quickstart/hello_world/ui/index.html | 1 + docs/source-lit/common/checkpointing.rst | 1 + .../common/checkpointing_advanced.rst | 1 + .../source-lit/common/checkpointing_basic.rst | 1 + .../common/checkpointing_expert.rst | 1 + .../common/checkpointing_intermediate.rst | 1 + docs/source-lit/common/child_modules.rst | 1 + docs/source-lit/common/console_logs.rst | 1 + docs/source-lit/common/early_stopping.rst | 1 + docs/source-lit/common/evaluation.rst | 1 + docs/source-lit/common/evaluation_basic.rst | 1 + .../common/evaluation_intermediate.rst | 1 + .../common/gradient_accumulation.rst | 1 + docs/source-lit/common/hyperparameters.rst | 1 + docs/source-lit/common/lightning_module.rst | 1 + docs/source-lit/common/optimization.rst | 1 + docs/source-lit/common/precision.rst | 1 + docs/source-lit/common/precision_basic.rst | 1 + docs/source-lit/common/precision_expert.rst | 1 + .../common/precision_intermediate.rst | 1 + docs/source-lit/common/progress_bar.rst | 1 + docs/source-lit/common/remote_fs.rst | 1 + docs/source-lit/common/trainer.rst | 1 + docs/source-lit/common_usecases.rst | 1 + docs/source-lit/conf.py | 417 ++++++++++++++++++ docs/source-lit/contribute_app.rst | 1 + docs/source-lit/core_api/core_api.rst | 1 + docs/source-lit/core_api/lightning_app/app.py | 1 + .../core_api/lightning_app/communication.rst | 1 + .../lightning_app/communication_content.rst | 1 + .../core_api/lightning_app/dynamic_work.rst | 1 + .../lightning_app/dynamic_work_content.rst | 1 + .../core_api/lightning_app/index.rst | 1 + 
.../core_api/lightning_app/lightning_app.rst | 1 + docs/source-lit/core_api/lightning_flow.rst | 1 + .../core_api/lightning_work/compute.rst | 1 + .../lightning_work/compute_content.rst | 1 + .../lightning_work/handling_app_exception.rst | 1 + .../handling_app_exception_content.rst | 1 + .../core_api/lightning_work/index.rst | 1 + .../lightning_work/lightning_work.rst | 1 + .../core_api/lightning_work/payload.rst | 1 + .../lightning_work/payload_content.rst | 1 + .../core_api/lightning_work/status.rst | 1 + .../lightning_work/status_content.rst | 1 + docs/source-lit/data/datamodule.rst | 1 + docs/source-lit/debug/debugging.rst | 1 + docs/source-lit/debug/debugging_advanced.rst | 1 + docs/source-lit/debug/debugging_basic.rst | 1 + .../debug/debugging_intermediate.rst | 1 + docs/source-lit/deploy/production.rst | 1 + .../source-lit/deploy/production_advanced.rst | 1 + .../deploy/production_advanced_2.rst | 1 + docs/source-lit/deploy/production_basic.rst | 1 + .../deploy/production_intermediate.rst | 1 + docs/source-lit/ecosystem/asr_nlp_tts.rst | 1 + docs/source-lit/ecosystem/bolts.rst | 1 + .../ecosystem/community_examples.rst | 1 + docs/source-lit/ecosystem/ecosystem-ci.rst | 1 + docs/source-lit/ecosystem/flash.rst | 1 + docs/source-lit/ecosystem/metrics.rst | 1 + docs/source-lit/ecosystem/transformers.rst | 1 + docs/source-lit/examples/dag/dag.rst | 1 + .../examples/dag/dag_from_scratch.rst | 1 + docs/source-lit/examples/data_explore_app.rst | 1 + docs/source-lit/examples/etl_app.rst | 1 + docs/source-lit/examples/file_server/app.py | 1 + .../examples/file_server/file_server.rst | 1 + .../file_server/file_server_content.rst | 1 + .../file_server/file_server_step_1.rst | 1 + .../file_server/file_server_step_2.rst | 1 + .../file_server/file_server_step_3.rst | 1 + .../file_server/file_server_step_4.rst | 1 + .../examples/github_repo_runner/app.py | 1 + .../github_repo_runner/github_repo_runner.rst | 1 + .../github_repo_runner_content.rst | 1 + .../github_repo_runner_step_1.rst | 1 + .../github_repo_runner_step_2.rst | 1 + .../github_repo_runner_step_3.rst | 1 + .../github_repo_runner_step_4.rst | 1 + .../github_repo_runner_step_5.rst | 1 + docs/source-lit/examples/hands_on_example.rst | 1 + .../examples/hpo/build_from_scratch.rst | 1 + docs/source-lit/examples/hpo/hpo.py | 1 + docs/source-lit/examples/hpo/hpo.rst | 1 + docs/source-lit/examples/hpo/hpo_wi.rst | 1 + docs/source-lit/examples/hpo/hpo_wo.rst | 1 + .../source-lit/examples/hpo/lightning_hpo.rst | 1 + .../examples/hpo/lightning_hpo_target.py | 1 + docs/source-lit/examples/hpo/objective.py | 1 + .../examples/hpo/optuna_reference.py | 1 + .../examples/model_server_app/app.py | 1 + .../model_server_app/load_testing.rst | 1 + .../model_server_app/locust_component.py | 1 + .../examples/model_server_app/locustfile.py | 1 + .../examples/model_server_app/model_server.py | 1 + .../model_server_app/model_server.rst | 1 + .../model_server_app/model_server_app.rst | 1 + .../model_server_app_content.rst | 1 + .../putting_everything_together.rst | 1 + .../examples/model_server_app/train.py | 1 + .../examples/model_server_app/train.rst | 1 + .../source-lit/examples/research_demo_app.rst | 1 + docs/source-lit/expertise_levels.rst | 1 + docs/source-lit/extensions/accelerator.rst | 1 + docs/source-lit/extensions/callbacks.rst | 1 + .../source-lit/extensions/callbacks_state.rst | 1 + .../extensions/datamodules_state.rst | 1 + docs/source-lit/extensions/entry_points.rst | 1 + docs/source-lit/extensions/logging.rst | 1 + 
docs/source-lit/extensions/loops.rst | 1 + docs/source-lit/extensions/loops_advanced.rst | 1 + docs/source-lit/extensions/plugins.rst | 1 + docs/source-lit/extensions/strategy.rst | 1 + .../get_started/add_an_interactive_demo.rst | 1 + docs/source-lit/get_started/build_model.rst | 1 + .../get_started/go_beyond_training.rst | 1 + .../go_beyond_training_content.rst | 1 + .../jumpstart_from_app_gallery.rst | 1 + .../jumpstart_from_component_gallery.rst | 1 + .../get_started/lightning_apps_intro.rst | 1 + .../get_started/training_with_apps.rst | 1 + .../get_started/what_app_can_do.rst | 1 + docs/source-lit/glossary/app_tree.rst | 1 + .../glossary/build_config/build_config.rst | 1 + .../build_config/build_config_advanced.rst | 1 + .../build_config/build_config_basic.rst | 1 + .../build_config_intermediate.rst | 1 + docs/source-lit/glossary/dag.rst | 1 + docs/source-lit/glossary/debug_app.rst | 1 + docs/source-lit/glossary/distributed_fe.rst | 1 + .../glossary/distributed_hardware.rst | 1 + .../glossary/environment_variables.rst | 1 + docs/source-lit/glossary/event_loop.rst | 1 + docs/source-lit/glossary/fault_tolerance.rst | 1 + docs/source-lit/glossary/index.rst | 1 + .../glossary/lightning_app_overview/index.rst | 1 + docs/source-lit/glossary/scheduling.rst | 1 + .../glossary/sharing_components.rst | 1 + .../glossary/storage/differences.rst | 1 + docs/source-lit/glossary/storage/drive.rst | 1 + .../glossary/storage/drive_content.rst | 1 + docs/source-lit/glossary/storage/path.rst | 1 + docs/source-lit/glossary/storage/storage.rst | 1 + docs/source-lit/governance.rst | 1 + docs/source-lit/guides/data.rst | 1 + docs/source-lit/guides/speed.rst | 1 + docs/source-lit/index.rst | 262 +++++++++++ docs/source-lit/install_beginner.rst | 1 + docs/source-lit/installation.rst | 1 + docs/source-lit/installation_mac.rst | 1 + docs/source-lit/installation_win.rst | 1 + docs/source-lit/intro.rst | 1 + docs/source-lit/levels/advanced.rst | 1 + docs/source-lit/levels/advanced/index.rst | 1 + docs/source-lit/levels/advanced/level_16.rst | 1 + docs/source-lit/levels/advanced/level_17.rst | 1 + docs/source-lit/levels/advanced/level_18.rst | 1 + docs/source-lit/levels/advanced/level_19.rst | 1 + docs/source-lit/levels/advanced/level_20.rst | 1 + docs/source-lit/levels/advanced_level_15.rst | 1 + docs/source-lit/levels/advanced_level_16.rst | 1 + docs/source-lit/levels/advanced_level_17.rst | 1 + docs/source-lit/levels/advanced_level_18.rst | 1 + docs/source-lit/levels/advanced_level_19.rst | 1 + docs/source-lit/levels/advanced_level_20.rst | 1 + docs/source-lit/levels/advanced_level_21.rst | 1 + docs/source-lit/levels/advanced_level_22.rst | 1 + docs/source-lit/levels/basic/index.rst | 1 + docs/source-lit/levels/basic/level_1.rst | 1 + docs/source-lit/levels/basic/level_2.rst | 1 + docs/source-lit/levels/basic/level_3.rst | 1 + docs/source-lit/levels/basic/level_4.rst | 1 + docs/source-lit/levels/basic/level_5.rst | 1 + docs/source-lit/levels/basic/level_6.rst | 1 + docs/source-lit/levels/basic/level_7.rst | 1 + docs/source-lit/levels/basic_level_2.rst | 1 + docs/source-lit/levels/basic_level_5.rst | 1 + docs/source-lit/levels/core_level_3.rst | 1 + docs/source-lit/levels/core_level_6.rst | 1 + docs/source-lit/levels/core_skills.rst | 1 + docs/source-lit/levels/expert.rst | 1 + docs/source-lit/levels/expert_level_23.rst | 1 + docs/source-lit/levels/expert_level_24.rst | 1 + docs/source-lit/levels/expert_level_27.rst | 1 + docs/source-lit/levels/intermediate.rst | 1 + docs/source-lit/levels/intermediate/index.rst 
| 1 + .../levels/intermediate/level_10.rst | 1 + .../levels/intermediate/level_11.rst | 1 + .../levels/intermediate/level_12.rst | 1 + .../levels/intermediate/level_13.rst | 1 + .../levels/intermediate/level_14.rst | 1 + .../levels/intermediate/level_15.rst | 1 + .../levels/intermediate/level_8.rst | 1 + .../levels/intermediate/level_9.rst | 1 + .../levels/intermediate_level_10.rst | 1 + .../levels/intermediate_level_11.rst | 1 + .../levels/intermediate_level_12.rst | 1 + .../levels/intermediate_level_13.rst | 1 + .../levels/intermediate_level_14.rst | 1 + .../levels/intermediate_level_7.rst | 1 + .../levels/intermediate_level_8.rst | 1 + .../levels/intermediate_level_9.rst | 1 + docs/source-lit/links.rst | 1 + docs/source-lit/make.bat | 35 ++ docs/source-lit/model/build_model.rst | 1 + .../source-lit/model/build_model_advanced.rst | 1 + docs/source-lit/model/build_model_expert.rst | 1 + .../model/build_model_intermediate.rst | 1 + docs/source-lit/model/manual_optimization.rst | 1 + docs/source-lit/model/own_your_loop.rst | 1 + docs/source-lit/model/train_model_basic.rst | 1 + docs/source-lit/moving_to_the_cloud.rst | 1 + docs/source-lit/quickstart.rst | 1 + docs/source-lit/starter/converting.rst | 1 + docs/source-lit/starter/installation.rst | 1 + docs/source-lit/starter/installation_mac.rst | 1 + docs/source-lit/starter/introduction.rst | 1 + docs/source-lit/starter/lightning_lite.rst | 1 + docs/source-lit/starter/style_guide.rst | 1 + docs/source-lit/strategies/hivemind.rst | 1 + docs/source-lit/strategies/hivemind_basic.rst | 1 + .../source-lit/strategies/hivemind_expert.rst | 1 + .../strategies/hivemind_intermediate.rst | 1 + docs/source-lit/testing.rst | 1 + docs/source-lit/tuning/profiler.rst | 1 + docs/source-lit/tuning/profiler_advanced.rst | 1 + docs/source-lit/tuning/profiler_basic.rst | 1 + docs/source-lit/tuning/profiler_expert.rst | 1 + .../tuning/profiler_intermediate.rst | 1 + docs/source-lit/ui_and_frontends.rst | 1 + .../visualize/experiment_managers.rst | 1 + docs/source-lit/visualize/loggers.rst | 1 + .../source-lit/visualize/logging_advanced.rst | 1 + docs/source-lit/visualize/logging_basic.rst | 1 + docs/source-lit/visualize/logging_expert.rst | 1 + .../visualize/logging_intermediate.rst | 1 + .../visualize/supported_exp_managers.rst | 1 + .../access_app_state/access_app_state.rst | 1 + .../workflows/add_components/index.rst | 1 + .../workflows/add_server/any_server.rst | 1 + .../workflows/add_server/flask_basic.rst | 1 + .../source-lit/workflows/add_server/index.rst | 1 + .../workflows/add_server/index_content.rst | 1 + docs/source-lit/workflows/add_web_link.rst | 1 + .../add_web_ui/angular_js_intermediate.rst | 1 + .../workflows/add_web_ui/dash/basic.rst | 1 + .../workflows/add_web_ui/dash/index.rst | 1 + .../add_web_ui/dash/intermediate.rst | 1 + .../add_web_ui/dash/intermediate_plot.py | 1 + .../add_web_ui/dash/intermediate_state.py | 1 + .../workflows/add_web_ui/example_app.rst | 1 + .../add_web_ui/glossary_front_end.rst | 1 + .../workflows/add_web_ui/glossary_ui.rst | 1 + .../workflows/add_web_ui/gradio/basic.rst | 1 + .../workflows/add_web_ui/gradio/index.rst | 1 + .../add_web_ui/gradio/intermediate.rst | 1 + .../workflows/add_web_ui/html/basic.rst | 1 + .../workflows/add_web_ui/html/index.rst | 1 + .../add_web_ui/html/intermediate.rst | 1 + .../source-lit/workflows/add_web_ui/index.rst | 1 + .../workflows/add_web_ui/index_content.rst | 1 + .../integrate_any_javascript_framework.rst | 1 + .../workflows/add_web_ui/jupyter_basic.rst | 1 + 
...ommunicate_between_react_and_lightning.rst | 1 + .../react/connect_react_and_lightning.rst | 1 + .../react/create_react_template.rst | 1 + .../workflows/add_web_ui/react/index.rst | 1 + .../react/react_development_workflow.rst | 1 + .../workflows/add_web_ui/streamlit/basic.rst | 1 + .../workflows/add_web_ui/streamlit/index.rst | 1 + .../add_web_ui/streamlit/intermediate.rst | 1 + .../add_web_ui/vue_js_intermediate.rst | 1 + .../arrange_tabs/arrange_app_basic.rst | 1 + .../arrange_tabs/arrange_app_intermediate.rst | 1 + .../workflows/arrange_tabs/index.rst | 1 + .../workflows/arrange_tabs/index_content.rst | 1 + .../from_pytorch_lightning_script.rst | 1 + .../build_lightning_app/from_scratch.rst | 1 + .../from_scratch_content.rst | 1 + .../workflows/build_lightning_app/index.rst | 1 + .../build_lightning_app/index_content.rst | 1 + .../build_lightning_component/basic.rst | 1 + .../from_scratch_component_content.rst | 1 + .../build_lightning_component/index.rst | 1 + .../index_content.rst | 1 + .../intermediate.rst | 1 + .../publish_a_component.rst | 1 + docs/source-lit/workflows/byoc/index.rst | 1 + docs/source-lit/workflows/debug_locally.rst | 1 + .../workflows/enable_fault_tolerance.rst | 1 + docs/source-lit/workflows/extend_app.rst | 1 + docs/source-lit/workflows/index.rst | 1 + .../run_app_on_cloud/cloud_files.rst | 1 + .../workflows/run_app_on_cloud/index.rst | 1 + .../run_app_on_cloud/index_content.rst | 1 + .../run_app_on_cloud/lightning_cloud.rst | 1 + .../workflows/run_app_on_cloud/on_prem.rst | 1 + .../run_app_on_cloud/on_your_own_machine.rst | 1 + docs/source-lit/workflows/run_app_snippet.rst | 1 + .../run_components_on_different_hardware.rst | 1 + .../workflows/run_on_private_cloud.rst | 1 + .../workflows/run_work_in_parallel.rst | 1 + .../run_work_in_parallel_content.rst | 1 + docs/source-lit/workflows/run_work_once.rst | 1 + .../workflows/run_work_once_content.rst | 1 + docs/source-lit/workflows/schedule_apps.rst | 1 + docs/source-lit/workflows/share_app.rst | 1 + .../share_files_between_components.rst | 1 + .../share_files_between_components/app.py | 1 + docs/source-lit/workflows/test_an_app.rst | 1 + requirements/lit/base.txt | 1 + requirements/lit/devel.txt | 1 + requirements/lit/docs.txt | 9 + setup.py | 4 +- 433 files changed, 1204 insertions(+), 6 deletions(-) create mode 100644 docs/create-symlinks.py create mode 100644 docs/source-lit/Makefile create mode 120000 docs/source-lit/_static/copybutton.js create mode 120000 docs/source-lit/_static/images/accelerator/ipus/profiler.png create mode 120000 docs/source-lit/_static/images/benchmarks/figure-parity-times.png create mode 120000 docs/source-lit/_static/images/general/PTL101_youtube_thumbnail.jpg create mode 120000 docs/source-lit/_static/images/general/fast_2.gif create mode 120000 docs/source-lit/_static/images/general/pl_overview.gif create mode 120000 docs/source-lit/_static/images/general/pl_overview_flat.jpg create mode 120000 docs/source-lit/_static/images/general/pl_quick_start_full_compressed.gif create mode 120000 docs/source-lit/_static/images/general/tf_loss.jpg create mode 120000 docs/source-lit/_static/images/general/tf_tags.jpg create mode 120000 docs/source-lit/_static/images/general/tutorial_cover.jpg create mode 120000 docs/source-lit/_static/images/icon.svg create mode 120000 docs/source-lit/_static/images/lightning_lite/lite.gif create mode 120000 docs/source-lit/_static/images/lightning_module/pt_to_pl.png create mode 120000 docs/source-lit/_static/images/lightning_module/pt_trainer.png create 
mode 120000 docs/source-lit/_static/images/logo-large.svg create mode 120000 docs/source-lit/_static/images/logo-small.svg create mode 120000 docs/source-lit/_static/images/logo.png create mode 120000 docs/source-lit/_static/images/logo.svg create mode 120000 docs/source-lit/_static/images/logo_light.svg create mode 120000 docs/source-lit/_static/images/mnist_imgs/mnist_cpu_bar.png create mode 120000 docs/source-lit/_static/images/mnist_imgs/mnist_gpu.png create mode 120000 docs/source-lit/_static/images/mnist_imgs/mnist_tb.png create mode 120000 docs/source-lit/_static/images/mnist_imgs/pt_to_pl.jpg create mode 120000 docs/source-lit/_static/images/mnist_imgs/restart_runtime.png create mode 120000 docs/source-lit/_static/images/mnist_imgs/runtime_tpu.png create mode 120000 docs/source-lit/_static/images/mnist_imgs/tpu_fast.png create mode 120000 docs/source-lit/_static/images/mnist_imgs/tpu_start.png create mode 120000 docs/source-lit/_static/images/trainer/lr_finder.png create mode 120000 docs/source-lit/_static/main.css create mode 120000 docs/source-lit/_templates/autosummary/module.rst create mode 120000 docs/source-lit/_templates/classtemplate.rst create mode 120000 docs/source-lit/_templates/classtemplate_no_index.rst create mode 120000 docs/source-lit/_templates/layout.html create mode 120000 docs/source-lit/_templates/theme_variables.jinja create mode 120000 docs/source-lit/accelerators/accelerator_prepare.rst create mode 120000 docs/source-lit/accelerators/gpu.rst create mode 120000 docs/source-lit/accelerators/gpu_advanced.rst create mode 120000 docs/source-lit/accelerators/gpu_basic.rst create mode 120000 docs/source-lit/accelerators/gpu_expert.rst create mode 120000 docs/source-lit/accelerators/gpu_faq.rst create mode 120000 docs/source-lit/accelerators/gpu_intermediate.rst create mode 120000 docs/source-lit/accelerators/hpu.rst create mode 120000 docs/source-lit/accelerators/hpu_basic.rst create mode 120000 docs/source-lit/accelerators/hpu_intermediate.rst create mode 120000 docs/source-lit/accelerators/ipu.rst create mode 120000 docs/source-lit/accelerators/ipu_advanced.rst create mode 120000 docs/source-lit/accelerators/ipu_basic.rst create mode 120000 docs/source-lit/accelerators/ipu_intermediate.rst create mode 120000 docs/source-lit/accelerators/mps.rst create mode 120000 docs/source-lit/accelerators/mps_basic.rst create mode 120000 docs/source-lit/accelerators/tpu.rst create mode 120000 docs/source-lit/accelerators/tpu_advanced.rst create mode 120000 docs/source-lit/accelerators/tpu_basic.rst create mode 120000 docs/source-lit/accelerators/tpu_faq.rst create mode 120000 docs/source-lit/accelerators/tpu_intermediate.rst create mode 120000 docs/source-lit/advanced/finetuning.rst create mode 120000 docs/source-lit/advanced/model_parallel.rst create mode 120000 docs/source-lit/advanced/pretrained.rst create mode 120000 docs/source-lit/advanced/pruning_quantization.rst create mode 120000 docs/source-lit/advanced/strategy_registry.rst create mode 120000 docs/source-lit/advanced/training_tricks.rst create mode 120000 docs/source-lit/advanced/transfer_learning.rst create mode 120000 docs/source-lit/api_reference/components.rst create mode 120000 docs/source-lit/api_reference/core.rst create mode 120000 docs/source-lit/api_reference/frontend.rst create mode 120000 docs/source-lit/api_reference/runners.rst create mode 120000 docs/source-lit/api_reference/storage.rst create mode 120000 docs/source-lit/api_references.rst create mode 120000 docs/source-lit/basics.rst create mode 
120000 docs/source-lit/benchmarking/benchmarks.rst create mode 120000 docs/source-lit/cli/lightning_cli.rst create mode 120000 docs/source-lit/cli/lightning_cli_advanced.rst create mode 120000 docs/source-lit/cli/lightning_cli_advanced_2.rst create mode 120000 docs/source-lit/cli/lightning_cli_advanced_3.rst create mode 120000 docs/source-lit/cli/lightning_cli_expert.rst create mode 120000 docs/source-lit/cli/lightning_cli_faq.rst create mode 120000 docs/source-lit/cli/lightning_cli_intermediate.rst create mode 120000 docs/source-lit/cli/lightning_cli_intermediate_2.rst create mode 120000 docs/source-lit/clouds/cloud_training.rst create mode 120000 docs/source-lit/clouds/cloud_training_intermediate.rst create mode 120000 docs/source-lit/clouds/cluster.rst create mode 120000 docs/source-lit/clouds/cluster_advanced.rst create mode 120000 docs/source-lit/clouds/cluster_expert.rst create mode 120000 docs/source-lit/clouds/cluster_intermediate_1.rst create mode 120000 docs/source-lit/clouds/cluster_intermediate_2.rst create mode 120000 docs/source-lit/clouds/fault_tolerant_training.rst create mode 120000 docs/source-lit/clouds/fault_tolerant_training_basic.rst create mode 120000 docs/source-lit/clouds/fault_tolerant_training_expert.rst create mode 120000 docs/source-lit/clouds/fault_tolerant_training_faq.rst create mode 120000 docs/source-lit/clouds/grid_costs.rst create mode 120000 docs/source-lit/clouds/run_advanced.rst create mode 120000 docs/source-lit/clouds/run_basic.rst create mode 120000 docs/source-lit/clouds/run_expert.rst create mode 120000 docs/source-lit/clouds/run_intermediate.rst create mode 120000 docs/source-lit/clouds/session_basic.rst create mode 120000 docs/source-lit/clouds/session_intermediate.rst create mode 120000 docs/source-lit/code_samples/basics/0.py create mode 120000 docs/source-lit/code_samples/basics/1.py create mode 120000 docs/source-lit/code_samples/convert_pl_to_app/app.py create mode 120000 docs/source-lit/code_samples/convert_pl_to_app/requirements.txt create mode 120000 docs/source-lit/code_samples/convert_pl_to_app/train.py create mode 120000 docs/source-lit/code_samples/quickstart/__init__.py create mode 120000 docs/source-lit/code_samples/quickstart/app/__init__.py create mode 120000 docs/source-lit/code_samples/quickstart/app/app_0.py create mode 120000 docs/source-lit/code_samples/quickstart/app/app_1.py create mode 120000 docs/source-lit/code_samples/quickstart/app_01.py create mode 120000 docs/source-lit/code_samples/quickstart/app_02.py create mode 120000 docs/source-lit/code_samples/quickstart/app_03.py create mode 120000 docs/source-lit/code_samples/quickstart/app_comp.py create mode 120000 docs/source-lit/code_samples/quickstart/hello_world/app.py create mode 120000 docs/source-lit/code_samples/quickstart/hello_world/app_ui.py create mode 120000 docs/source-lit/code_samples/quickstart/hello_world/ui/index.html create mode 120000 docs/source-lit/common/checkpointing.rst create mode 120000 docs/source-lit/common/checkpointing_advanced.rst create mode 120000 docs/source-lit/common/checkpointing_basic.rst create mode 120000 docs/source-lit/common/checkpointing_expert.rst create mode 120000 docs/source-lit/common/checkpointing_intermediate.rst create mode 120000 docs/source-lit/common/child_modules.rst create mode 120000 docs/source-lit/common/console_logs.rst create mode 120000 docs/source-lit/common/early_stopping.rst create mode 120000 docs/source-lit/common/evaluation.rst create mode 120000 docs/source-lit/common/evaluation_basic.rst create mode 
120000 docs/source-lit/common/evaluation_intermediate.rst create mode 120000 docs/source-lit/common/gradient_accumulation.rst create mode 120000 docs/source-lit/common/hyperparameters.rst create mode 120000 docs/source-lit/common/lightning_module.rst create mode 120000 docs/source-lit/common/optimization.rst create mode 120000 docs/source-lit/common/precision.rst create mode 120000 docs/source-lit/common/precision_basic.rst create mode 120000 docs/source-lit/common/precision_expert.rst create mode 120000 docs/source-lit/common/precision_intermediate.rst create mode 120000 docs/source-lit/common/progress_bar.rst create mode 120000 docs/source-lit/common/remote_fs.rst create mode 120000 docs/source-lit/common/trainer.rst create mode 120000 docs/source-lit/common_usecases.rst create mode 100644 docs/source-lit/conf.py create mode 120000 docs/source-lit/contribute_app.rst create mode 120000 docs/source-lit/core_api/core_api.rst create mode 120000 docs/source-lit/core_api/lightning_app/app.py create mode 120000 docs/source-lit/core_api/lightning_app/communication.rst create mode 120000 docs/source-lit/core_api/lightning_app/communication_content.rst create mode 120000 docs/source-lit/core_api/lightning_app/dynamic_work.rst create mode 120000 docs/source-lit/core_api/lightning_app/dynamic_work_content.rst create mode 120000 docs/source-lit/core_api/lightning_app/index.rst create mode 120000 docs/source-lit/core_api/lightning_app/lightning_app.rst create mode 120000 docs/source-lit/core_api/lightning_flow.rst create mode 120000 docs/source-lit/core_api/lightning_work/compute.rst create mode 120000 docs/source-lit/core_api/lightning_work/compute_content.rst create mode 120000 docs/source-lit/core_api/lightning_work/handling_app_exception.rst create mode 120000 docs/source-lit/core_api/lightning_work/handling_app_exception_content.rst create mode 120000 docs/source-lit/core_api/lightning_work/index.rst create mode 120000 docs/source-lit/core_api/lightning_work/lightning_work.rst create mode 120000 docs/source-lit/core_api/lightning_work/payload.rst create mode 120000 docs/source-lit/core_api/lightning_work/payload_content.rst create mode 120000 docs/source-lit/core_api/lightning_work/status.rst create mode 120000 docs/source-lit/core_api/lightning_work/status_content.rst create mode 120000 docs/source-lit/data/datamodule.rst create mode 120000 docs/source-lit/debug/debugging.rst create mode 120000 docs/source-lit/debug/debugging_advanced.rst create mode 120000 docs/source-lit/debug/debugging_basic.rst create mode 120000 docs/source-lit/debug/debugging_intermediate.rst create mode 120000 docs/source-lit/deploy/production.rst create mode 120000 docs/source-lit/deploy/production_advanced.rst create mode 120000 docs/source-lit/deploy/production_advanced_2.rst create mode 120000 docs/source-lit/deploy/production_basic.rst create mode 120000 docs/source-lit/deploy/production_intermediate.rst create mode 120000 docs/source-lit/ecosystem/asr_nlp_tts.rst create mode 120000 docs/source-lit/ecosystem/bolts.rst create mode 120000 docs/source-lit/ecosystem/community_examples.rst create mode 120000 docs/source-lit/ecosystem/ecosystem-ci.rst create mode 120000 docs/source-lit/ecosystem/flash.rst create mode 120000 docs/source-lit/ecosystem/metrics.rst create mode 120000 docs/source-lit/ecosystem/transformers.rst create mode 120000 docs/source-lit/examples/dag/dag.rst create mode 120000 docs/source-lit/examples/dag/dag_from_scratch.rst create mode 120000 docs/source-lit/examples/data_explore_app.rst create mode 
120000 docs/source-lit/examples/etl_app.rst create mode 120000 docs/source-lit/examples/file_server/app.py create mode 120000 docs/source-lit/examples/file_server/file_server.rst create mode 120000 docs/source-lit/examples/file_server/file_server_content.rst create mode 120000 docs/source-lit/examples/file_server/file_server_step_1.rst create mode 120000 docs/source-lit/examples/file_server/file_server_step_2.rst create mode 120000 docs/source-lit/examples/file_server/file_server_step_3.rst create mode 120000 docs/source-lit/examples/file_server/file_server_step_4.rst create mode 120000 docs/source-lit/examples/github_repo_runner/app.py create mode 120000 docs/source-lit/examples/github_repo_runner/github_repo_runner.rst create mode 120000 docs/source-lit/examples/github_repo_runner/github_repo_runner_content.rst create mode 120000 docs/source-lit/examples/github_repo_runner/github_repo_runner_step_1.rst create mode 120000 docs/source-lit/examples/github_repo_runner/github_repo_runner_step_2.rst create mode 120000 docs/source-lit/examples/github_repo_runner/github_repo_runner_step_3.rst create mode 120000 docs/source-lit/examples/github_repo_runner/github_repo_runner_step_4.rst create mode 120000 docs/source-lit/examples/github_repo_runner/github_repo_runner_step_5.rst create mode 120000 docs/source-lit/examples/hands_on_example.rst create mode 120000 docs/source-lit/examples/hpo/build_from_scratch.rst create mode 120000 docs/source-lit/examples/hpo/hpo.py create mode 120000 docs/source-lit/examples/hpo/hpo.rst create mode 120000 docs/source-lit/examples/hpo/hpo_wi.rst create mode 120000 docs/source-lit/examples/hpo/hpo_wo.rst create mode 120000 docs/source-lit/examples/hpo/lightning_hpo.rst create mode 120000 docs/source-lit/examples/hpo/lightning_hpo_target.py create mode 120000 docs/source-lit/examples/hpo/objective.py create mode 120000 docs/source-lit/examples/hpo/optuna_reference.py create mode 120000 docs/source-lit/examples/model_server_app/app.py create mode 120000 docs/source-lit/examples/model_server_app/load_testing.rst create mode 120000 docs/source-lit/examples/model_server_app/locust_component.py create mode 120000 docs/source-lit/examples/model_server_app/locustfile.py create mode 120000 docs/source-lit/examples/model_server_app/model_server.py create mode 120000 docs/source-lit/examples/model_server_app/model_server.rst create mode 120000 docs/source-lit/examples/model_server_app/model_server_app.rst create mode 120000 docs/source-lit/examples/model_server_app/model_server_app_content.rst create mode 120000 docs/source-lit/examples/model_server_app/putting_everything_together.rst create mode 120000 docs/source-lit/examples/model_server_app/train.py create mode 120000 docs/source-lit/examples/model_server_app/train.rst create mode 120000 docs/source-lit/examples/research_demo_app.rst create mode 120000 docs/source-lit/expertise_levels.rst create mode 120000 docs/source-lit/extensions/accelerator.rst create mode 120000 docs/source-lit/extensions/callbacks.rst create mode 120000 docs/source-lit/extensions/callbacks_state.rst create mode 120000 docs/source-lit/extensions/datamodules_state.rst create mode 120000 docs/source-lit/extensions/entry_points.rst create mode 120000 docs/source-lit/extensions/logging.rst create mode 120000 docs/source-lit/extensions/loops.rst create mode 120000 docs/source-lit/extensions/loops_advanced.rst create mode 120000 docs/source-lit/extensions/plugins.rst create mode 120000 docs/source-lit/extensions/strategy.rst create mode 120000 
docs/source-lit/get_started/add_an_interactive_demo.rst create mode 120000 docs/source-lit/get_started/build_model.rst create mode 120000 docs/source-lit/get_started/go_beyond_training.rst create mode 120000 docs/source-lit/get_started/go_beyond_training_content.rst create mode 120000 docs/source-lit/get_started/jumpstart_from_app_gallery.rst create mode 120000 docs/source-lit/get_started/jumpstart_from_component_gallery.rst create mode 120000 docs/source-lit/get_started/lightning_apps_intro.rst create mode 120000 docs/source-lit/get_started/training_with_apps.rst create mode 120000 docs/source-lit/get_started/what_app_can_do.rst create mode 120000 docs/source-lit/glossary/app_tree.rst create mode 120000 docs/source-lit/glossary/build_config/build_config.rst create mode 120000 docs/source-lit/glossary/build_config/build_config_advanced.rst create mode 120000 docs/source-lit/glossary/build_config/build_config_basic.rst create mode 120000 docs/source-lit/glossary/build_config/build_config_intermediate.rst create mode 120000 docs/source-lit/glossary/dag.rst create mode 120000 docs/source-lit/glossary/debug_app.rst create mode 120000 docs/source-lit/glossary/distributed_fe.rst create mode 120000 docs/source-lit/glossary/distributed_hardware.rst create mode 120000 docs/source-lit/glossary/environment_variables.rst create mode 120000 docs/source-lit/glossary/event_loop.rst create mode 120000 docs/source-lit/glossary/fault_tolerance.rst create mode 120000 docs/source-lit/glossary/index.rst create mode 120000 docs/source-lit/glossary/lightning_app_overview/index.rst create mode 120000 docs/source-lit/glossary/scheduling.rst create mode 120000 docs/source-lit/glossary/sharing_components.rst create mode 120000 docs/source-lit/glossary/storage/differences.rst create mode 120000 docs/source-lit/glossary/storage/drive.rst create mode 120000 docs/source-lit/glossary/storage/drive_content.rst create mode 120000 docs/source-lit/glossary/storage/path.rst create mode 120000 docs/source-lit/glossary/storage/storage.rst create mode 120000 docs/source-lit/governance.rst create mode 120000 docs/source-lit/guides/data.rst create mode 120000 docs/source-lit/guides/speed.rst create mode 100644 docs/source-lit/index.rst create mode 120000 docs/source-lit/install_beginner.rst create mode 120000 docs/source-lit/installation.rst create mode 120000 docs/source-lit/installation_mac.rst create mode 120000 docs/source-lit/installation_win.rst create mode 120000 docs/source-lit/intro.rst create mode 120000 docs/source-lit/levels/advanced.rst create mode 120000 docs/source-lit/levels/advanced/index.rst create mode 120000 docs/source-lit/levels/advanced/level_16.rst create mode 120000 docs/source-lit/levels/advanced/level_17.rst create mode 120000 docs/source-lit/levels/advanced/level_18.rst create mode 120000 docs/source-lit/levels/advanced/level_19.rst create mode 120000 docs/source-lit/levels/advanced/level_20.rst create mode 120000 docs/source-lit/levels/advanced_level_15.rst create mode 120000 docs/source-lit/levels/advanced_level_16.rst create mode 120000 docs/source-lit/levels/advanced_level_17.rst create mode 120000 docs/source-lit/levels/advanced_level_18.rst create mode 120000 docs/source-lit/levels/advanced_level_19.rst create mode 120000 docs/source-lit/levels/advanced_level_20.rst create mode 120000 docs/source-lit/levels/advanced_level_21.rst create mode 120000 docs/source-lit/levels/advanced_level_22.rst create mode 120000 docs/source-lit/levels/basic/index.rst create mode 120000 
docs/source-lit/levels/basic/level_1.rst create mode 120000 docs/source-lit/levels/basic/level_2.rst create mode 120000 docs/source-lit/levels/basic/level_3.rst create mode 120000 docs/source-lit/levels/basic/level_4.rst create mode 120000 docs/source-lit/levels/basic/level_5.rst create mode 120000 docs/source-lit/levels/basic/level_6.rst create mode 120000 docs/source-lit/levels/basic/level_7.rst create mode 120000 docs/source-lit/levels/basic_level_2.rst create mode 120000 docs/source-lit/levels/basic_level_5.rst create mode 120000 docs/source-lit/levels/core_level_3.rst create mode 120000 docs/source-lit/levels/core_level_6.rst create mode 120000 docs/source-lit/levels/core_skills.rst create mode 120000 docs/source-lit/levels/expert.rst create mode 120000 docs/source-lit/levels/expert_level_23.rst create mode 120000 docs/source-lit/levels/expert_level_24.rst create mode 120000 docs/source-lit/levels/expert_level_27.rst create mode 120000 docs/source-lit/levels/intermediate.rst create mode 120000 docs/source-lit/levels/intermediate/index.rst create mode 120000 docs/source-lit/levels/intermediate/level_10.rst create mode 120000 docs/source-lit/levels/intermediate/level_11.rst create mode 120000 docs/source-lit/levels/intermediate/level_12.rst create mode 120000 docs/source-lit/levels/intermediate/level_13.rst create mode 120000 docs/source-lit/levels/intermediate/level_14.rst create mode 120000 docs/source-lit/levels/intermediate/level_15.rst create mode 120000 docs/source-lit/levels/intermediate/level_8.rst create mode 120000 docs/source-lit/levels/intermediate/level_9.rst create mode 120000 docs/source-lit/levels/intermediate_level_10.rst create mode 120000 docs/source-lit/levels/intermediate_level_11.rst create mode 120000 docs/source-lit/levels/intermediate_level_12.rst create mode 120000 docs/source-lit/levels/intermediate_level_13.rst create mode 120000 docs/source-lit/levels/intermediate_level_14.rst create mode 120000 docs/source-lit/levels/intermediate_level_7.rst create mode 120000 docs/source-lit/levels/intermediate_level_8.rst create mode 120000 docs/source-lit/levels/intermediate_level_9.rst create mode 120000 docs/source-lit/links.rst create mode 100644 docs/source-lit/make.bat create mode 120000 docs/source-lit/model/build_model.rst create mode 120000 docs/source-lit/model/build_model_advanced.rst create mode 120000 docs/source-lit/model/build_model_expert.rst create mode 120000 docs/source-lit/model/build_model_intermediate.rst create mode 120000 docs/source-lit/model/manual_optimization.rst create mode 120000 docs/source-lit/model/own_your_loop.rst create mode 120000 docs/source-lit/model/train_model_basic.rst create mode 120000 docs/source-lit/moving_to_the_cloud.rst create mode 120000 docs/source-lit/quickstart.rst create mode 120000 docs/source-lit/starter/converting.rst create mode 120000 docs/source-lit/starter/installation.rst create mode 120000 docs/source-lit/starter/installation_mac.rst create mode 120000 docs/source-lit/starter/introduction.rst create mode 120000 docs/source-lit/starter/lightning_lite.rst create mode 120000 docs/source-lit/starter/style_guide.rst create mode 120000 docs/source-lit/strategies/hivemind.rst create mode 120000 docs/source-lit/strategies/hivemind_basic.rst create mode 120000 docs/source-lit/strategies/hivemind_expert.rst create mode 120000 docs/source-lit/strategies/hivemind_intermediate.rst create mode 120000 docs/source-lit/testing.rst create mode 120000 docs/source-lit/tuning/profiler.rst create mode 120000 
docs/source-lit/tuning/profiler_advanced.rst create mode 120000 docs/source-lit/tuning/profiler_basic.rst create mode 120000 docs/source-lit/tuning/profiler_expert.rst create mode 120000 docs/source-lit/tuning/profiler_intermediate.rst create mode 120000 docs/source-lit/ui_and_frontends.rst create mode 120000 docs/source-lit/visualize/experiment_managers.rst create mode 120000 docs/source-lit/visualize/loggers.rst create mode 120000 docs/source-lit/visualize/logging_advanced.rst create mode 120000 docs/source-lit/visualize/logging_basic.rst create mode 120000 docs/source-lit/visualize/logging_expert.rst create mode 120000 docs/source-lit/visualize/logging_intermediate.rst create mode 120000 docs/source-lit/visualize/supported_exp_managers.rst create mode 120000 docs/source-lit/workflows/access_app_state/access_app_state.rst create mode 120000 docs/source-lit/workflows/add_components/index.rst create mode 120000 docs/source-lit/workflows/add_server/any_server.rst create mode 120000 docs/source-lit/workflows/add_server/flask_basic.rst create mode 120000 docs/source-lit/workflows/add_server/index.rst create mode 120000 docs/source-lit/workflows/add_server/index_content.rst create mode 120000 docs/source-lit/workflows/add_web_link.rst create mode 120000 docs/source-lit/workflows/add_web_ui/angular_js_intermediate.rst create mode 120000 docs/source-lit/workflows/add_web_ui/dash/basic.rst create mode 120000 docs/source-lit/workflows/add_web_ui/dash/index.rst create mode 120000 docs/source-lit/workflows/add_web_ui/dash/intermediate.rst create mode 120000 docs/source-lit/workflows/add_web_ui/dash/intermediate_plot.py create mode 120000 docs/source-lit/workflows/add_web_ui/dash/intermediate_state.py create mode 120000 docs/source-lit/workflows/add_web_ui/example_app.rst create mode 120000 docs/source-lit/workflows/add_web_ui/glossary_front_end.rst create mode 120000 docs/source-lit/workflows/add_web_ui/glossary_ui.rst create mode 120000 docs/source-lit/workflows/add_web_ui/gradio/basic.rst create mode 120000 docs/source-lit/workflows/add_web_ui/gradio/index.rst create mode 120000 docs/source-lit/workflows/add_web_ui/gradio/intermediate.rst create mode 120000 docs/source-lit/workflows/add_web_ui/html/basic.rst create mode 120000 docs/source-lit/workflows/add_web_ui/html/index.rst create mode 120000 docs/source-lit/workflows/add_web_ui/html/intermediate.rst create mode 120000 docs/source-lit/workflows/add_web_ui/index.rst create mode 120000 docs/source-lit/workflows/add_web_ui/index_content.rst create mode 120000 docs/source-lit/workflows/add_web_ui/integrate_any_javascript_framework.rst create mode 120000 docs/source-lit/workflows/add_web_ui/jupyter_basic.rst create mode 120000 docs/source-lit/workflows/add_web_ui/react/communicate_between_react_and_lightning.rst create mode 120000 docs/source-lit/workflows/add_web_ui/react/connect_react_and_lightning.rst create mode 120000 docs/source-lit/workflows/add_web_ui/react/create_react_template.rst create mode 120000 docs/source-lit/workflows/add_web_ui/react/index.rst create mode 120000 docs/source-lit/workflows/add_web_ui/react/react_development_workflow.rst create mode 120000 docs/source-lit/workflows/add_web_ui/streamlit/basic.rst create mode 120000 docs/source-lit/workflows/add_web_ui/streamlit/index.rst create mode 120000 docs/source-lit/workflows/add_web_ui/streamlit/intermediate.rst create mode 120000 docs/source-lit/workflows/add_web_ui/vue_js_intermediate.rst create mode 120000 docs/source-lit/workflows/arrange_tabs/arrange_app_basic.rst create 
mode 120000 docs/source-lit/workflows/arrange_tabs/arrange_app_intermediate.rst create mode 120000 docs/source-lit/workflows/arrange_tabs/index.rst create mode 120000 docs/source-lit/workflows/arrange_tabs/index_content.rst create mode 120000 docs/source-lit/workflows/build_lightning_app/from_pytorch_lightning_script.rst create mode 120000 docs/source-lit/workflows/build_lightning_app/from_scratch.rst create mode 120000 docs/source-lit/workflows/build_lightning_app/from_scratch_content.rst create mode 120000 docs/source-lit/workflows/build_lightning_app/index.rst create mode 120000 docs/source-lit/workflows/build_lightning_app/index_content.rst create mode 120000 docs/source-lit/workflows/build_lightning_component/basic.rst create mode 120000 docs/source-lit/workflows/build_lightning_component/from_scratch_component_content.rst create mode 120000 docs/source-lit/workflows/build_lightning_component/index.rst create mode 120000 docs/source-lit/workflows/build_lightning_component/index_content.rst create mode 120000 docs/source-lit/workflows/build_lightning_component/intermediate.rst create mode 120000 docs/source-lit/workflows/build_lightning_component/publish_a_component.rst create mode 120000 docs/source-lit/workflows/byoc/index.rst create mode 120000 docs/source-lit/workflows/debug_locally.rst create mode 120000 docs/source-lit/workflows/enable_fault_tolerance.rst create mode 120000 docs/source-lit/workflows/extend_app.rst create mode 120000 docs/source-lit/workflows/index.rst create mode 120000 docs/source-lit/workflows/run_app_on_cloud/cloud_files.rst create mode 120000 docs/source-lit/workflows/run_app_on_cloud/index.rst create mode 120000 docs/source-lit/workflows/run_app_on_cloud/index_content.rst create mode 120000 docs/source-lit/workflows/run_app_on_cloud/lightning_cloud.rst create mode 120000 docs/source-lit/workflows/run_app_on_cloud/on_prem.rst create mode 120000 docs/source-lit/workflows/run_app_on_cloud/on_your_own_machine.rst create mode 120000 docs/source-lit/workflows/run_app_snippet.rst create mode 120000 docs/source-lit/workflows/run_components_on_different_hardware.rst create mode 120000 docs/source-lit/workflows/run_on_private_cloud.rst create mode 120000 docs/source-lit/workflows/run_work_in_parallel.rst create mode 120000 docs/source-lit/workflows/run_work_in_parallel_content.rst create mode 120000 docs/source-lit/workflows/run_work_once.rst create mode 120000 docs/source-lit/workflows/run_work_once_content.rst create mode 120000 docs/source-lit/workflows/schedule_apps.rst create mode 120000 docs/source-lit/workflows/share_app.rst create mode 120000 docs/source-lit/workflows/share_files_between_components.rst create mode 120000 docs/source-lit/workflows/share_files_between_components/app.py create mode 120000 docs/source-lit/workflows/test_an_app.rst create mode 100644 requirements/lit/base.txt create mode 100644 requirements/lit/devel.txt create mode 100644 requirements/lit/docs.txt diff --git a/.github/workflows/docs-checks.yml b/.github/workflows/docs-checks.yml index a91f216af963f..0e616b17d3598 100644 --- a/.github/workflows/docs-checks.yml +++ b/.github/workflows/docs-checks.yml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - pkg: ["app", "pytorch"] + pkg: ["app", "pytorch"] # TODO: , "lit" steps: - uses: actions/checkout@v2 with: @@ -68,7 +68,7 @@ jobs: strategy: fail-fast: false matrix: - pkg: ["app", "pytorch"] + pkg: ["app", "pytorch", "lit"] steps: - uses: actions/checkout@v2 with: @@ -112,7 +112,7 @@ jobs: - name: Upload built docs uses: 
actions/upload-artifact@v3 with: - name: docs-results-${{ github.sha }} + name: docs-${{ matrix.pkg }}-${{ github.sha }} path: docs/build/html/ # Use always() to always run this step to publish test results when there are test failures if: success() diff --git a/.gitignore b/.gitignore index 259d9f271189c..a7dc915b84284 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,9 @@ lightning_logs/ # Documentations docs/source-app/generated docs/source-app/*/generated +docs/source-lit/generated +docs/source-lit/*/generated +docs/source-lit/api docs/source-pytorch/api docs/source-pytorch/*.md docs/source-pytorch/generated diff --git a/Makefile b/Makefile index c434fc19c99f6..0bb2722d4eaee 100644 --- a/Makefile +++ b/Makefile @@ -22,6 +22,9 @@ clean: rm -rf ./docs/source-pytorch/api rm -rf ./docs/source-app/generated rm -rf ./docs/source-app/*/generated + rm -rf ./docs/source-lit/api + rm -rf ./docs/source-lit/generated + rm -rf ./docs/source-lit/*/generated rm -rf build rm -rf dist rm -rf *.egg-info diff --git a/docs/create-symlinks.py b/docs/create-symlinks.py new file mode 100644 index 0000000000000..41dcf8bc53c89 --- /dev/null +++ b/docs/create-symlinks.py @@ -0,0 +1,27 @@ +import glob +import os + + +def symlink_folder(source_dir, target_dir: str = "source-lit") -> None: + assert os.path.isdir(source_dir) + assert os.path.isdir(target_dir) + ls = glob.glob(os.path.join(source_dir, "**"), recursive=True) + for path_ in ls: + path_target = path_.replace(source_dir, target_dir) + if os.path.isdir(path_) or os.path.exists(path_target): + continue + if os.path.islink(path_target): + print(path_target) + continue + path_dir = os.path.dirname(path_target) + os.makedirs(path_dir, exist_ok=True) + depth = path_.count(os.path.sep) + path_root = os.path.sep.join([".."] * depth) + path_source = os.path.join(path_root, path_) + # print(path_source, path_target, os.path.exists(path_target)) + os.symlink(path_source, path_target) + + +if __name__ == "__main__": + for name in ("app", "pytorch"): + symlink_folder(f"source-{name}") diff --git a/docs/source-app/code_samples/quickstart/app/app_0.py b/docs/source-app/code_samples/quickstart/app/app_0.py index 3952cafc957e0..82b687b0f258b 100644 --- a/docs/source-app/code_samples/quickstart/app/app_0.py +++ b/docs/source-app/code_samples/quickstart/app/app_0.py @@ -1,6 +1,5 @@ -from docs.quickstart.app_02 import HourLongWork - import lightning as L +from docs.quickstart.app_02 import HourLongWork class RootFlow(L.LightningFlow): diff --git a/docs/source-lit/Makefile b/docs/source-lit/Makefile new file mode 100644 index 0000000000000..268e09561bb72 --- /dev/null +++ b/docs/source-lit/Makefile @@ -0,0 +1,19 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = -T -W +SPHINXBUILD = sphinx-build +SOURCEDIR = . +BUILDDIR = ../build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
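The `docs/create-symlinks.py` helper above builds each link target by prefixing the mirrored path with one `..` per directory level, so every file under the new `docs/source-lit` tree points back at its counterpart in `source-app` or `source-pytorch`. A minimal standalone sketch of that computation, assuming the script runs from the `docs/` directory as in this patch (the sample path is only illustrative):

import os

# Mirror one file from source-pytorch into source-lit, the same way
# symlink_folder() in docs/create-symlinks.py derives its link targets.
path_ = os.path.join("source-pytorch", "accelerators", "gpu.rst")
path_target = path_.replace("source-pytorch", "source-lit")

depth = path_.count(os.path.sep)              # 2 -> the link sits two levels below docs/
path_root = os.path.sep.join([".."] * depth)  # "../.."
path_source = os.path.join(path_root, path_)  # "../../source-pytorch/accelerators/gpu.rst"

print(path_target, "->", path_source)
# source-lit/accelerators/gpu.rst -> ../../source-pytorch/accelerators/gpu.rst

The result matches the `../../source-pytorch/...` targets recorded in the symlink hunks that follow.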
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/source-lit/_static/copybutton.js b/docs/source-lit/_static/copybutton.js new file mode 120000 index 0000000000000..9107b39a0198f --- /dev/null +++ b/docs/source-lit/_static/copybutton.js @@ -0,0 +1 @@ +../../source-app/_static/copybutton.js \ No newline at end of file diff --git a/docs/source-lit/_static/images/accelerator/ipus/profiler.png b/docs/source-lit/_static/images/accelerator/ipus/profiler.png new file mode 120000 index 0000000000000..39d2255c3315a --- /dev/null +++ b/docs/source-lit/_static/images/accelerator/ipus/profiler.png @@ -0,0 +1 @@ +../../../../../source-pytorch/_static/images/accelerator/ipus/profiler.png \ No newline at end of file diff --git a/docs/source-lit/_static/images/benchmarks/figure-parity-times.png b/docs/source-lit/_static/images/benchmarks/figure-parity-times.png new file mode 120000 index 0000000000000..c24facff8aa35 --- /dev/null +++ b/docs/source-lit/_static/images/benchmarks/figure-parity-times.png @@ -0,0 +1 @@ +../../../../source-pytorch/_static/images/benchmarks/figure-parity-times.png \ No newline at end of file diff --git a/docs/source-lit/_static/images/general/PTL101_youtube_thumbnail.jpg b/docs/source-lit/_static/images/general/PTL101_youtube_thumbnail.jpg new file mode 120000 index 0000000000000..93e577a562f2f --- /dev/null +++ b/docs/source-lit/_static/images/general/PTL101_youtube_thumbnail.jpg @@ -0,0 +1 @@ +../../../../source-pytorch/_static/images/general/PTL101_youtube_thumbnail.jpg \ No newline at end of file diff --git a/docs/source-lit/_static/images/general/fast_2.gif b/docs/source-lit/_static/images/general/fast_2.gif new file mode 120000 index 0000000000000..65c39c0916c28 --- /dev/null +++ b/docs/source-lit/_static/images/general/fast_2.gif @@ -0,0 +1 @@ +../../../../source-pytorch/_static/images/general/fast_2.gif \ No newline at end of file diff --git a/docs/source-lit/_static/images/general/pl_overview.gif b/docs/source-lit/_static/images/general/pl_overview.gif new file mode 120000 index 0000000000000..2c16af0265045 --- /dev/null +++ b/docs/source-lit/_static/images/general/pl_overview.gif @@ -0,0 +1 @@ +../../../../source-pytorch/_static/images/general/pl_overview.gif \ No newline at end of file diff --git a/docs/source-lit/_static/images/general/pl_overview_flat.jpg b/docs/source-lit/_static/images/general/pl_overview_flat.jpg new file mode 120000 index 0000000000000..d1ac4ab4765ab --- /dev/null +++ b/docs/source-lit/_static/images/general/pl_overview_flat.jpg @@ -0,0 +1 @@ +../../../../source-pytorch/_static/images/general/pl_overview_flat.jpg \ No newline at end of file diff --git a/docs/source-lit/_static/images/general/pl_quick_start_full_compressed.gif b/docs/source-lit/_static/images/general/pl_quick_start_full_compressed.gif new file mode 120000 index 0000000000000..031a1981c2173 --- /dev/null +++ b/docs/source-lit/_static/images/general/pl_quick_start_full_compressed.gif @@ -0,0 +1 @@ +../../../../source-pytorch/_static/images/general/pl_quick_start_full_compressed.gif \ No newline at end of file diff --git a/docs/source-lit/_static/images/general/tf_loss.jpg b/docs/source-lit/_static/images/general/tf_loss.jpg new file mode 120000 index 0000000000000..9ea26d885891a --- /dev/null +++ b/docs/source-lit/_static/images/general/tf_loss.jpg @@ -0,0 +1 @@ +../../../../source-pytorch/_static/images/general/tf_loss.jpg \ No newline at end of file diff --git a/docs/source-lit/_static/images/general/tf_tags.jpg 
b/docs/source-lit/_static/images/general/tf_tags.jpg new file mode 120000 index 0000000000000..b0a8c01d3baee --- /dev/null +++ b/docs/source-lit/_static/images/general/tf_tags.jpg @@ -0,0 +1 @@ +../../../../source-pytorch/_static/images/general/tf_tags.jpg \ No newline at end of file diff --git a/docs/source-lit/_static/images/general/tutorial_cover.jpg b/docs/source-lit/_static/images/general/tutorial_cover.jpg new file mode 120000 index 0000000000000..4cf24f34579e6 --- /dev/null +++ b/docs/source-lit/_static/images/general/tutorial_cover.jpg @@ -0,0 +1 @@ +../../../../source-pytorch/_static/images/general/tutorial_cover.jpg \ No newline at end of file diff --git a/docs/source-lit/_static/images/icon.svg b/docs/source-lit/_static/images/icon.svg new file mode 120000 index 0000000000000..5822f0195bea1 --- /dev/null +++ b/docs/source-lit/_static/images/icon.svg @@ -0,0 +1 @@ +../../../source-app/_static/images/icon.svg \ No newline at end of file diff --git a/docs/source-lit/_static/images/lightning_lite/lite.gif b/docs/source-lit/_static/images/lightning_lite/lite.gif new file mode 120000 index 0000000000000..5672b49b05653 --- /dev/null +++ b/docs/source-lit/_static/images/lightning_lite/lite.gif @@ -0,0 +1 @@ +../../../../source-pytorch/_static/images/lightning_lite/lite.gif \ No newline at end of file diff --git a/docs/source-lit/_static/images/lightning_module/pt_to_pl.png b/docs/source-lit/_static/images/lightning_module/pt_to_pl.png new file mode 120000 index 0000000000000..ea86e767ce97d --- /dev/null +++ b/docs/source-lit/_static/images/lightning_module/pt_to_pl.png @@ -0,0 +1 @@ +../../../../source-pytorch/_static/images/lightning_module/pt_to_pl.png \ No newline at end of file diff --git a/docs/source-lit/_static/images/lightning_module/pt_trainer.png b/docs/source-lit/_static/images/lightning_module/pt_trainer.png new file mode 120000 index 0000000000000..d2103d1f5d9d4 --- /dev/null +++ b/docs/source-lit/_static/images/lightning_module/pt_trainer.png @@ -0,0 +1 @@ +../../../../source-pytorch/_static/images/lightning_module/pt_trainer.png \ No newline at end of file diff --git a/docs/source-lit/_static/images/logo-large.svg b/docs/source-lit/_static/images/logo-large.svg new file mode 120000 index 0000000000000..36fc4e8b52602 --- /dev/null +++ b/docs/source-lit/_static/images/logo-large.svg @@ -0,0 +1 @@ +../../../source-app/_static/images/logo-large.svg \ No newline at end of file diff --git a/docs/source-lit/_static/images/logo-small.svg b/docs/source-lit/_static/images/logo-small.svg new file mode 120000 index 0000000000000..8562d1a8c3a12 --- /dev/null +++ b/docs/source-lit/_static/images/logo-small.svg @@ -0,0 +1 @@ +../../../source-app/_static/images/logo-small.svg \ No newline at end of file diff --git a/docs/source-lit/_static/images/logo.png b/docs/source-lit/_static/images/logo.png new file mode 120000 index 0000000000000..e71cac8362e59 --- /dev/null +++ b/docs/source-lit/_static/images/logo.png @@ -0,0 +1 @@ +../../../source-app/_static/images/logo.png \ No newline at end of file diff --git a/docs/source-lit/_static/images/logo.svg b/docs/source-lit/_static/images/logo.svg new file mode 120000 index 0000000000000..c077f71dde13b --- /dev/null +++ b/docs/source-lit/_static/images/logo.svg @@ -0,0 +1 @@ +../../../source-app/_static/images/logo.svg \ No newline at end of file diff --git a/docs/source-lit/_static/images/logo_light.svg b/docs/source-lit/_static/images/logo_light.svg new file mode 120000 index 0000000000000..756920c261937 --- /dev/null +++ 
b/docs/source-lit/_static/images/logo_light.svg @@ -0,0 +1 @@ +../../../source-pytorch/_static/images/logo_light.svg \ No newline at end of file diff --git a/docs/source-lit/_static/images/mnist_imgs/mnist_cpu_bar.png b/docs/source-lit/_static/images/mnist_imgs/mnist_cpu_bar.png new file mode 120000 index 0000000000000..ccff03f287f22 --- /dev/null +++ b/docs/source-lit/_static/images/mnist_imgs/mnist_cpu_bar.png @@ -0,0 +1 @@ +../../../../source-pytorch/_static/images/mnist_imgs/mnist_cpu_bar.png \ No newline at end of file diff --git a/docs/source-lit/_static/images/mnist_imgs/mnist_gpu.png b/docs/source-lit/_static/images/mnist_imgs/mnist_gpu.png new file mode 120000 index 0000000000000..a2f24ca539cdf --- /dev/null +++ b/docs/source-lit/_static/images/mnist_imgs/mnist_gpu.png @@ -0,0 +1 @@ +../../../../source-pytorch/_static/images/mnist_imgs/mnist_gpu.png \ No newline at end of file diff --git a/docs/source-lit/_static/images/mnist_imgs/mnist_tb.png b/docs/source-lit/_static/images/mnist_imgs/mnist_tb.png new file mode 120000 index 0000000000000..75a78cd66eca8 --- /dev/null +++ b/docs/source-lit/_static/images/mnist_imgs/mnist_tb.png @@ -0,0 +1 @@ +../../../../source-pytorch/_static/images/mnist_imgs/mnist_tb.png \ No newline at end of file diff --git a/docs/source-lit/_static/images/mnist_imgs/pt_to_pl.jpg b/docs/source-lit/_static/images/mnist_imgs/pt_to_pl.jpg new file mode 120000 index 0000000000000..4305631585129 --- /dev/null +++ b/docs/source-lit/_static/images/mnist_imgs/pt_to_pl.jpg @@ -0,0 +1 @@ +../../../../source-pytorch/_static/images/mnist_imgs/pt_to_pl.jpg \ No newline at end of file diff --git a/docs/source-lit/_static/images/mnist_imgs/restart_runtime.png b/docs/source-lit/_static/images/mnist_imgs/restart_runtime.png new file mode 120000 index 0000000000000..0497e5ed283bb --- /dev/null +++ b/docs/source-lit/_static/images/mnist_imgs/restart_runtime.png @@ -0,0 +1 @@ +../../../../source-pytorch/_static/images/mnist_imgs/restart_runtime.png \ No newline at end of file diff --git a/docs/source-lit/_static/images/mnist_imgs/runtime_tpu.png b/docs/source-lit/_static/images/mnist_imgs/runtime_tpu.png new file mode 120000 index 0000000000000..9cc3b120494b6 --- /dev/null +++ b/docs/source-lit/_static/images/mnist_imgs/runtime_tpu.png @@ -0,0 +1 @@ +../../../../source-pytorch/_static/images/mnist_imgs/runtime_tpu.png \ No newline at end of file diff --git a/docs/source-lit/_static/images/mnist_imgs/tpu_fast.png b/docs/source-lit/_static/images/mnist_imgs/tpu_fast.png new file mode 120000 index 0000000000000..8a2ef86c9c216 --- /dev/null +++ b/docs/source-lit/_static/images/mnist_imgs/tpu_fast.png @@ -0,0 +1 @@ +../../../../source-pytorch/_static/images/mnist_imgs/tpu_fast.png \ No newline at end of file diff --git a/docs/source-lit/_static/images/mnist_imgs/tpu_start.png b/docs/source-lit/_static/images/mnist_imgs/tpu_start.png new file mode 120000 index 0000000000000..a4f112db3196f --- /dev/null +++ b/docs/source-lit/_static/images/mnist_imgs/tpu_start.png @@ -0,0 +1 @@ +../../../../source-pytorch/_static/images/mnist_imgs/tpu_start.png \ No newline at end of file diff --git a/docs/source-lit/_static/images/trainer/lr_finder.png b/docs/source-lit/_static/images/trainer/lr_finder.png new file mode 120000 index 0000000000000..367bc3205d83d --- /dev/null +++ b/docs/source-lit/_static/images/trainer/lr_finder.png @@ -0,0 +1 @@ +../../../../source-pytorch/_static/images/trainer/lr_finder.png \ No newline at end of file diff --git a/docs/source-lit/_static/main.css 
b/docs/source-lit/_static/main.css new file mode 120000 index 0000000000000..fe7381f7d2e68 --- /dev/null +++ b/docs/source-lit/_static/main.css @@ -0,0 +1 @@ +../../source-app/_static/main.css \ No newline at end of file diff --git a/docs/source-lit/_templates/autosummary/module.rst b/docs/source-lit/_templates/autosummary/module.rst new file mode 120000 index 0000000000000..a3c22cac8f6a5 --- /dev/null +++ b/docs/source-lit/_templates/autosummary/module.rst @@ -0,0 +1 @@ +../../../source-pytorch/_templates/autosummary/module.rst \ No newline at end of file diff --git a/docs/source-lit/_templates/classtemplate.rst b/docs/source-lit/_templates/classtemplate.rst new file mode 120000 index 0000000000000..1cd4d3b75682c --- /dev/null +++ b/docs/source-lit/_templates/classtemplate.rst @@ -0,0 +1 @@ +../../source-app/_templates/classtemplate.rst \ No newline at end of file diff --git a/docs/source-lit/_templates/classtemplate_no_index.rst b/docs/source-lit/_templates/classtemplate_no_index.rst new file mode 120000 index 0000000000000..f78de48f60417 --- /dev/null +++ b/docs/source-lit/_templates/classtemplate_no_index.rst @@ -0,0 +1 @@ +../../source-app/_templates/classtemplate_no_index.rst \ No newline at end of file diff --git a/docs/source-lit/_templates/layout.html b/docs/source-lit/_templates/layout.html new file mode 120000 index 0000000000000..94fb27f9ce3a8 --- /dev/null +++ b/docs/source-lit/_templates/layout.html @@ -0,0 +1 @@ +../../source-app/_templates/layout.html \ No newline at end of file diff --git a/docs/source-lit/_templates/theme_variables.jinja b/docs/source-lit/_templates/theme_variables.jinja new file mode 120000 index 0000000000000..59fce3da75a95 --- /dev/null +++ b/docs/source-lit/_templates/theme_variables.jinja @@ -0,0 +1 @@ +../../source-app/_templates/theme_variables.jinja \ No newline at end of file diff --git a/docs/source-lit/accelerators/accelerator_prepare.rst b/docs/source-lit/accelerators/accelerator_prepare.rst new file mode 120000 index 0000000000000..c6b02ebd783ce --- /dev/null +++ b/docs/source-lit/accelerators/accelerator_prepare.rst @@ -0,0 +1 @@ +../../source-pytorch/accelerators/accelerator_prepare.rst \ No newline at end of file diff --git a/docs/source-lit/accelerators/gpu.rst b/docs/source-lit/accelerators/gpu.rst new file mode 120000 index 0000000000000..63a081f0885e0 --- /dev/null +++ b/docs/source-lit/accelerators/gpu.rst @@ -0,0 +1 @@ +../../source-pytorch/accelerators/gpu.rst \ No newline at end of file diff --git a/docs/source-lit/accelerators/gpu_advanced.rst b/docs/source-lit/accelerators/gpu_advanced.rst new file mode 120000 index 0000000000000..8d03094616eb6 --- /dev/null +++ b/docs/source-lit/accelerators/gpu_advanced.rst @@ -0,0 +1 @@ +../../source-pytorch/accelerators/gpu_advanced.rst \ No newline at end of file diff --git a/docs/source-lit/accelerators/gpu_basic.rst b/docs/source-lit/accelerators/gpu_basic.rst new file mode 120000 index 0000000000000..f0e2653a5a379 --- /dev/null +++ b/docs/source-lit/accelerators/gpu_basic.rst @@ -0,0 +1 @@ +../../source-pytorch/accelerators/gpu_basic.rst \ No newline at end of file diff --git a/docs/source-lit/accelerators/gpu_expert.rst b/docs/source-lit/accelerators/gpu_expert.rst new file mode 120000 index 0000000000000..70cd79443e084 --- /dev/null +++ b/docs/source-lit/accelerators/gpu_expert.rst @@ -0,0 +1 @@ +../../source-pytorch/accelerators/gpu_expert.rst \ No newline at end of file diff --git a/docs/source-lit/accelerators/gpu_faq.rst b/docs/source-lit/accelerators/gpu_faq.rst new file mode 
120000 index 0000000000000..de095922b4e9c --- /dev/null +++ b/docs/source-lit/accelerators/gpu_faq.rst @@ -0,0 +1 @@ +../../source-pytorch/accelerators/gpu_faq.rst \ No newline at end of file diff --git a/docs/source-lit/accelerators/gpu_intermediate.rst b/docs/source-lit/accelerators/gpu_intermediate.rst new file mode 120000 index 0000000000000..05fdaa82d6449 --- /dev/null +++ b/docs/source-lit/accelerators/gpu_intermediate.rst @@ -0,0 +1 @@ +../../source-pytorch/accelerators/gpu_intermediate.rst \ No newline at end of file diff --git a/docs/source-lit/accelerators/hpu.rst b/docs/source-lit/accelerators/hpu.rst new file mode 120000 index 0000000000000..1aed29dac02e7 --- /dev/null +++ b/docs/source-lit/accelerators/hpu.rst @@ -0,0 +1 @@ +../../source-pytorch/accelerators/hpu.rst \ No newline at end of file diff --git a/docs/source-lit/accelerators/hpu_basic.rst b/docs/source-lit/accelerators/hpu_basic.rst new file mode 120000 index 0000000000000..7341b334d4ccc --- /dev/null +++ b/docs/source-lit/accelerators/hpu_basic.rst @@ -0,0 +1 @@ +../../source-pytorch/accelerators/hpu_basic.rst \ No newline at end of file diff --git a/docs/source-lit/accelerators/hpu_intermediate.rst b/docs/source-lit/accelerators/hpu_intermediate.rst new file mode 120000 index 0000000000000..74867398c5560 --- /dev/null +++ b/docs/source-lit/accelerators/hpu_intermediate.rst @@ -0,0 +1 @@ +../../source-pytorch/accelerators/hpu_intermediate.rst \ No newline at end of file diff --git a/docs/source-lit/accelerators/ipu.rst b/docs/source-lit/accelerators/ipu.rst new file mode 120000 index 0000000000000..0a86662b21022 --- /dev/null +++ b/docs/source-lit/accelerators/ipu.rst @@ -0,0 +1 @@ +../../source-pytorch/accelerators/ipu.rst \ No newline at end of file diff --git a/docs/source-lit/accelerators/ipu_advanced.rst b/docs/source-lit/accelerators/ipu_advanced.rst new file mode 120000 index 0000000000000..2bf58cf1c40e2 --- /dev/null +++ b/docs/source-lit/accelerators/ipu_advanced.rst @@ -0,0 +1 @@ +../../source-pytorch/accelerators/ipu_advanced.rst \ No newline at end of file diff --git a/docs/source-lit/accelerators/ipu_basic.rst b/docs/source-lit/accelerators/ipu_basic.rst new file mode 120000 index 0000000000000..86eb352e4611e --- /dev/null +++ b/docs/source-lit/accelerators/ipu_basic.rst @@ -0,0 +1 @@ +../../source-pytorch/accelerators/ipu_basic.rst \ No newline at end of file diff --git a/docs/source-lit/accelerators/ipu_intermediate.rst b/docs/source-lit/accelerators/ipu_intermediate.rst new file mode 120000 index 0000000000000..92aff36a7457e --- /dev/null +++ b/docs/source-lit/accelerators/ipu_intermediate.rst @@ -0,0 +1 @@ +../../source-pytorch/accelerators/ipu_intermediate.rst \ No newline at end of file diff --git a/docs/source-lit/accelerators/mps.rst b/docs/source-lit/accelerators/mps.rst new file mode 120000 index 0000000000000..7b7e19ca22eb9 --- /dev/null +++ b/docs/source-lit/accelerators/mps.rst @@ -0,0 +1 @@ +../../source-pytorch/accelerators/mps.rst \ No newline at end of file diff --git a/docs/source-lit/accelerators/mps_basic.rst b/docs/source-lit/accelerators/mps_basic.rst new file mode 120000 index 0000000000000..3a3a4aa2a751e --- /dev/null +++ b/docs/source-lit/accelerators/mps_basic.rst @@ -0,0 +1 @@ +../../source-pytorch/accelerators/mps_basic.rst \ No newline at end of file diff --git a/docs/source-lit/accelerators/tpu.rst b/docs/source-lit/accelerators/tpu.rst new file mode 120000 index 0000000000000..f195539b4e396 --- /dev/null +++ b/docs/source-lit/accelerators/tpu.rst @@ -0,0 +1 @@ 
+../../source-pytorch/accelerators/tpu.rst \ No newline at end of file diff --git a/docs/source-lit/accelerators/tpu_advanced.rst b/docs/source-lit/accelerators/tpu_advanced.rst new file mode 120000 index 0000000000000..41e52bc335b0d --- /dev/null +++ b/docs/source-lit/accelerators/tpu_advanced.rst @@ -0,0 +1 @@ +../../source-pytorch/accelerators/tpu_advanced.rst \ No newline at end of file diff --git a/docs/source-lit/accelerators/tpu_basic.rst b/docs/source-lit/accelerators/tpu_basic.rst new file mode 120000 index 0000000000000..f32d5fea29d9d --- /dev/null +++ b/docs/source-lit/accelerators/tpu_basic.rst @@ -0,0 +1 @@ +../../source-pytorch/accelerators/tpu_basic.rst \ No newline at end of file diff --git a/docs/source-lit/accelerators/tpu_faq.rst b/docs/source-lit/accelerators/tpu_faq.rst new file mode 120000 index 0000000000000..c17d4ea43a242 --- /dev/null +++ b/docs/source-lit/accelerators/tpu_faq.rst @@ -0,0 +1 @@ +../../source-pytorch/accelerators/tpu_faq.rst \ No newline at end of file diff --git a/docs/source-lit/accelerators/tpu_intermediate.rst b/docs/source-lit/accelerators/tpu_intermediate.rst new file mode 120000 index 0000000000000..2cc579464434a --- /dev/null +++ b/docs/source-lit/accelerators/tpu_intermediate.rst @@ -0,0 +1 @@ +../../source-pytorch/accelerators/tpu_intermediate.rst \ No newline at end of file diff --git a/docs/source-lit/advanced/finetuning.rst b/docs/source-lit/advanced/finetuning.rst new file mode 120000 index 0000000000000..ba4ea9cedf444 --- /dev/null +++ b/docs/source-lit/advanced/finetuning.rst @@ -0,0 +1 @@ +../../source-pytorch/advanced/finetuning.rst \ No newline at end of file diff --git a/docs/source-lit/advanced/model_parallel.rst b/docs/source-lit/advanced/model_parallel.rst new file mode 120000 index 0000000000000..9d80550eb1731 --- /dev/null +++ b/docs/source-lit/advanced/model_parallel.rst @@ -0,0 +1 @@ +../../source-pytorch/advanced/model_parallel.rst \ No newline at end of file diff --git a/docs/source-lit/advanced/pretrained.rst b/docs/source-lit/advanced/pretrained.rst new file mode 120000 index 0000000000000..49a32f9e8ad54 --- /dev/null +++ b/docs/source-lit/advanced/pretrained.rst @@ -0,0 +1 @@ +../../source-pytorch/advanced/pretrained.rst \ No newline at end of file diff --git a/docs/source-lit/advanced/pruning_quantization.rst b/docs/source-lit/advanced/pruning_quantization.rst new file mode 120000 index 0000000000000..ef1e673e32374 --- /dev/null +++ b/docs/source-lit/advanced/pruning_quantization.rst @@ -0,0 +1 @@ +../../source-pytorch/advanced/pruning_quantization.rst \ No newline at end of file diff --git a/docs/source-lit/advanced/strategy_registry.rst b/docs/source-lit/advanced/strategy_registry.rst new file mode 120000 index 0000000000000..dbee19ce19323 --- /dev/null +++ b/docs/source-lit/advanced/strategy_registry.rst @@ -0,0 +1 @@ +../../source-pytorch/advanced/strategy_registry.rst \ No newline at end of file diff --git a/docs/source-lit/advanced/training_tricks.rst b/docs/source-lit/advanced/training_tricks.rst new file mode 120000 index 0000000000000..83ffa08becd79 --- /dev/null +++ b/docs/source-lit/advanced/training_tricks.rst @@ -0,0 +1 @@ +../../source-pytorch/advanced/training_tricks.rst \ No newline at end of file diff --git a/docs/source-lit/advanced/transfer_learning.rst b/docs/source-lit/advanced/transfer_learning.rst new file mode 120000 index 0000000000000..40691f544b641 --- /dev/null +++ b/docs/source-lit/advanced/transfer_learning.rst @@ -0,0 +1 @@ +../../source-pytorch/advanced/transfer_learning.rst \ No 
newline at end of file diff --git a/docs/source-lit/api_reference/components.rst b/docs/source-lit/api_reference/components.rst new file mode 120000 index 0000000000000..dab2b19e9dd2c --- /dev/null +++ b/docs/source-lit/api_reference/components.rst @@ -0,0 +1 @@ +../../source-app/api_reference/components.rst \ No newline at end of file diff --git a/docs/source-lit/api_reference/core.rst b/docs/source-lit/api_reference/core.rst new file mode 120000 index 0000000000000..ebb000b19943b --- /dev/null +++ b/docs/source-lit/api_reference/core.rst @@ -0,0 +1 @@ +../../source-app/api_reference/core.rst \ No newline at end of file diff --git a/docs/source-lit/api_reference/frontend.rst b/docs/source-lit/api_reference/frontend.rst new file mode 120000 index 0000000000000..12381418b0a52 --- /dev/null +++ b/docs/source-lit/api_reference/frontend.rst @@ -0,0 +1 @@ +../../source-app/api_reference/frontend.rst \ No newline at end of file diff --git a/docs/source-lit/api_reference/runners.rst b/docs/source-lit/api_reference/runners.rst new file mode 120000 index 0000000000000..03ab544b74a67 --- /dev/null +++ b/docs/source-lit/api_reference/runners.rst @@ -0,0 +1 @@ +../../source-app/api_reference/runners.rst \ No newline at end of file diff --git a/docs/source-lit/api_reference/storage.rst b/docs/source-lit/api_reference/storage.rst new file mode 120000 index 0000000000000..6110360aae64a --- /dev/null +++ b/docs/source-lit/api_reference/storage.rst @@ -0,0 +1 @@ +../../source-app/api_reference/storage.rst \ No newline at end of file diff --git a/docs/source-lit/api_references.rst b/docs/source-lit/api_references.rst new file mode 120000 index 0000000000000..2a0e20f0ef97c --- /dev/null +++ b/docs/source-lit/api_references.rst @@ -0,0 +1 @@ +../source-app/api_references.rst \ No newline at end of file diff --git a/docs/source-lit/basics.rst b/docs/source-lit/basics.rst new file mode 120000 index 0000000000000..093a83dcf6fca --- /dev/null +++ b/docs/source-lit/basics.rst @@ -0,0 +1 @@ +../source-app/basics.rst \ No newline at end of file diff --git a/docs/source-lit/benchmarking/benchmarks.rst b/docs/source-lit/benchmarking/benchmarks.rst new file mode 120000 index 0000000000000..70d2b20866e56 --- /dev/null +++ b/docs/source-lit/benchmarking/benchmarks.rst @@ -0,0 +1 @@ +../../source-pytorch/benchmarking/benchmarks.rst \ No newline at end of file diff --git a/docs/source-lit/cli/lightning_cli.rst b/docs/source-lit/cli/lightning_cli.rst new file mode 120000 index 0000000000000..d1f102516e37d --- /dev/null +++ b/docs/source-lit/cli/lightning_cli.rst @@ -0,0 +1 @@ +../../source-pytorch/cli/lightning_cli.rst \ No newline at end of file diff --git a/docs/source-lit/cli/lightning_cli_advanced.rst b/docs/source-lit/cli/lightning_cli_advanced.rst new file mode 120000 index 0000000000000..7dcbd81cb3799 --- /dev/null +++ b/docs/source-lit/cli/lightning_cli_advanced.rst @@ -0,0 +1 @@ +../../source-pytorch/cli/lightning_cli_advanced.rst \ No newline at end of file diff --git a/docs/source-lit/cli/lightning_cli_advanced_2.rst b/docs/source-lit/cli/lightning_cli_advanced_2.rst new file mode 120000 index 0000000000000..5d84b50e28527 --- /dev/null +++ b/docs/source-lit/cli/lightning_cli_advanced_2.rst @@ -0,0 +1 @@ +../../source-pytorch/cli/lightning_cli_advanced_2.rst \ No newline at end of file diff --git a/docs/source-lit/cli/lightning_cli_advanced_3.rst b/docs/source-lit/cli/lightning_cli_advanced_3.rst new file mode 120000 index 0000000000000..297f6b45dfb51 --- /dev/null +++ 
b/docs/source-lit/cli/lightning_cli_advanced_3.rst @@ -0,0 +1 @@ +../../source-pytorch/cli/lightning_cli_advanced_3.rst \ No newline at end of file diff --git a/docs/source-lit/cli/lightning_cli_expert.rst b/docs/source-lit/cli/lightning_cli_expert.rst new file mode 120000 index 0000000000000..5ab1bcff291d0 --- /dev/null +++ b/docs/source-lit/cli/lightning_cli_expert.rst @@ -0,0 +1 @@ +../../source-pytorch/cli/lightning_cli_expert.rst \ No newline at end of file diff --git a/docs/source-lit/cli/lightning_cli_faq.rst b/docs/source-lit/cli/lightning_cli_faq.rst new file mode 120000 index 0000000000000..8418844eef77d --- /dev/null +++ b/docs/source-lit/cli/lightning_cli_faq.rst @@ -0,0 +1 @@ +../../source-pytorch/cli/lightning_cli_faq.rst \ No newline at end of file diff --git a/docs/source-lit/cli/lightning_cli_intermediate.rst b/docs/source-lit/cli/lightning_cli_intermediate.rst new file mode 120000 index 0000000000000..c5985e30fbc1f --- /dev/null +++ b/docs/source-lit/cli/lightning_cli_intermediate.rst @@ -0,0 +1 @@ +../../source-pytorch/cli/lightning_cli_intermediate.rst \ No newline at end of file diff --git a/docs/source-lit/cli/lightning_cli_intermediate_2.rst b/docs/source-lit/cli/lightning_cli_intermediate_2.rst new file mode 120000 index 0000000000000..0129bedd59425 --- /dev/null +++ b/docs/source-lit/cli/lightning_cli_intermediate_2.rst @@ -0,0 +1 @@ +../../source-pytorch/cli/lightning_cli_intermediate_2.rst \ No newline at end of file diff --git a/docs/source-lit/clouds/cloud_training.rst b/docs/source-lit/clouds/cloud_training.rst new file mode 120000 index 0000000000000..c6fbdd11d6a74 --- /dev/null +++ b/docs/source-lit/clouds/cloud_training.rst @@ -0,0 +1 @@ +../../source-pytorch/clouds/cloud_training.rst \ No newline at end of file diff --git a/docs/source-lit/clouds/cloud_training_intermediate.rst b/docs/source-lit/clouds/cloud_training_intermediate.rst new file mode 120000 index 0000000000000..38d9e726c0258 --- /dev/null +++ b/docs/source-lit/clouds/cloud_training_intermediate.rst @@ -0,0 +1 @@ +../../source-pytorch/clouds/cloud_training_intermediate.rst \ No newline at end of file diff --git a/docs/source-lit/clouds/cluster.rst b/docs/source-lit/clouds/cluster.rst new file mode 120000 index 0000000000000..a558161ff2f14 --- /dev/null +++ b/docs/source-lit/clouds/cluster.rst @@ -0,0 +1 @@ +../../source-pytorch/clouds/cluster.rst \ No newline at end of file diff --git a/docs/source-lit/clouds/cluster_advanced.rst b/docs/source-lit/clouds/cluster_advanced.rst new file mode 120000 index 0000000000000..5a4a3f695041b --- /dev/null +++ b/docs/source-lit/clouds/cluster_advanced.rst @@ -0,0 +1 @@ +../../source-pytorch/clouds/cluster_advanced.rst \ No newline at end of file diff --git a/docs/source-lit/clouds/cluster_expert.rst b/docs/source-lit/clouds/cluster_expert.rst new file mode 120000 index 0000000000000..b90bf104d72bc --- /dev/null +++ b/docs/source-lit/clouds/cluster_expert.rst @@ -0,0 +1 @@ +../../source-pytorch/clouds/cluster_expert.rst \ No newline at end of file diff --git a/docs/source-lit/clouds/cluster_intermediate_1.rst b/docs/source-lit/clouds/cluster_intermediate_1.rst new file mode 120000 index 0000000000000..69b8515ef336d --- /dev/null +++ b/docs/source-lit/clouds/cluster_intermediate_1.rst @@ -0,0 +1 @@ +../../source-pytorch/clouds/cluster_intermediate_1.rst \ No newline at end of file diff --git a/docs/source-lit/clouds/cluster_intermediate_2.rst b/docs/source-lit/clouds/cluster_intermediate_2.rst new file mode 120000 index 0000000000000..bf8f14f717250 --- 
/dev/null +++ b/docs/source-lit/clouds/cluster_intermediate_2.rst @@ -0,0 +1 @@ +../../source-pytorch/clouds/cluster_intermediate_2.rst \ No newline at end of file diff --git a/docs/source-lit/clouds/fault_tolerant_training.rst b/docs/source-lit/clouds/fault_tolerant_training.rst new file mode 120000 index 0000000000000..b9a08d2b6ca4b --- /dev/null +++ b/docs/source-lit/clouds/fault_tolerant_training.rst @@ -0,0 +1 @@ +../../source-pytorch/clouds/fault_tolerant_training.rst \ No newline at end of file diff --git a/docs/source-lit/clouds/fault_tolerant_training_basic.rst b/docs/source-lit/clouds/fault_tolerant_training_basic.rst new file mode 120000 index 0000000000000..96cc54be6071a --- /dev/null +++ b/docs/source-lit/clouds/fault_tolerant_training_basic.rst @@ -0,0 +1 @@ +../../source-pytorch/clouds/fault_tolerant_training_basic.rst \ No newline at end of file diff --git a/docs/source-lit/clouds/fault_tolerant_training_expert.rst b/docs/source-lit/clouds/fault_tolerant_training_expert.rst new file mode 120000 index 0000000000000..b7f026be06853 --- /dev/null +++ b/docs/source-lit/clouds/fault_tolerant_training_expert.rst @@ -0,0 +1 @@ +../../source-pytorch/clouds/fault_tolerant_training_expert.rst \ No newline at end of file diff --git a/docs/source-lit/clouds/fault_tolerant_training_faq.rst b/docs/source-lit/clouds/fault_tolerant_training_faq.rst new file mode 120000 index 0000000000000..14b678256d655 --- /dev/null +++ b/docs/source-lit/clouds/fault_tolerant_training_faq.rst @@ -0,0 +1 @@ +../../source-pytorch/clouds/fault_tolerant_training_faq.rst \ No newline at end of file diff --git a/docs/source-lit/clouds/grid_costs.rst b/docs/source-lit/clouds/grid_costs.rst new file mode 120000 index 0000000000000..22c5077fba99f --- /dev/null +++ b/docs/source-lit/clouds/grid_costs.rst @@ -0,0 +1 @@ +../../source-pytorch/clouds/grid_costs.rst \ No newline at end of file diff --git a/docs/source-lit/clouds/run_advanced.rst b/docs/source-lit/clouds/run_advanced.rst new file mode 120000 index 0000000000000..8b5c272af1492 --- /dev/null +++ b/docs/source-lit/clouds/run_advanced.rst @@ -0,0 +1 @@ +../../source-pytorch/clouds/run_advanced.rst \ No newline at end of file diff --git a/docs/source-lit/clouds/run_basic.rst b/docs/source-lit/clouds/run_basic.rst new file mode 120000 index 0000000000000..8c75eed4437ad --- /dev/null +++ b/docs/source-lit/clouds/run_basic.rst @@ -0,0 +1 @@ +../../source-pytorch/clouds/run_basic.rst \ No newline at end of file diff --git a/docs/source-lit/clouds/run_expert.rst b/docs/source-lit/clouds/run_expert.rst new file mode 120000 index 0000000000000..017a4481de3eb --- /dev/null +++ b/docs/source-lit/clouds/run_expert.rst @@ -0,0 +1 @@ +../../source-pytorch/clouds/run_expert.rst \ No newline at end of file diff --git a/docs/source-lit/clouds/run_intermediate.rst b/docs/source-lit/clouds/run_intermediate.rst new file mode 120000 index 0000000000000..64530707ba214 --- /dev/null +++ b/docs/source-lit/clouds/run_intermediate.rst @@ -0,0 +1 @@ +../../source-pytorch/clouds/run_intermediate.rst \ No newline at end of file diff --git a/docs/source-lit/clouds/session_basic.rst b/docs/source-lit/clouds/session_basic.rst new file mode 120000 index 0000000000000..8c0a5975b45b9 --- /dev/null +++ b/docs/source-lit/clouds/session_basic.rst @@ -0,0 +1 @@ +../../source-pytorch/clouds/session_basic.rst \ No newline at end of file diff --git a/docs/source-lit/clouds/session_intermediate.rst b/docs/source-lit/clouds/session_intermediate.rst new file mode 120000 index 
0000000000000..a667c20f3df6f --- /dev/null +++ b/docs/source-lit/clouds/session_intermediate.rst @@ -0,0 +1 @@ +../../source-pytorch/clouds/session_intermediate.rst \ No newline at end of file diff --git a/docs/source-lit/code_samples/basics/0.py b/docs/source-lit/code_samples/basics/0.py new file mode 120000 index 0000000000000..c996143c8226d --- /dev/null +++ b/docs/source-lit/code_samples/basics/0.py @@ -0,0 +1 @@ +../../../source-app/code_samples/basics/0.py \ No newline at end of file diff --git a/docs/source-lit/code_samples/basics/1.py b/docs/source-lit/code_samples/basics/1.py new file mode 120000 index 0000000000000..86eaafbce8e2c --- /dev/null +++ b/docs/source-lit/code_samples/basics/1.py @@ -0,0 +1 @@ +../../../source-app/code_samples/basics/1.py \ No newline at end of file diff --git a/docs/source-lit/code_samples/convert_pl_to_app/app.py b/docs/source-lit/code_samples/convert_pl_to_app/app.py new file mode 120000 index 0000000000000..d37a224bcc08a --- /dev/null +++ b/docs/source-lit/code_samples/convert_pl_to_app/app.py @@ -0,0 +1 @@ +../../../source-app/code_samples/convert_pl_to_app/app.py \ No newline at end of file diff --git a/docs/source-lit/code_samples/convert_pl_to_app/requirements.txt b/docs/source-lit/code_samples/convert_pl_to_app/requirements.txt new file mode 120000 index 0000000000000..2d66a5b80f7e0 --- /dev/null +++ b/docs/source-lit/code_samples/convert_pl_to_app/requirements.txt @@ -0,0 +1 @@ +../../../source-app/code_samples/convert_pl_to_app/requirements.txt \ No newline at end of file diff --git a/docs/source-lit/code_samples/convert_pl_to_app/train.py b/docs/source-lit/code_samples/convert_pl_to_app/train.py new file mode 120000 index 0000000000000..35d582798c184 --- /dev/null +++ b/docs/source-lit/code_samples/convert_pl_to_app/train.py @@ -0,0 +1 @@ +../../../source-app/code_samples/convert_pl_to_app/train.py \ No newline at end of file diff --git a/docs/source-lit/code_samples/quickstart/__init__.py b/docs/source-lit/code_samples/quickstart/__init__.py new file mode 120000 index 0000000000000..a25fd5adf4bd5 --- /dev/null +++ b/docs/source-lit/code_samples/quickstart/__init__.py @@ -0,0 +1 @@ +../../../source-app/code_samples/quickstart/__init__.py \ No newline at end of file diff --git a/docs/source-lit/code_samples/quickstart/app/__init__.py b/docs/source-lit/code_samples/quickstart/app/__init__.py new file mode 120000 index 0000000000000..4f6359944c3b9 --- /dev/null +++ b/docs/source-lit/code_samples/quickstart/app/__init__.py @@ -0,0 +1 @@ +../../../../source-app/code_samples/quickstart/app/__init__.py \ No newline at end of file diff --git a/docs/source-lit/code_samples/quickstart/app/app_0.py b/docs/source-lit/code_samples/quickstart/app/app_0.py new file mode 120000 index 0000000000000..484ff87eb8c08 --- /dev/null +++ b/docs/source-lit/code_samples/quickstart/app/app_0.py @@ -0,0 +1 @@ +../../../../source-app/code_samples/quickstart/app/app_0.py \ No newline at end of file diff --git a/docs/source-lit/code_samples/quickstart/app/app_1.py b/docs/source-lit/code_samples/quickstart/app/app_1.py new file mode 120000 index 0000000000000..33fded8f37ea8 --- /dev/null +++ b/docs/source-lit/code_samples/quickstart/app/app_1.py @@ -0,0 +1 @@ +../../../../source-app/code_samples/quickstart/app/app_1.py \ No newline at end of file diff --git a/docs/source-lit/code_samples/quickstart/app_01.py b/docs/source-lit/code_samples/quickstart/app_01.py new file mode 120000 index 0000000000000..eb5c5c41634a2 --- /dev/null +++ 
b/docs/source-lit/code_samples/quickstart/app_01.py @@ -0,0 +1 @@ +../../../source-app/code_samples/quickstart/app_01.py \ No newline at end of file diff --git a/docs/source-lit/code_samples/quickstart/app_02.py b/docs/source-lit/code_samples/quickstart/app_02.py new file mode 120000 index 0000000000000..81eabb232abc1 --- /dev/null +++ b/docs/source-lit/code_samples/quickstart/app_02.py @@ -0,0 +1 @@ +../../../source-app/code_samples/quickstart/app_02.py \ No newline at end of file diff --git a/docs/source-lit/code_samples/quickstart/app_03.py b/docs/source-lit/code_samples/quickstart/app_03.py new file mode 120000 index 0000000000000..caba2737ec919 --- /dev/null +++ b/docs/source-lit/code_samples/quickstart/app_03.py @@ -0,0 +1 @@ +../../../source-app/code_samples/quickstart/app_03.py \ No newline at end of file diff --git a/docs/source-lit/code_samples/quickstart/app_comp.py b/docs/source-lit/code_samples/quickstart/app_comp.py new file mode 120000 index 0000000000000..808a52ffe6a1b --- /dev/null +++ b/docs/source-lit/code_samples/quickstart/app_comp.py @@ -0,0 +1 @@ +../../../source-app/code_samples/quickstart/app_comp.py \ No newline at end of file diff --git a/docs/source-lit/code_samples/quickstart/hello_world/app.py b/docs/source-lit/code_samples/quickstart/hello_world/app.py new file mode 120000 index 0000000000000..4cdc9129e7872 --- /dev/null +++ b/docs/source-lit/code_samples/quickstart/hello_world/app.py @@ -0,0 +1 @@ +../../../../source-app/code_samples/quickstart/hello_world/app.py \ No newline at end of file diff --git a/docs/source-lit/code_samples/quickstart/hello_world/app_ui.py b/docs/source-lit/code_samples/quickstart/hello_world/app_ui.py new file mode 120000 index 0000000000000..bd815d07d90ea --- /dev/null +++ b/docs/source-lit/code_samples/quickstart/hello_world/app_ui.py @@ -0,0 +1 @@ +../../../../source-app/code_samples/quickstart/hello_world/app_ui.py \ No newline at end of file diff --git a/docs/source-lit/code_samples/quickstart/hello_world/ui/index.html b/docs/source-lit/code_samples/quickstart/hello_world/ui/index.html new file mode 120000 index 0000000000000..ad21e064cd918 --- /dev/null +++ b/docs/source-lit/code_samples/quickstart/hello_world/ui/index.html @@ -0,0 +1 @@ +../../../../../source-app/code_samples/quickstart/hello_world/ui/index.html \ No newline at end of file diff --git a/docs/source-lit/common/checkpointing.rst b/docs/source-lit/common/checkpointing.rst new file mode 120000 index 0000000000000..7d133e17f0bb3 --- /dev/null +++ b/docs/source-lit/common/checkpointing.rst @@ -0,0 +1 @@ +../../source-pytorch/common/checkpointing.rst \ No newline at end of file diff --git a/docs/source-lit/common/checkpointing_advanced.rst b/docs/source-lit/common/checkpointing_advanced.rst new file mode 120000 index 0000000000000..e90bdea8daf4f --- /dev/null +++ b/docs/source-lit/common/checkpointing_advanced.rst @@ -0,0 +1 @@ +../../source-pytorch/common/checkpointing_advanced.rst \ No newline at end of file diff --git a/docs/source-lit/common/checkpointing_basic.rst b/docs/source-lit/common/checkpointing_basic.rst new file mode 120000 index 0000000000000..15141dfcba5e7 --- /dev/null +++ b/docs/source-lit/common/checkpointing_basic.rst @@ -0,0 +1 @@ +../../source-pytorch/common/checkpointing_basic.rst \ No newline at end of file diff --git a/docs/source-lit/common/checkpointing_expert.rst b/docs/source-lit/common/checkpointing_expert.rst new file mode 120000 index 0000000000000..450dea5431529 --- /dev/null +++ b/docs/source-lit/common/checkpointing_expert.rst @@ 
-0,0 +1 @@ +../../source-pytorch/common/checkpointing_expert.rst \ No newline at end of file diff --git a/docs/source-lit/common/checkpointing_intermediate.rst b/docs/source-lit/common/checkpointing_intermediate.rst new file mode 120000 index 0000000000000..370fe06144e66 --- /dev/null +++ b/docs/source-lit/common/checkpointing_intermediate.rst @@ -0,0 +1 @@ +../../source-pytorch/common/checkpointing_intermediate.rst \ No newline at end of file diff --git a/docs/source-lit/common/child_modules.rst b/docs/source-lit/common/child_modules.rst new file mode 120000 index 0000000000000..c6486184a2440 --- /dev/null +++ b/docs/source-lit/common/child_modules.rst @@ -0,0 +1 @@ +../../source-pytorch/common/child_modules.rst \ No newline at end of file diff --git a/docs/source-lit/common/console_logs.rst b/docs/source-lit/common/console_logs.rst new file mode 120000 index 0000000000000..41f235a6709c5 --- /dev/null +++ b/docs/source-lit/common/console_logs.rst @@ -0,0 +1 @@ +../../source-pytorch/common/console_logs.rst \ No newline at end of file diff --git a/docs/source-lit/common/early_stopping.rst b/docs/source-lit/common/early_stopping.rst new file mode 120000 index 0000000000000..081e619b2f0f8 --- /dev/null +++ b/docs/source-lit/common/early_stopping.rst @@ -0,0 +1 @@ +../../source-pytorch/common/early_stopping.rst \ No newline at end of file diff --git a/docs/source-lit/common/evaluation.rst b/docs/source-lit/common/evaluation.rst new file mode 120000 index 0000000000000..f6966056e1969 --- /dev/null +++ b/docs/source-lit/common/evaluation.rst @@ -0,0 +1 @@ +../../source-pytorch/common/evaluation.rst \ No newline at end of file diff --git a/docs/source-lit/common/evaluation_basic.rst b/docs/source-lit/common/evaluation_basic.rst new file mode 120000 index 0000000000000..aa2cda92d238a --- /dev/null +++ b/docs/source-lit/common/evaluation_basic.rst @@ -0,0 +1 @@ +../../source-pytorch/common/evaluation_basic.rst \ No newline at end of file diff --git a/docs/source-lit/common/evaluation_intermediate.rst b/docs/source-lit/common/evaluation_intermediate.rst new file mode 120000 index 0000000000000..9f88cb55c5c12 --- /dev/null +++ b/docs/source-lit/common/evaluation_intermediate.rst @@ -0,0 +1 @@ +../../source-pytorch/common/evaluation_intermediate.rst \ No newline at end of file diff --git a/docs/source-lit/common/gradient_accumulation.rst b/docs/source-lit/common/gradient_accumulation.rst new file mode 120000 index 0000000000000..1e4cb1011a914 --- /dev/null +++ b/docs/source-lit/common/gradient_accumulation.rst @@ -0,0 +1 @@ +../../source-pytorch/common/gradient_accumulation.rst \ No newline at end of file diff --git a/docs/source-lit/common/hyperparameters.rst b/docs/source-lit/common/hyperparameters.rst new file mode 120000 index 0000000000000..2aee692eea31d --- /dev/null +++ b/docs/source-lit/common/hyperparameters.rst @@ -0,0 +1 @@ +../../source-pytorch/common/hyperparameters.rst \ No newline at end of file diff --git a/docs/source-lit/common/lightning_module.rst b/docs/source-lit/common/lightning_module.rst new file mode 120000 index 0000000000000..7b3809f69fba0 --- /dev/null +++ b/docs/source-lit/common/lightning_module.rst @@ -0,0 +1 @@ +../../source-pytorch/common/lightning_module.rst \ No newline at end of file diff --git a/docs/source-lit/common/optimization.rst b/docs/source-lit/common/optimization.rst new file mode 120000 index 0000000000000..fa5f346f27ff7 --- /dev/null +++ b/docs/source-lit/common/optimization.rst @@ -0,0 +1 @@ +../../source-pytorch/common/optimization.rst \ No newline at 
end of file diff --git a/docs/source-lit/common/precision.rst b/docs/source-lit/common/precision.rst new file mode 120000 index 0000000000000..3c708e95b2c85 --- /dev/null +++ b/docs/source-lit/common/precision.rst @@ -0,0 +1 @@ +../../source-pytorch/common/precision.rst \ No newline at end of file diff --git a/docs/source-lit/common/precision_basic.rst b/docs/source-lit/common/precision_basic.rst new file mode 120000 index 0000000000000..6c1da30189b7f --- /dev/null +++ b/docs/source-lit/common/precision_basic.rst @@ -0,0 +1 @@ +../../source-pytorch/common/precision_basic.rst \ No newline at end of file diff --git a/docs/source-lit/common/precision_expert.rst b/docs/source-lit/common/precision_expert.rst new file mode 120000 index 0000000000000..dbf59b407ea82 --- /dev/null +++ b/docs/source-lit/common/precision_expert.rst @@ -0,0 +1 @@ +../../source-pytorch/common/precision_expert.rst \ No newline at end of file diff --git a/docs/source-lit/common/precision_intermediate.rst b/docs/source-lit/common/precision_intermediate.rst new file mode 120000 index 0000000000000..da60d92799f42 --- /dev/null +++ b/docs/source-lit/common/precision_intermediate.rst @@ -0,0 +1 @@ +../../source-pytorch/common/precision_intermediate.rst \ No newline at end of file diff --git a/docs/source-lit/common/progress_bar.rst b/docs/source-lit/common/progress_bar.rst new file mode 120000 index 0000000000000..5d8cd86a638ae --- /dev/null +++ b/docs/source-lit/common/progress_bar.rst @@ -0,0 +1 @@ +../../source-pytorch/common/progress_bar.rst \ No newline at end of file diff --git a/docs/source-lit/common/remote_fs.rst b/docs/source-lit/common/remote_fs.rst new file mode 120000 index 0000000000000..1b8613eae9750 --- /dev/null +++ b/docs/source-lit/common/remote_fs.rst @@ -0,0 +1 @@ +../../source-pytorch/common/remote_fs.rst \ No newline at end of file diff --git a/docs/source-lit/common/trainer.rst b/docs/source-lit/common/trainer.rst new file mode 120000 index 0000000000000..598cdab32df63 --- /dev/null +++ b/docs/source-lit/common/trainer.rst @@ -0,0 +1 @@ +../../source-pytorch/common/trainer.rst \ No newline at end of file diff --git a/docs/source-lit/common_usecases.rst b/docs/source-lit/common_usecases.rst new file mode 120000 index 0000000000000..23a188218916f --- /dev/null +++ b/docs/source-lit/common_usecases.rst @@ -0,0 +1 @@ +../source-pytorch/common_usecases.rst \ No newline at end of file diff --git a/docs/source-lit/conf.py b/docs/source-lit/conf.py new file mode 100644 index 0000000000000..d1d48b28163e7 --- /dev/null +++ b/docs/source-lit/conf.py @@ -0,0 +1,417 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file does only contain a selection of the most common options. For a +# full list see the documentation: +# http://www.sphinx-doc.org/en/master/config + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+ +import glob +import inspect +import os +import shutil +import sys + +import pt_lightning_sphinx_theme + +import lightning + +_PATH_HERE = os.path.abspath(os.path.dirname(__file__)) +_PATH_ROOT = os.path.realpath(os.path.join(_PATH_HERE, "..", "..")) +sys.path.insert(0, os.path.abspath(_PATH_ROOT)) + +SPHINX_MOCK_REQUIREMENTS = int(os.environ.get("SPHINX_MOCK_REQUIREMENTS", True)) + +# -- Project information ----------------------------------------------------- + +# this name shall match the project name in Github as it is used for linking to code +project = "lightning" +copyright = lightning.__copyright__ +author = lightning.__author__ + +# The short X.Y version +version = lightning.__version__ +# The full version, including alpha/beta/rc tags +release = lightning.__version__ + +# Options for the linkcode extension +# ---------------------------------- +github_user = "Lightning-AI" +github_repo = project + +# -- Project documents ------------------------------------------------------- + + +# def _transform_changelog(path_in: str, path_out: str) -> None: +# with open(path_in) as fp: +# chlog_lines = fp.readlines() +# # enrich short subsub-titles to be unique +# chlog_ver = "" +# for i, ln in enumerate(chlog_lines): +# if ln.startswith("## "): +# chlog_ver = ln[2:].split("-")[0].strip() +# elif ln.startswith("### "): +# ln = ln.replace("###", f"### {chlog_ver} -") +# chlog_lines[i] = ln +# with open(path_out, "w") as fp: +# fp.writelines(chlog_lines) + + +# export the READme +# _convert_markdown(os.path.join(_PATH_ROOT, "README.md"), "readme.md") + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. + +needs_sphinx = "4.5" + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx.ext.autodoc", + # 'sphinxcontrib.mockautodoc', # raises error: directive 'automodule' is already registered ... + # 'sphinxcontrib.fulltoc', # breaks pytorch-theme with unexpected kw argument 'titles_only' + "sphinx.ext.doctest", + "sphinx.ext.intersphinx", + "sphinx.ext.todo", + "sphinx.ext.coverage", + "sphinx.ext.linkcode", + "sphinx.ext.autosummary", + "sphinx.ext.napoleon", + "sphinx.ext.imgmath", + "myst_parser", + "sphinx.ext.autosectionlabel", + "nbsphinx", + "sphinx_autodoc_typehints", + "sphinx_copybutton", + "sphinx_paramlinks", + "sphinx_togglebutton", + "sphinx.ext.githubpages", + "pt_lightning_sphinx_theme.extensions.lightning", +] +# todo: remove this after finishing fusion +suppress_warnings = ["autosectionlabel.*"] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +myst_update_mathjax = False + +# https://berkeley-stat159-f17.github.io/stat159-f17/lectures/14-sphinx..html#conf.py-(cont.) +# https://stackoverflow.com/questions/38526888/embed-ipython-notebook-in-sphinx-document +# I execute the notebooks manually in advance. If notebooks test the code, +# they should be run at build time. +nbsphinx_execute = "never" +nbsphinx_allow_errors = True +nbsphinx_requirejs_path = "" + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +# source_suffix = ['.rst', '.md', '.ipynb'] +source_suffix = { + ".rst": "restructuredtext", + ".txt": "markdown", + ".md": "markdown", + ".ipynb": "nbsphinx", +} + +# The master toctree document. 
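# Note (illustrative annotation, not applied by this patch): "index" refers to
# docs/source-lit/index.rst, the regular (non-symlink) file added further down
# in this diff; Sphinx resolves the root toctree from that document.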
+master_doc = "index" + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [ + "generated/PULL_REQUEST_TEMPLATE.md", + "**/README.md/*", + "code_samples/convert_pl_to_app/requirements.txt", +] + +os.makedirs(os.path.join(_PATH_HERE, "generated"), exist_ok=True) +# copy all documents from GH templates like contribution guide +for md in glob.glob(os.path.join(_PATH_ROOT, ".github", "*.md")): + shutil.copy(md, os.path.join(_PATH_HERE, "generated", os.path.basename(md))) + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = None + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "pt_lightning_sphinx_theme" +html_theme_path = [pt_lightning_sphinx_theme.get_html_theme_path()] + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. + +html_theme_options = { + "pytorch_project": lightning.__homepage__, + "canonical_url": lightning.__homepage__, + "collapse_navigation": False, + "display_version": True, + "logo_only": False, +} + +html_favicon = "_static/images/icon.svg" + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ["_templates", "_static"] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +# html_sidebars = {} + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = project + "-doc" + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # 'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + # 'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + # 'preamble': '', + # Latex figure (float) alignment + "figure_align": "htbp", +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, project + ".tex", project + " Documentation", author, "manual"), +] + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). 
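# Worked example (illustrative annotation, not applied by this patch): with
# master_doc = "index", project = "lightning", and author taken from
# lightning.__author__ above, the tuple below evaluates to roughly
#     ("index", "lightning", "lightning Documentation", [<author>], 1)
# i.e. a single man page placed in manual section 1.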
+man_pages = [(master_doc, project, project + " Documentation", [author], 1)] + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ( + master_doc, + project, + project + " Documentation", + author, + project, + lightning.__docs__, + "Miscellaneous", + ), +] + +# -- Options for Epub output ------------------------------------------------- + +# Bibliographic Dublin Core info. +epub_title = project + +# The unique identifier of the text. This can be a ISBN number +# or the project homepage. +# +# epub_identifier = '' + +# A unique identification for the text. +# +# epub_uid = '' + +# A list of files that should not be packed into the epub file. +epub_exclude_files = ["search.html"] + +# -- Extension configuration ------------------------------------------------- + +# -- Options for intersphinx extension --------------------------------------- + +# Example configuration for intersphinx: refer to the Python standard library. +intersphinx_mapping = { + "python": ("https://docs.python.org/3", None), + "torch": ("https://pytorch.org/docs/stable/", None), + # "numpy": ("https://docs.scipy.org/doc/numpy/", None), +} + +# -- Options for todo extension ---------------------------------------------- + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = True + + +def setup(app): + # this is for hiding doctest decoration, + # see: http://z4r.github.io/python/2011/12/02/hides-the-prompts-and-output/ + app.add_js_file("copybutton.js") + app.add_css_file("main.css") + + +# copy all notebooks to local folder +path_nbs = os.path.join(_PATH_HERE, "notebooks") +if not os.path.isdir(path_nbs): + os.mkdir(path_nbs) +for path_ipynb in glob.glob(os.path.join(_PATH_ROOT, "notebooks", "*.ipynb")): + path_ipynb2 = os.path.join(path_nbs, os.path.basename(path_ipynb)) + shutil.copy(path_ipynb, path_ipynb2) + +# copy all examples to local folder +path_examples = os.path.join(_PATH_HERE, "..", "examples") +if not os.path.isdir(path_examples): + os.mkdir(path_examples) +for path_app_example in glob.glob(os.path.join(_PATH_ROOT, "examples", "app_*")): + path_app_example2 = os.path.join(path_examples, os.path.basename(path_app_example)) + if not os.path.isdir(path_app_example2): + shutil.copytree(path_app_example, path_app_example2, dirs_exist_ok=True) + + +# Ignoring Third-party packages +# https://stackoverflow.com/questions/15889621/sphinx-how-to-exclude-imports-in-automodule +def _package_list_from_file(file): + list_pkgs = [] + with open(file) as fp: + lines = fp.readlines() + for ln in lines: + found = [ln.index(ch) for ch in list(",=<>#") if ch in ln] + pkg = ln[: min(found)] if found else ln + if pkg.rstrip(): + list_pkgs.append(pkg.rstrip()) + return list_pkgs + + +# define mapping from PyPI names to python imports +PACKAGE_MAPPING = { + "PyYAML": "yaml", +} +MOCK_PACKAGES = [] +if SPHINX_MOCK_REQUIREMENTS: + # mock also base packages when we are on RTD since we don't install them there + MOCK_PACKAGES += _package_list_from_file(os.path.join(_PATH_ROOT, "requirements.txt")) +MOCK_PACKAGES = [PACKAGE_MAPPING.get(pkg, pkg) for pkg in MOCK_PACKAGES] + +autodoc_mock_imports = MOCK_PACKAGES + + +# Resolve function +# This function is used to populate the (source) links in the API +def linkcode_resolve(domain, info): + def find_source(): + # try to find the file and 
line number, based on code from numpy: + # https://github.com/numpy/numpy/blob/master/doc/source-app/conf.py#L286 + obj = sys.modules[info["module"]] + for part in info["fullname"].split("."): + obj = getattr(obj, part) + fname = inspect.getsourcefile(obj) + # https://github.com/rtfd/readthedocs.org/issues/5735 + if any(s in fname for s in ("readthedocs", "rtfd", "checkouts")): + # /home/docs/checkouts/readthedocs.org/user_builds/pytorch_lightning/checkouts/ + # devel/pytorch_lightning/utilities/cls_experiment.py#L26-L176 + path_top = os.path.abspath(os.path.join("..", "..", "..")) + fname = os.path.relpath(fname, start=path_top) + else: + # Local build, imitate master + fname = "master/" + os.path.relpath(fname, start=os.path.abspath("..")) + source, lineno = inspect.getsourcelines(obj) + return fname, lineno, lineno + len(source) - 1 + + if domain != "py" or not info["module"]: + return None + try: + filename = "%s#L%d-L%d" % find_source() + except Exception: + filename = info["module"].replace(".", "/") + ".py" + # import subprocess + # tag = subprocess.Popen(['git', 'rev-parse', 'HEAD'], stdout=subprocess.PIPE, + # universal_newlines=True).communicate()[0][:-1] + branch = filename.split("/")[0] + # do mapping from latest tags to master + branch = {"latest": "master", "stable": "master"}.get(branch, branch) + filename = "/".join([branch] + filename.split("/")[1:]) + return f"https://github.com/{github_user}/{github_repo}/blob/{filename}" + + +autosummary_generate = True + +autodoc_member_order = "groupwise" +autoclass_content = "both" +# the options are fixed and will be soon in release, +# see https://github.com/sphinx-doc/sphinx/issues/5459 +autodoc_default_options = { + "members": None, + "methods": None, + # 'attributes': None, + "special-members": "__call__", + "exclude-members": "_abc_impl", + "show-inheritance": True, + "private-members": True, + "noindex": True, +} + +# Sphinx will add “permalinks” for each heading and description environment as paragraph signs that +# become visible when the mouse hovers over them. +# This value determines the text for the permalink; it defaults to "¶". Set it to None or the empty +# string to disable permalinks. +# https://www.sphinx-doc.org/en/master/usage/configuration.html#confval-html_permalinks +# html_add_permalinks = "¶" +# True to prefix each section label with the name of the document it is in, followed by a colon. +# For example, index:Introduction for a section called Introduction that appears in document index.rst. +# Useful for avoiding ambiguity when the same section heading appears in different documents. +# http://www.sphinx-doc.org/en/master/usage/extensions/autosectionlabel.html +autosectionlabel_prefix_document = True + +# only run doctests marked with a ".. 
doctest::" directive +doctest_test_doctest_blocks = "" +doctest_global_setup = """ +import importlib +import os +import sys +import lightning as L +from typing import Optional + +import torch +import pytorch_lightning as pl +from torch import nn +from torch.utils.data import IterableDataset, DataLoader, Dataset +from pytorch_lightning import LightningDataModule, LightningModule, Trainer, seed_everything +from pytorch_lightning.callbacks import Callback +from pytorch_lightning.cli import LightningCLI +from pytorch_lightning.utilities import ( + _APEX_AVAILABLE, + _XLA_AVAILABLE, + _TPU_AVAILABLE, + _TORCHVISION_AVAILABLE, + _TORCH_GREATER_EQUAL_1_10, + _module_available, +) +_JSONARGPARSE_AVAILABLE = _module_available("jsonargparse") +""" +coverage_skip_undoc_in_source = True diff --git a/docs/source-lit/contribute_app.rst b/docs/source-lit/contribute_app.rst new file mode 120000 index 0000000000000..e61c4aaa58a0f --- /dev/null +++ b/docs/source-lit/contribute_app.rst @@ -0,0 +1 @@ +../source-app/contribute_app.rst \ No newline at end of file diff --git a/docs/source-lit/core_api/core_api.rst b/docs/source-lit/core_api/core_api.rst new file mode 120000 index 0000000000000..a80245326d38d --- /dev/null +++ b/docs/source-lit/core_api/core_api.rst @@ -0,0 +1 @@ +../../source-app/core_api/core_api.rst \ No newline at end of file diff --git a/docs/source-lit/core_api/lightning_app/app.py b/docs/source-lit/core_api/lightning_app/app.py new file mode 120000 index 0000000000000..eebf08241d01b --- /dev/null +++ b/docs/source-lit/core_api/lightning_app/app.py @@ -0,0 +1 @@ +../../../source-app/core_api/lightning_app/app.py \ No newline at end of file diff --git a/docs/source-lit/core_api/lightning_app/communication.rst b/docs/source-lit/core_api/lightning_app/communication.rst new file mode 120000 index 0000000000000..cb10da3243517 --- /dev/null +++ b/docs/source-lit/core_api/lightning_app/communication.rst @@ -0,0 +1 @@ +../../../source-app/core_api/lightning_app/communication.rst \ No newline at end of file diff --git a/docs/source-lit/core_api/lightning_app/communication_content.rst b/docs/source-lit/core_api/lightning_app/communication_content.rst new file mode 120000 index 0000000000000..d61467ec9ea1e --- /dev/null +++ b/docs/source-lit/core_api/lightning_app/communication_content.rst @@ -0,0 +1 @@ +../../../source-app/core_api/lightning_app/communication_content.rst \ No newline at end of file diff --git a/docs/source-lit/core_api/lightning_app/dynamic_work.rst b/docs/source-lit/core_api/lightning_app/dynamic_work.rst new file mode 120000 index 0000000000000..9676e73e817e0 --- /dev/null +++ b/docs/source-lit/core_api/lightning_app/dynamic_work.rst @@ -0,0 +1 @@ +../../../source-app/core_api/lightning_app/dynamic_work.rst \ No newline at end of file diff --git a/docs/source-lit/core_api/lightning_app/dynamic_work_content.rst b/docs/source-lit/core_api/lightning_app/dynamic_work_content.rst new file mode 120000 index 0000000000000..75263d87cda83 --- /dev/null +++ b/docs/source-lit/core_api/lightning_app/dynamic_work_content.rst @@ -0,0 +1 @@ +../../../source-app/core_api/lightning_app/dynamic_work_content.rst \ No newline at end of file diff --git a/docs/source-lit/core_api/lightning_app/index.rst b/docs/source-lit/core_api/lightning_app/index.rst new file mode 120000 index 0000000000000..76a45ca6b70e8 --- /dev/null +++ b/docs/source-lit/core_api/lightning_app/index.rst @@ -0,0 +1 @@ +../../../source-app/core_api/lightning_app/index.rst \ No newline at end of file diff --git 
a/docs/source-lit/core_api/lightning_app/lightning_app.rst b/docs/source-lit/core_api/lightning_app/lightning_app.rst new file mode 120000 index 0000000000000..53dcd3b6c6989 --- /dev/null +++ b/docs/source-lit/core_api/lightning_app/lightning_app.rst @@ -0,0 +1 @@ +../../../source-app/core_api/lightning_app/lightning_app.rst \ No newline at end of file diff --git a/docs/source-lit/core_api/lightning_flow.rst b/docs/source-lit/core_api/lightning_flow.rst new file mode 120000 index 0000000000000..18f0c15b9929d --- /dev/null +++ b/docs/source-lit/core_api/lightning_flow.rst @@ -0,0 +1 @@ +../../source-app/core_api/lightning_flow.rst \ No newline at end of file diff --git a/docs/source-lit/core_api/lightning_work/compute.rst b/docs/source-lit/core_api/lightning_work/compute.rst new file mode 120000 index 0000000000000..5b16ae2cd6acb --- /dev/null +++ b/docs/source-lit/core_api/lightning_work/compute.rst @@ -0,0 +1 @@ +../../../source-app/core_api/lightning_work/compute.rst \ No newline at end of file diff --git a/docs/source-lit/core_api/lightning_work/compute_content.rst b/docs/source-lit/core_api/lightning_work/compute_content.rst new file mode 120000 index 0000000000000..f56a04893b409 --- /dev/null +++ b/docs/source-lit/core_api/lightning_work/compute_content.rst @@ -0,0 +1 @@ +../../../source-app/core_api/lightning_work/compute_content.rst \ No newline at end of file diff --git a/docs/source-lit/core_api/lightning_work/handling_app_exception.rst b/docs/source-lit/core_api/lightning_work/handling_app_exception.rst new file mode 120000 index 0000000000000..4e0a0770d0628 --- /dev/null +++ b/docs/source-lit/core_api/lightning_work/handling_app_exception.rst @@ -0,0 +1 @@ +../../../source-app/core_api/lightning_work/handling_app_exception.rst \ No newline at end of file diff --git a/docs/source-lit/core_api/lightning_work/handling_app_exception_content.rst b/docs/source-lit/core_api/lightning_work/handling_app_exception_content.rst new file mode 120000 index 0000000000000..f0354b0020182 --- /dev/null +++ b/docs/source-lit/core_api/lightning_work/handling_app_exception_content.rst @@ -0,0 +1 @@ +../../../source-app/core_api/lightning_work/handling_app_exception_content.rst \ No newline at end of file diff --git a/docs/source-lit/core_api/lightning_work/index.rst b/docs/source-lit/core_api/lightning_work/index.rst new file mode 120000 index 0000000000000..39251eb4d550c --- /dev/null +++ b/docs/source-lit/core_api/lightning_work/index.rst @@ -0,0 +1 @@ +../../../source-app/core_api/lightning_work/index.rst \ No newline at end of file diff --git a/docs/source-lit/core_api/lightning_work/lightning_work.rst b/docs/source-lit/core_api/lightning_work/lightning_work.rst new file mode 120000 index 0000000000000..c6531f8dc1d65 --- /dev/null +++ b/docs/source-lit/core_api/lightning_work/lightning_work.rst @@ -0,0 +1 @@ +../../../source-app/core_api/lightning_work/lightning_work.rst \ No newline at end of file diff --git a/docs/source-lit/core_api/lightning_work/payload.rst b/docs/source-lit/core_api/lightning_work/payload.rst new file mode 120000 index 0000000000000..664870bd60e17 --- /dev/null +++ b/docs/source-lit/core_api/lightning_work/payload.rst @@ -0,0 +1 @@ +../../../source-app/core_api/lightning_work/payload.rst \ No newline at end of file diff --git a/docs/source-lit/core_api/lightning_work/payload_content.rst b/docs/source-lit/core_api/lightning_work/payload_content.rst new file mode 120000 index 0000000000000..ce194796ac989 --- /dev/null +++ 
b/docs/source-lit/core_api/lightning_work/payload_content.rst @@ -0,0 +1 @@ +../../../source-app/core_api/lightning_work/payload_content.rst \ No newline at end of file diff --git a/docs/source-lit/core_api/lightning_work/status.rst b/docs/source-lit/core_api/lightning_work/status.rst new file mode 120000 index 0000000000000..54e2486cf54c2 --- /dev/null +++ b/docs/source-lit/core_api/lightning_work/status.rst @@ -0,0 +1 @@ +../../../source-app/core_api/lightning_work/status.rst \ No newline at end of file diff --git a/docs/source-lit/core_api/lightning_work/status_content.rst b/docs/source-lit/core_api/lightning_work/status_content.rst new file mode 120000 index 0000000000000..eda1167dabb00 --- /dev/null +++ b/docs/source-lit/core_api/lightning_work/status_content.rst @@ -0,0 +1 @@ +../../../source-app/core_api/lightning_work/status_content.rst \ No newline at end of file diff --git a/docs/source-lit/data/datamodule.rst b/docs/source-lit/data/datamodule.rst new file mode 120000 index 0000000000000..3dc0b014d2f48 --- /dev/null +++ b/docs/source-lit/data/datamodule.rst @@ -0,0 +1 @@ +../../source-pytorch/data/datamodule.rst \ No newline at end of file diff --git a/docs/source-lit/debug/debugging.rst b/docs/source-lit/debug/debugging.rst new file mode 120000 index 0000000000000..39662659d31b3 --- /dev/null +++ b/docs/source-lit/debug/debugging.rst @@ -0,0 +1 @@ +../../source-pytorch/debug/debugging.rst \ No newline at end of file diff --git a/docs/source-lit/debug/debugging_advanced.rst b/docs/source-lit/debug/debugging_advanced.rst new file mode 120000 index 0000000000000..28791645854e7 --- /dev/null +++ b/docs/source-lit/debug/debugging_advanced.rst @@ -0,0 +1 @@ +../../source-pytorch/debug/debugging_advanced.rst \ No newline at end of file diff --git a/docs/source-lit/debug/debugging_basic.rst b/docs/source-lit/debug/debugging_basic.rst new file mode 120000 index 0000000000000..8ef513e766cfb --- /dev/null +++ b/docs/source-lit/debug/debugging_basic.rst @@ -0,0 +1 @@ +../../source-pytorch/debug/debugging_basic.rst \ No newline at end of file diff --git a/docs/source-lit/debug/debugging_intermediate.rst b/docs/source-lit/debug/debugging_intermediate.rst new file mode 120000 index 0000000000000..80c80f94ebb92 --- /dev/null +++ b/docs/source-lit/debug/debugging_intermediate.rst @@ -0,0 +1 @@ +../../source-pytorch/debug/debugging_intermediate.rst \ No newline at end of file diff --git a/docs/source-lit/deploy/production.rst b/docs/source-lit/deploy/production.rst new file mode 120000 index 0000000000000..2d07481d1e29a --- /dev/null +++ b/docs/source-lit/deploy/production.rst @@ -0,0 +1 @@ +../../source-pytorch/deploy/production.rst \ No newline at end of file diff --git a/docs/source-lit/deploy/production_advanced.rst b/docs/source-lit/deploy/production_advanced.rst new file mode 120000 index 0000000000000..84c8a9ca6c145 --- /dev/null +++ b/docs/source-lit/deploy/production_advanced.rst @@ -0,0 +1 @@ +../../source-pytorch/deploy/production_advanced.rst \ No newline at end of file diff --git a/docs/source-lit/deploy/production_advanced_2.rst b/docs/source-lit/deploy/production_advanced_2.rst new file mode 120000 index 0000000000000..5fb603c470b93 --- /dev/null +++ b/docs/source-lit/deploy/production_advanced_2.rst @@ -0,0 +1 @@ +../../source-pytorch/deploy/production_advanced_2.rst \ No newline at end of file diff --git a/docs/source-lit/deploy/production_basic.rst b/docs/source-lit/deploy/production_basic.rst new file mode 120000 index 0000000000000..ed2ae0d40aa7d --- /dev/null +++ 
b/docs/source-lit/deploy/production_basic.rst @@ -0,0 +1 @@ +../../source-pytorch/deploy/production_basic.rst \ No newline at end of file diff --git a/docs/source-lit/deploy/production_intermediate.rst b/docs/source-lit/deploy/production_intermediate.rst new file mode 120000 index 0000000000000..1e88ff5ae542e --- /dev/null +++ b/docs/source-lit/deploy/production_intermediate.rst @@ -0,0 +1 @@ +../../source-pytorch/deploy/production_intermediate.rst \ No newline at end of file diff --git a/docs/source-lit/ecosystem/asr_nlp_tts.rst b/docs/source-lit/ecosystem/asr_nlp_tts.rst new file mode 120000 index 0000000000000..d1ebb2c160587 --- /dev/null +++ b/docs/source-lit/ecosystem/asr_nlp_tts.rst @@ -0,0 +1 @@ +../../source-pytorch/ecosystem/asr_nlp_tts.rst \ No newline at end of file diff --git a/docs/source-lit/ecosystem/bolts.rst b/docs/source-lit/ecosystem/bolts.rst new file mode 120000 index 0000000000000..e72c4bf8ae6a5 --- /dev/null +++ b/docs/source-lit/ecosystem/bolts.rst @@ -0,0 +1 @@ +../../source-pytorch/ecosystem/bolts.rst \ No newline at end of file diff --git a/docs/source-lit/ecosystem/community_examples.rst b/docs/source-lit/ecosystem/community_examples.rst new file mode 120000 index 0000000000000..c83f00f1c15a9 --- /dev/null +++ b/docs/source-lit/ecosystem/community_examples.rst @@ -0,0 +1 @@ +../../source-pytorch/ecosystem/community_examples.rst \ No newline at end of file diff --git a/docs/source-lit/ecosystem/ecosystem-ci.rst b/docs/source-lit/ecosystem/ecosystem-ci.rst new file mode 120000 index 0000000000000..5f1fcdd91f1ad --- /dev/null +++ b/docs/source-lit/ecosystem/ecosystem-ci.rst @@ -0,0 +1 @@ +../../source-pytorch/ecosystem/ecosystem-ci.rst \ No newline at end of file diff --git a/docs/source-lit/ecosystem/flash.rst b/docs/source-lit/ecosystem/flash.rst new file mode 120000 index 0000000000000..0e8a7d7433d10 --- /dev/null +++ b/docs/source-lit/ecosystem/flash.rst @@ -0,0 +1 @@ +../../source-pytorch/ecosystem/flash.rst \ No newline at end of file diff --git a/docs/source-lit/ecosystem/metrics.rst b/docs/source-lit/ecosystem/metrics.rst new file mode 120000 index 0000000000000..b04d02241a07c --- /dev/null +++ b/docs/source-lit/ecosystem/metrics.rst @@ -0,0 +1 @@ +../../source-pytorch/ecosystem/metrics.rst \ No newline at end of file diff --git a/docs/source-lit/ecosystem/transformers.rst b/docs/source-lit/ecosystem/transformers.rst new file mode 120000 index 0000000000000..a78f134d58e00 --- /dev/null +++ b/docs/source-lit/ecosystem/transformers.rst @@ -0,0 +1 @@ +../../source-pytorch/ecosystem/transformers.rst \ No newline at end of file diff --git a/docs/source-lit/examples/dag/dag.rst b/docs/source-lit/examples/dag/dag.rst new file mode 120000 index 0000000000000..ea72fcd6a5f40 --- /dev/null +++ b/docs/source-lit/examples/dag/dag.rst @@ -0,0 +1 @@ +../../../source-app/examples/dag/dag.rst \ No newline at end of file diff --git a/docs/source-lit/examples/dag/dag_from_scratch.rst b/docs/source-lit/examples/dag/dag_from_scratch.rst new file mode 120000 index 0000000000000..4b3149826398d --- /dev/null +++ b/docs/source-lit/examples/dag/dag_from_scratch.rst @@ -0,0 +1 @@ +../../../source-app/examples/dag/dag_from_scratch.rst \ No newline at end of file diff --git a/docs/source-lit/examples/data_explore_app.rst b/docs/source-lit/examples/data_explore_app.rst new file mode 120000 index 0000000000000..1d9778932b40e --- /dev/null +++ b/docs/source-lit/examples/data_explore_app.rst @@ -0,0 +1 @@ +../../source-app/examples/data_explore_app.rst \ No newline at end of file diff 
--git a/docs/source-lit/examples/etl_app.rst b/docs/source-lit/examples/etl_app.rst new file mode 120000 index 0000000000000..a46c78179238d --- /dev/null +++ b/docs/source-lit/examples/etl_app.rst @@ -0,0 +1 @@ +../../source-app/examples/etl_app.rst \ No newline at end of file diff --git a/docs/source-lit/examples/file_server/app.py b/docs/source-lit/examples/file_server/app.py new file mode 120000 index 0000000000000..c8a915dddee5d --- /dev/null +++ b/docs/source-lit/examples/file_server/app.py @@ -0,0 +1 @@ +../../../source-app/examples/file_server/app.py \ No newline at end of file diff --git a/docs/source-lit/examples/file_server/file_server.rst b/docs/source-lit/examples/file_server/file_server.rst new file mode 120000 index 0000000000000..6be534d028007 --- /dev/null +++ b/docs/source-lit/examples/file_server/file_server.rst @@ -0,0 +1 @@ +../../../source-app/examples/file_server/file_server.rst \ No newline at end of file diff --git a/docs/source-lit/examples/file_server/file_server_content.rst b/docs/source-lit/examples/file_server/file_server_content.rst new file mode 120000 index 0000000000000..1e1eff2815e71 --- /dev/null +++ b/docs/source-lit/examples/file_server/file_server_content.rst @@ -0,0 +1 @@ +../../../source-app/examples/file_server/file_server_content.rst \ No newline at end of file diff --git a/docs/source-lit/examples/file_server/file_server_step_1.rst b/docs/source-lit/examples/file_server/file_server_step_1.rst new file mode 120000 index 0000000000000..87ae8cdfe0c70 --- /dev/null +++ b/docs/source-lit/examples/file_server/file_server_step_1.rst @@ -0,0 +1 @@ +../../../source-app/examples/file_server/file_server_step_1.rst \ No newline at end of file diff --git a/docs/source-lit/examples/file_server/file_server_step_2.rst b/docs/source-lit/examples/file_server/file_server_step_2.rst new file mode 120000 index 0000000000000..302b100b6ebd7 --- /dev/null +++ b/docs/source-lit/examples/file_server/file_server_step_2.rst @@ -0,0 +1 @@ +../../../source-app/examples/file_server/file_server_step_2.rst \ No newline at end of file diff --git a/docs/source-lit/examples/file_server/file_server_step_3.rst b/docs/source-lit/examples/file_server/file_server_step_3.rst new file mode 120000 index 0000000000000..2ff9e518f29a1 --- /dev/null +++ b/docs/source-lit/examples/file_server/file_server_step_3.rst @@ -0,0 +1 @@ +../../../source-app/examples/file_server/file_server_step_3.rst \ No newline at end of file diff --git a/docs/source-lit/examples/file_server/file_server_step_4.rst b/docs/source-lit/examples/file_server/file_server_step_4.rst new file mode 120000 index 0000000000000..bf3413a543f29 --- /dev/null +++ b/docs/source-lit/examples/file_server/file_server_step_4.rst @@ -0,0 +1 @@ +../../../source-app/examples/file_server/file_server_step_4.rst \ No newline at end of file diff --git a/docs/source-lit/examples/github_repo_runner/app.py b/docs/source-lit/examples/github_repo_runner/app.py new file mode 120000 index 0000000000000..cf69d8612f2c7 --- /dev/null +++ b/docs/source-lit/examples/github_repo_runner/app.py @@ -0,0 +1 @@ +../../../source-app/examples/github_repo_runner/app.py \ No newline at end of file diff --git a/docs/source-lit/examples/github_repo_runner/github_repo_runner.rst b/docs/source-lit/examples/github_repo_runner/github_repo_runner.rst new file mode 120000 index 0000000000000..6af6a63533eb4 --- /dev/null +++ b/docs/source-lit/examples/github_repo_runner/github_repo_runner.rst @@ -0,0 +1 @@ 
+../../../source-app/examples/github_repo_runner/github_repo_runner.rst \ No newline at end of file diff --git a/docs/source-lit/examples/github_repo_runner/github_repo_runner_content.rst b/docs/source-lit/examples/github_repo_runner/github_repo_runner_content.rst new file mode 120000 index 0000000000000..895ab8a7e192b --- /dev/null +++ b/docs/source-lit/examples/github_repo_runner/github_repo_runner_content.rst @@ -0,0 +1 @@ +../../../source-app/examples/github_repo_runner/github_repo_runner_content.rst \ No newline at end of file diff --git a/docs/source-lit/examples/github_repo_runner/github_repo_runner_step_1.rst b/docs/source-lit/examples/github_repo_runner/github_repo_runner_step_1.rst new file mode 120000 index 0000000000000..93eaee80ceaee --- /dev/null +++ b/docs/source-lit/examples/github_repo_runner/github_repo_runner_step_1.rst @@ -0,0 +1 @@ +../../../source-app/examples/github_repo_runner/github_repo_runner_step_1.rst \ No newline at end of file diff --git a/docs/source-lit/examples/github_repo_runner/github_repo_runner_step_2.rst b/docs/source-lit/examples/github_repo_runner/github_repo_runner_step_2.rst new file mode 120000 index 0000000000000..4b90ae59a0fae --- /dev/null +++ b/docs/source-lit/examples/github_repo_runner/github_repo_runner_step_2.rst @@ -0,0 +1 @@ +../../../source-app/examples/github_repo_runner/github_repo_runner_step_2.rst \ No newline at end of file diff --git a/docs/source-lit/examples/github_repo_runner/github_repo_runner_step_3.rst b/docs/source-lit/examples/github_repo_runner/github_repo_runner_step_3.rst new file mode 120000 index 0000000000000..fda9815a4c09f --- /dev/null +++ b/docs/source-lit/examples/github_repo_runner/github_repo_runner_step_3.rst @@ -0,0 +1 @@ +../../../source-app/examples/github_repo_runner/github_repo_runner_step_3.rst \ No newline at end of file diff --git a/docs/source-lit/examples/github_repo_runner/github_repo_runner_step_4.rst b/docs/source-lit/examples/github_repo_runner/github_repo_runner_step_4.rst new file mode 120000 index 0000000000000..27e650f496504 --- /dev/null +++ b/docs/source-lit/examples/github_repo_runner/github_repo_runner_step_4.rst @@ -0,0 +1 @@ +../../../source-app/examples/github_repo_runner/github_repo_runner_step_4.rst \ No newline at end of file diff --git a/docs/source-lit/examples/github_repo_runner/github_repo_runner_step_5.rst b/docs/source-lit/examples/github_repo_runner/github_repo_runner_step_5.rst new file mode 120000 index 0000000000000..6397318bce487 --- /dev/null +++ b/docs/source-lit/examples/github_repo_runner/github_repo_runner_step_5.rst @@ -0,0 +1 @@ +../../../source-app/examples/github_repo_runner/github_repo_runner_step_5.rst \ No newline at end of file diff --git a/docs/source-lit/examples/hands_on_example.rst b/docs/source-lit/examples/hands_on_example.rst new file mode 120000 index 0000000000000..1063a42672ed7 --- /dev/null +++ b/docs/source-lit/examples/hands_on_example.rst @@ -0,0 +1 @@ +../../source-app/examples/hands_on_example.rst \ No newline at end of file diff --git a/docs/source-lit/examples/hpo/build_from_scratch.rst b/docs/source-lit/examples/hpo/build_from_scratch.rst new file mode 120000 index 0000000000000..a743030e8643f --- /dev/null +++ b/docs/source-lit/examples/hpo/build_from_scratch.rst @@ -0,0 +1 @@ +../../../source-app/examples/hpo/build_from_scratch.rst \ No newline at end of file diff --git a/docs/source-lit/examples/hpo/hpo.py b/docs/source-lit/examples/hpo/hpo.py new file mode 120000 index 0000000000000..a26ef671f9de5 --- /dev/null +++ 
b/docs/source-lit/examples/hpo/hpo.py @@ -0,0 +1 @@ +../../../source-app/examples/hpo/hpo.py \ No newline at end of file diff --git a/docs/source-lit/examples/hpo/hpo.rst b/docs/source-lit/examples/hpo/hpo.rst new file mode 120000 index 0000000000000..e5808b20b8ec5 --- /dev/null +++ b/docs/source-lit/examples/hpo/hpo.rst @@ -0,0 +1 @@ +../../../source-app/examples/hpo/hpo.rst \ No newline at end of file diff --git a/docs/source-lit/examples/hpo/hpo_wi.rst b/docs/source-lit/examples/hpo/hpo_wi.rst new file mode 120000 index 0000000000000..1fcf07e6d3d75 --- /dev/null +++ b/docs/source-lit/examples/hpo/hpo_wi.rst @@ -0,0 +1 @@ +../../../source-app/examples/hpo/hpo_wi.rst \ No newline at end of file diff --git a/docs/source-lit/examples/hpo/hpo_wo.rst b/docs/source-lit/examples/hpo/hpo_wo.rst new file mode 120000 index 0000000000000..c0b562258aebb --- /dev/null +++ b/docs/source-lit/examples/hpo/hpo_wo.rst @@ -0,0 +1 @@ +../../../source-app/examples/hpo/hpo_wo.rst \ No newline at end of file diff --git a/docs/source-lit/examples/hpo/lightning_hpo.rst b/docs/source-lit/examples/hpo/lightning_hpo.rst new file mode 120000 index 0000000000000..c6f5cdaa7d93e --- /dev/null +++ b/docs/source-lit/examples/hpo/lightning_hpo.rst @@ -0,0 +1 @@ +../../../source-app/examples/hpo/lightning_hpo.rst \ No newline at end of file diff --git a/docs/source-lit/examples/hpo/lightning_hpo_target.py b/docs/source-lit/examples/hpo/lightning_hpo_target.py new file mode 120000 index 0000000000000..8a492d411a559 --- /dev/null +++ b/docs/source-lit/examples/hpo/lightning_hpo_target.py @@ -0,0 +1 @@ +../../../source-app/examples/hpo/lightning_hpo_target.py \ No newline at end of file diff --git a/docs/source-lit/examples/hpo/objective.py b/docs/source-lit/examples/hpo/objective.py new file mode 120000 index 0000000000000..9023edf5d254f --- /dev/null +++ b/docs/source-lit/examples/hpo/objective.py @@ -0,0 +1 @@ +../../../source-app/examples/hpo/objective.py \ No newline at end of file diff --git a/docs/source-lit/examples/hpo/optuna_reference.py b/docs/source-lit/examples/hpo/optuna_reference.py new file mode 120000 index 0000000000000..70a01efee4454 --- /dev/null +++ b/docs/source-lit/examples/hpo/optuna_reference.py @@ -0,0 +1 @@ +../../../source-app/examples/hpo/optuna_reference.py \ No newline at end of file diff --git a/docs/source-lit/examples/model_server_app/app.py b/docs/source-lit/examples/model_server_app/app.py new file mode 120000 index 0000000000000..84e7cdcc0e088 --- /dev/null +++ b/docs/source-lit/examples/model_server_app/app.py @@ -0,0 +1 @@ +../../../source-app/examples/model_server_app/app.py \ No newline at end of file diff --git a/docs/source-lit/examples/model_server_app/load_testing.rst b/docs/source-lit/examples/model_server_app/load_testing.rst new file mode 120000 index 0000000000000..1e4e9aaeeabd2 --- /dev/null +++ b/docs/source-lit/examples/model_server_app/load_testing.rst @@ -0,0 +1 @@ +../../../source-app/examples/model_server_app/load_testing.rst \ No newline at end of file diff --git a/docs/source-lit/examples/model_server_app/locust_component.py b/docs/source-lit/examples/model_server_app/locust_component.py new file mode 120000 index 0000000000000..ad39d764d75ac --- /dev/null +++ b/docs/source-lit/examples/model_server_app/locust_component.py @@ -0,0 +1 @@ +../../../source-app/examples/model_server_app/locust_component.py \ No newline at end of file diff --git a/docs/source-lit/examples/model_server_app/locustfile.py b/docs/source-lit/examples/model_server_app/locustfile.py new file mode 
120000 index 0000000000000..5e04becc68ecd --- /dev/null +++ b/docs/source-lit/examples/model_server_app/locustfile.py @@ -0,0 +1 @@ +../../../source-app/examples/model_server_app/locustfile.py \ No newline at end of file diff --git a/docs/source-lit/examples/model_server_app/model_server.py b/docs/source-lit/examples/model_server_app/model_server.py new file mode 120000 index 0000000000000..e2ec8e92d21ea --- /dev/null +++ b/docs/source-lit/examples/model_server_app/model_server.py @@ -0,0 +1 @@ +../../../source-app/examples/model_server_app/model_server.py \ No newline at end of file diff --git a/docs/source-lit/examples/model_server_app/model_server.rst b/docs/source-lit/examples/model_server_app/model_server.rst new file mode 120000 index 0000000000000..a58a2a4f73c46 --- /dev/null +++ b/docs/source-lit/examples/model_server_app/model_server.rst @@ -0,0 +1 @@ +../../../source-app/examples/model_server_app/model_server.rst \ No newline at end of file diff --git a/docs/source-lit/examples/model_server_app/model_server_app.rst b/docs/source-lit/examples/model_server_app/model_server_app.rst new file mode 120000 index 0000000000000..939588eac8b29 --- /dev/null +++ b/docs/source-lit/examples/model_server_app/model_server_app.rst @@ -0,0 +1 @@ +../../../source-app/examples/model_server_app/model_server_app.rst \ No newline at end of file diff --git a/docs/source-lit/examples/model_server_app/model_server_app_content.rst b/docs/source-lit/examples/model_server_app/model_server_app_content.rst new file mode 120000 index 0000000000000..a797c3e709fa0 --- /dev/null +++ b/docs/source-lit/examples/model_server_app/model_server_app_content.rst @@ -0,0 +1 @@ +../../../source-app/examples/model_server_app/model_server_app_content.rst \ No newline at end of file diff --git a/docs/source-lit/examples/model_server_app/putting_everything_together.rst b/docs/source-lit/examples/model_server_app/putting_everything_together.rst new file mode 120000 index 0000000000000..26e8628aad8cd --- /dev/null +++ b/docs/source-lit/examples/model_server_app/putting_everything_together.rst @@ -0,0 +1 @@ +../../../source-app/examples/model_server_app/putting_everything_together.rst \ No newline at end of file diff --git a/docs/source-lit/examples/model_server_app/train.py b/docs/source-lit/examples/model_server_app/train.py new file mode 120000 index 0000000000000..fedacd40770d4 --- /dev/null +++ b/docs/source-lit/examples/model_server_app/train.py @@ -0,0 +1 @@ +../../../source-app/examples/model_server_app/train.py \ No newline at end of file diff --git a/docs/source-lit/examples/model_server_app/train.rst b/docs/source-lit/examples/model_server_app/train.rst new file mode 120000 index 0000000000000..beeb056d0f54d --- /dev/null +++ b/docs/source-lit/examples/model_server_app/train.rst @@ -0,0 +1 @@ +../../../source-app/examples/model_server_app/train.rst \ No newline at end of file diff --git a/docs/source-lit/examples/research_demo_app.rst b/docs/source-lit/examples/research_demo_app.rst new file mode 120000 index 0000000000000..c268152450921 --- /dev/null +++ b/docs/source-lit/examples/research_demo_app.rst @@ -0,0 +1 @@ +../../source-app/examples/research_demo_app.rst \ No newline at end of file diff --git a/docs/source-lit/expertise_levels.rst b/docs/source-lit/expertise_levels.rst new file mode 120000 index 0000000000000..6106c0f649c31 --- /dev/null +++ b/docs/source-lit/expertise_levels.rst @@ -0,0 +1 @@ +../source-pytorch/expertise_levels.rst \ No newline at end of file diff --git 
a/docs/source-lit/extensions/accelerator.rst b/docs/source-lit/extensions/accelerator.rst new file mode 120000 index 0000000000000..8bdbeb605c05c --- /dev/null +++ b/docs/source-lit/extensions/accelerator.rst @@ -0,0 +1 @@ +../../source-pytorch/extensions/accelerator.rst \ No newline at end of file diff --git a/docs/source-lit/extensions/callbacks.rst b/docs/source-lit/extensions/callbacks.rst new file mode 120000 index 0000000000000..0da2e7359d855 --- /dev/null +++ b/docs/source-lit/extensions/callbacks.rst @@ -0,0 +1 @@ +../../source-pytorch/extensions/callbacks.rst \ No newline at end of file diff --git a/docs/source-lit/extensions/callbacks_state.rst b/docs/source-lit/extensions/callbacks_state.rst new file mode 120000 index 0000000000000..65ccb4b1b9976 --- /dev/null +++ b/docs/source-lit/extensions/callbacks_state.rst @@ -0,0 +1 @@ +../../source-pytorch/extensions/callbacks_state.rst \ No newline at end of file diff --git a/docs/source-lit/extensions/datamodules_state.rst b/docs/source-lit/extensions/datamodules_state.rst new file mode 120000 index 0000000000000..0000add0d30ec --- /dev/null +++ b/docs/source-lit/extensions/datamodules_state.rst @@ -0,0 +1 @@ +../../source-pytorch/extensions/datamodules_state.rst \ No newline at end of file diff --git a/docs/source-lit/extensions/entry_points.rst b/docs/source-lit/extensions/entry_points.rst new file mode 120000 index 0000000000000..defcc94fc8a2b --- /dev/null +++ b/docs/source-lit/extensions/entry_points.rst @@ -0,0 +1 @@ +../../source-pytorch/extensions/entry_points.rst \ No newline at end of file diff --git a/docs/source-lit/extensions/logging.rst b/docs/source-lit/extensions/logging.rst new file mode 120000 index 0000000000000..84d6847a695e7 --- /dev/null +++ b/docs/source-lit/extensions/logging.rst @@ -0,0 +1 @@ +../../source-pytorch/extensions/logging.rst \ No newline at end of file diff --git a/docs/source-lit/extensions/loops.rst b/docs/source-lit/extensions/loops.rst new file mode 120000 index 0000000000000..4578d2dc91c0e --- /dev/null +++ b/docs/source-lit/extensions/loops.rst @@ -0,0 +1 @@ +../../source-pytorch/extensions/loops.rst \ No newline at end of file diff --git a/docs/source-lit/extensions/loops_advanced.rst b/docs/source-lit/extensions/loops_advanced.rst new file mode 120000 index 0000000000000..83b9d56813734 --- /dev/null +++ b/docs/source-lit/extensions/loops_advanced.rst @@ -0,0 +1 @@ +../../source-pytorch/extensions/loops_advanced.rst \ No newline at end of file diff --git a/docs/source-lit/extensions/plugins.rst b/docs/source-lit/extensions/plugins.rst new file mode 120000 index 0000000000000..b01661c0a2c9a --- /dev/null +++ b/docs/source-lit/extensions/plugins.rst @@ -0,0 +1 @@ +../../source-pytorch/extensions/plugins.rst \ No newline at end of file diff --git a/docs/source-lit/extensions/strategy.rst b/docs/source-lit/extensions/strategy.rst new file mode 120000 index 0000000000000..a4578e3febbde --- /dev/null +++ b/docs/source-lit/extensions/strategy.rst @@ -0,0 +1 @@ +../../source-pytorch/extensions/strategy.rst \ No newline at end of file diff --git a/docs/source-lit/get_started/add_an_interactive_demo.rst b/docs/source-lit/get_started/add_an_interactive_demo.rst new file mode 120000 index 0000000000000..262af0c2c426e --- /dev/null +++ b/docs/source-lit/get_started/add_an_interactive_demo.rst @@ -0,0 +1 @@ +../../source-app/get_started/add_an_interactive_demo.rst \ No newline at end of file diff --git a/docs/source-lit/get_started/build_model.rst b/docs/source-lit/get_started/build_model.rst new file mode 
120000 index 0000000000000..562d90c729ed2 --- /dev/null +++ b/docs/source-lit/get_started/build_model.rst @@ -0,0 +1 @@ +../../source-app/get_started/build_model.rst \ No newline at end of file diff --git a/docs/source-lit/get_started/go_beyond_training.rst b/docs/source-lit/get_started/go_beyond_training.rst new file mode 120000 index 0000000000000..31915ed2cb45f --- /dev/null +++ b/docs/source-lit/get_started/go_beyond_training.rst @@ -0,0 +1 @@ +../../source-app/get_started/go_beyond_training.rst \ No newline at end of file diff --git a/docs/source-lit/get_started/go_beyond_training_content.rst b/docs/source-lit/get_started/go_beyond_training_content.rst new file mode 120000 index 0000000000000..99f5f62d96205 --- /dev/null +++ b/docs/source-lit/get_started/go_beyond_training_content.rst @@ -0,0 +1 @@ +../../source-app/get_started/go_beyond_training_content.rst \ No newline at end of file diff --git a/docs/source-lit/get_started/jumpstart_from_app_gallery.rst b/docs/source-lit/get_started/jumpstart_from_app_gallery.rst new file mode 120000 index 0000000000000..27f0d9914e091 --- /dev/null +++ b/docs/source-lit/get_started/jumpstart_from_app_gallery.rst @@ -0,0 +1 @@ +../../source-app/get_started/jumpstart_from_app_gallery.rst \ No newline at end of file diff --git a/docs/source-lit/get_started/jumpstart_from_component_gallery.rst b/docs/source-lit/get_started/jumpstart_from_component_gallery.rst new file mode 120000 index 0000000000000..510da2a649b03 --- /dev/null +++ b/docs/source-lit/get_started/jumpstart_from_component_gallery.rst @@ -0,0 +1 @@ +../../source-app/get_started/jumpstart_from_component_gallery.rst \ No newline at end of file diff --git a/docs/source-lit/get_started/lightning_apps_intro.rst b/docs/source-lit/get_started/lightning_apps_intro.rst new file mode 120000 index 0000000000000..2455c3658532b --- /dev/null +++ b/docs/source-lit/get_started/lightning_apps_intro.rst @@ -0,0 +1 @@ +../../source-app/get_started/lightning_apps_intro.rst \ No newline at end of file diff --git a/docs/source-lit/get_started/training_with_apps.rst b/docs/source-lit/get_started/training_with_apps.rst new file mode 120000 index 0000000000000..66e828517e4d4 --- /dev/null +++ b/docs/source-lit/get_started/training_with_apps.rst @@ -0,0 +1 @@ +../../source-app/get_started/training_with_apps.rst \ No newline at end of file diff --git a/docs/source-lit/get_started/what_app_can_do.rst b/docs/source-lit/get_started/what_app_can_do.rst new file mode 120000 index 0000000000000..5e5bc1642f798 --- /dev/null +++ b/docs/source-lit/get_started/what_app_can_do.rst @@ -0,0 +1 @@ +../../source-app/get_started/what_app_can_do.rst \ No newline at end of file diff --git a/docs/source-lit/glossary/app_tree.rst b/docs/source-lit/glossary/app_tree.rst new file mode 120000 index 0000000000000..65cc6ef5c8062 --- /dev/null +++ b/docs/source-lit/glossary/app_tree.rst @@ -0,0 +1 @@ +../../source-app/glossary/app_tree.rst \ No newline at end of file diff --git a/docs/source-lit/glossary/build_config/build_config.rst b/docs/source-lit/glossary/build_config/build_config.rst new file mode 120000 index 0000000000000..33c31c1ad1482 --- /dev/null +++ b/docs/source-lit/glossary/build_config/build_config.rst @@ -0,0 +1 @@ +../../../source-app/glossary/build_config/build_config.rst \ No newline at end of file diff --git a/docs/source-lit/glossary/build_config/build_config_advanced.rst b/docs/source-lit/glossary/build_config/build_config_advanced.rst new file mode 120000 index 0000000000000..d95b41e5af210 --- /dev/null +++ 
b/docs/source-lit/glossary/build_config/build_config_advanced.rst @@ -0,0 +1 @@ +../../../source-app/glossary/build_config/build_config_advanced.rst \ No newline at end of file diff --git a/docs/source-lit/glossary/build_config/build_config_basic.rst b/docs/source-lit/glossary/build_config/build_config_basic.rst new file mode 120000 index 0000000000000..d71e5ded650c7 --- /dev/null +++ b/docs/source-lit/glossary/build_config/build_config_basic.rst @@ -0,0 +1 @@ +../../../source-app/glossary/build_config/build_config_basic.rst \ No newline at end of file diff --git a/docs/source-lit/glossary/build_config/build_config_intermediate.rst b/docs/source-lit/glossary/build_config/build_config_intermediate.rst new file mode 120000 index 0000000000000..06432b96c725c --- /dev/null +++ b/docs/source-lit/glossary/build_config/build_config_intermediate.rst @@ -0,0 +1 @@ +../../../source-app/glossary/build_config/build_config_intermediate.rst \ No newline at end of file diff --git a/docs/source-lit/glossary/dag.rst b/docs/source-lit/glossary/dag.rst new file mode 120000 index 0000000000000..ba5a8719bcb87 --- /dev/null +++ b/docs/source-lit/glossary/dag.rst @@ -0,0 +1 @@ +../../source-app/glossary/dag.rst \ No newline at end of file diff --git a/docs/source-lit/glossary/debug_app.rst b/docs/source-lit/glossary/debug_app.rst new file mode 120000 index 0000000000000..946f9d59c46f2 --- /dev/null +++ b/docs/source-lit/glossary/debug_app.rst @@ -0,0 +1 @@ +../../source-app/glossary/debug_app.rst \ No newline at end of file diff --git a/docs/source-lit/glossary/distributed_fe.rst b/docs/source-lit/glossary/distributed_fe.rst new file mode 120000 index 0000000000000..2f3378fdeca9b --- /dev/null +++ b/docs/source-lit/glossary/distributed_fe.rst @@ -0,0 +1 @@ +../../source-app/glossary/distributed_fe.rst \ No newline at end of file diff --git a/docs/source-lit/glossary/distributed_hardware.rst b/docs/source-lit/glossary/distributed_hardware.rst new file mode 120000 index 0000000000000..95e32bbffa19e --- /dev/null +++ b/docs/source-lit/glossary/distributed_hardware.rst @@ -0,0 +1 @@ +../../source-app/glossary/distributed_hardware.rst \ No newline at end of file diff --git a/docs/source-lit/glossary/environment_variables.rst b/docs/source-lit/glossary/environment_variables.rst new file mode 120000 index 0000000000000..c5e8b956193de --- /dev/null +++ b/docs/source-lit/glossary/environment_variables.rst @@ -0,0 +1 @@ +../../source-app/glossary/environment_variables.rst \ No newline at end of file diff --git a/docs/source-lit/glossary/event_loop.rst b/docs/source-lit/glossary/event_loop.rst new file mode 120000 index 0000000000000..706ea9b3f7f9e --- /dev/null +++ b/docs/source-lit/glossary/event_loop.rst @@ -0,0 +1 @@ +../../source-app/glossary/event_loop.rst \ No newline at end of file diff --git a/docs/source-lit/glossary/fault_tolerance.rst b/docs/source-lit/glossary/fault_tolerance.rst new file mode 120000 index 0000000000000..095b72ed30380 --- /dev/null +++ b/docs/source-lit/glossary/fault_tolerance.rst @@ -0,0 +1 @@ +../../source-app/glossary/fault_tolerance.rst \ No newline at end of file diff --git a/docs/source-lit/glossary/index.rst b/docs/source-lit/glossary/index.rst new file mode 120000 index 0000000000000..681b9f0dc52f6 --- /dev/null +++ b/docs/source-lit/glossary/index.rst @@ -0,0 +1 @@ +../../source-app/glossary/index.rst \ No newline at end of file diff --git a/docs/source-lit/glossary/lightning_app_overview/index.rst b/docs/source-lit/glossary/lightning_app_overview/index.rst new file mode 120000 index 
0000000000000..b6f6013f0ee43 --- /dev/null +++ b/docs/source-lit/glossary/lightning_app_overview/index.rst @@ -0,0 +1 @@ +../../../source-app/glossary/lightning_app_overview/index.rst \ No newline at end of file diff --git a/docs/source-lit/glossary/scheduling.rst b/docs/source-lit/glossary/scheduling.rst new file mode 120000 index 0000000000000..c9ecc0ea7bc45 --- /dev/null +++ b/docs/source-lit/glossary/scheduling.rst @@ -0,0 +1 @@ +../../source-app/glossary/scheduling.rst \ No newline at end of file diff --git a/docs/source-lit/glossary/sharing_components.rst b/docs/source-lit/glossary/sharing_components.rst new file mode 120000 index 0000000000000..6bc83a8d56860 --- /dev/null +++ b/docs/source-lit/glossary/sharing_components.rst @@ -0,0 +1 @@ +../../source-app/glossary/sharing_components.rst \ No newline at end of file diff --git a/docs/source-lit/glossary/storage/differences.rst b/docs/source-lit/glossary/storage/differences.rst new file mode 120000 index 0000000000000..979d61a755c56 --- /dev/null +++ b/docs/source-lit/glossary/storage/differences.rst @@ -0,0 +1 @@ +../../../source-app/glossary/storage/differences.rst \ No newline at end of file diff --git a/docs/source-lit/glossary/storage/drive.rst b/docs/source-lit/glossary/storage/drive.rst new file mode 120000 index 0000000000000..85b8ed489eee1 --- /dev/null +++ b/docs/source-lit/glossary/storage/drive.rst @@ -0,0 +1 @@ +../../../source-app/glossary/storage/drive.rst \ No newline at end of file diff --git a/docs/source-lit/glossary/storage/drive_content.rst b/docs/source-lit/glossary/storage/drive_content.rst new file mode 120000 index 0000000000000..0ce5ccfffd861 --- /dev/null +++ b/docs/source-lit/glossary/storage/drive_content.rst @@ -0,0 +1 @@ +../../../source-app/glossary/storage/drive_content.rst \ No newline at end of file diff --git a/docs/source-lit/glossary/storage/path.rst b/docs/source-lit/glossary/storage/path.rst new file mode 120000 index 0000000000000..a9835d5acd223 --- /dev/null +++ b/docs/source-lit/glossary/storage/path.rst @@ -0,0 +1 @@ +../../../source-app/glossary/storage/path.rst \ No newline at end of file diff --git a/docs/source-lit/glossary/storage/storage.rst b/docs/source-lit/glossary/storage/storage.rst new file mode 120000 index 0000000000000..b67c17bcf00b4 --- /dev/null +++ b/docs/source-lit/glossary/storage/storage.rst @@ -0,0 +1 @@ +../../../source-app/glossary/storage/storage.rst \ No newline at end of file diff --git a/docs/source-lit/governance.rst b/docs/source-lit/governance.rst new file mode 120000 index 0000000000000..f84207b3e69d4 --- /dev/null +++ b/docs/source-lit/governance.rst @@ -0,0 +1 @@ +../source-pytorch/governance.rst \ No newline at end of file diff --git a/docs/source-lit/guides/data.rst b/docs/source-lit/guides/data.rst new file mode 120000 index 0000000000000..11f3f3b333652 --- /dev/null +++ b/docs/source-lit/guides/data.rst @@ -0,0 +1 @@ +../../source-pytorch/guides/data.rst \ No newline at end of file diff --git a/docs/source-lit/guides/speed.rst b/docs/source-lit/guides/speed.rst new file mode 120000 index 0000000000000..a4573b475c9f8 --- /dev/null +++ b/docs/source-lit/guides/speed.rst @@ -0,0 +1 @@ +../../source-pytorch/guides/speed.rst \ No newline at end of file diff --git a/docs/source-lit/index.rst b/docs/source-lit/index.rst new file mode 100644 index 0000000000000..d62cc72d1a836 --- /dev/null +++ b/docs/source-lit/index.rst @@ -0,0 +1,262 @@ +.. lightning documentation master file, created by + sphinx-quickstart on Sat Sep 19 16:37:02 2020. 
+ You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +############################ +Welcome to ⚡ Lightning Apps +############################ + +.. twocolumns:: + :left: + .. image:: https://pl-flash-data.s3.amazonaws.com/assets_lightning/Lightning.gif + :alt: Animation showing how to convert a standard training loop to a Lightning loop + :right: + The `open-source Lightning framework `_ gives ML Researchers and Data Scientists, the fastest & most flexible + way to iterate on ML research ideas and deliver scalable ML systems with the performance enterprises requires at the same time. + +.. join_slack:: + :align: center + :margin: 0 + +---- + +.. toctree:: + :maxdepth: 1 + :caption: Home + + self + +.. toctree:: + :maxdepth: 1 + :caption: Get Started + + installation + get_started/lightning_apps_intro + +.. toctree:: + :maxdepth: 1 + :caption: App Building Skills + + Basic + Intermediate + Advanced + +.. toctree:: + :maxdepth: 1 + :caption: Examples + + Develop a DAG + Develop a File Server + Develop a Github Repo Script Runner + Develop a HPO Sweeper + Develop a Model Server + +.. + [Docs under construction] Build a data exploring app + [Docs under construction] Build a ETL app + [Docs under construction] Build a model deployment app + [Docs under construction] Build a research demo app + +.. toctree:: + :maxdepth: 1 + :caption: How to... + + Access the App State + Add a web user interface (UI) + Add a web link + Arrange app tabs + Develop a Lightning App + Develop a Lightning Component + Cache Work run calls + Customize your cloud compute + Extend an existing app + Publish a Lightning component + Run a server within a Lightning App + Run an App on the cloud + Run Apps on your cloud account (BYOC) + Run work in parallel + Share an app + Share files between components + +.. + [Docs under construction] Add a Lightning component + [Docs under construction] Debug a distributed cloud app locally + [Docs under construction] Enable fault tolerance + [Docs under construction] Run components on different hardware + [Docs under construction] Schedule app runs + [Docs under construction] Test an app + +.. toctree:: + :maxdepth: 1 + :caption: Core API Reference + + LightningApp + LightningFlow + LightningWork + +.. toctree:: + :maxdepth: 1 + :caption: Addons API Reference + + api_reference/components + api_reference/frontend + api_reference/runners + api_reference/storage + +.. toctree:: + :maxdepth: 1 + :caption: Glossary + + App Components Tree + Build Configuration + DAG + Event Loop + Environment Variables + Frontend + Sharing Components + Scheduling + Storage + UI + +.. + [Docs under construction] Debug an app + [Docs under construction] Distributed front-ends + [Docs under construction] Distributed hardware + [Docs under construction] Fault tolerance + +.. toctree:: + :maxdepth: 1 + :name: start + :caption: Get Started + + starter/introduction + starter/installation + + +.. toctree:: + :maxdepth: 2 + :name: levels + :caption: Level Up + + levels/core_skills + levels/intermediate + levels/advanced + levels/expert + +.. toctree:: + :maxdepth: 2 + :name: pl_docs + :caption: Core API + + common/lightning_module + common/trainer + +.. toctree:: + :maxdepth: 2 + :name: api + :caption: API Reference + + api_references + +.. 
toctree:: + :maxdepth: 1 + :name: Common Workflows + :caption: Common Workflows + + Avoid overfitting + model/build_model.rst + common/hyperparameters + common/progress_bar + deploy/production + advanced/training_tricks + cli/lightning_cli + tuning/profiler + Manage experiments + Organize existing PyTorch into Lightning + clouds/cluster + Save and load model progress + Save memory with half-precision + Training over the internet + advanced/model_parallel + clouds/cloud_training + Train on single or multiple GPUs + Train on single or multiple HPUs + Train on single or multiple IPUs + Train on single or multiple TPUs + Train on MPS + Use a pretrained model + model/own_your_loop + +.. toctree:: + :maxdepth: 1 + :name: Glossary + :caption: Glossary + + Accelerators + Callback + Checkpointing + Cluster + Cloud checkpoint + Console Logging + Debugging + Early stopping + Experiment manager (Logger) + Fault tolerant training + Finetuning + Flash + Grid AI + GPU + Half precision + HPU + Inference + IPU + Lightning CLI + Lightning Lite + LightningDataModule + LightningModule + Lightning Transformers + Log + Loops + TPU + Metrics + Model + Model Parallel + Collaborative Training + Plugins + Progress bar + Production + Predict + Pretrained models + Profiler + Pruning and Quantization + Remote filesystem and FSSPEC + Strategy + Strategy registry + Style guide + Sweep + SWA + SLURM + Transfer learning + Trainer + Torch distributed + +.. toctree:: + :maxdepth: 1 + :name: Hands-on Examples + :caption: Hands-on Examples + :glob: + + PyTorch Lightning 101 class + From PyTorch to PyTorch Lightning [Blog] + From PyTorch to PyTorch Lightning [Video] + +.. toctree:: + :maxdepth: 1 + :name: Community + :caption: Community + + generated/CODE_OF_CONDUCT.md + generated/CONTRIBUTING.md + generated/BECOMING_A_CORE_CONTRIBUTOR.md + governance diff --git a/docs/source-lit/install_beginner.rst b/docs/source-lit/install_beginner.rst new file mode 120000 index 0000000000000..ed66700079593 --- /dev/null +++ b/docs/source-lit/install_beginner.rst @@ -0,0 +1 @@ +../source-app/install_beginner.rst \ No newline at end of file diff --git a/docs/source-lit/installation.rst b/docs/source-lit/installation.rst new file mode 120000 index 0000000000000..88a3a7b4a3d3e --- /dev/null +++ b/docs/source-lit/installation.rst @@ -0,0 +1 @@ +../source-app/installation.rst \ No newline at end of file diff --git a/docs/source-lit/installation_mac.rst b/docs/source-lit/installation_mac.rst new file mode 120000 index 0000000000000..39add6d255d9f --- /dev/null +++ b/docs/source-lit/installation_mac.rst @@ -0,0 +1 @@ +../source-app/installation_mac.rst \ No newline at end of file diff --git a/docs/source-lit/installation_win.rst b/docs/source-lit/installation_win.rst new file mode 120000 index 0000000000000..eafa37da7bde5 --- /dev/null +++ b/docs/source-lit/installation_win.rst @@ -0,0 +1 @@ +../source-app/installation_win.rst \ No newline at end of file diff --git a/docs/source-lit/intro.rst b/docs/source-lit/intro.rst new file mode 120000 index 0000000000000..3f5e7b6071a7f --- /dev/null +++ b/docs/source-lit/intro.rst @@ -0,0 +1 @@ +../source-app/intro.rst \ No newline at end of file diff --git a/docs/source-lit/levels/advanced.rst b/docs/source-lit/levels/advanced.rst new file mode 120000 index 0000000000000..b83c8fa5aa1d1 --- /dev/null +++ b/docs/source-lit/levels/advanced.rst @@ -0,0 +1 @@ +../../source-pytorch/levels/advanced.rst \ No newline at end of file diff --git a/docs/source-lit/levels/advanced/index.rst 
b/docs/source-lit/levels/advanced/index.rst new file mode 120000 index 0000000000000..542cc5e9b7de1 --- /dev/null +++ b/docs/source-lit/levels/advanced/index.rst @@ -0,0 +1 @@ +../../../source-app/levels/advanced/index.rst \ No newline at end of file diff --git a/docs/source-lit/levels/advanced/level_16.rst b/docs/source-lit/levels/advanced/level_16.rst new file mode 120000 index 0000000000000..f6f9cfd0a3ca6 --- /dev/null +++ b/docs/source-lit/levels/advanced/level_16.rst @@ -0,0 +1 @@ +../../../source-app/levels/advanced/level_16.rst \ No newline at end of file diff --git a/docs/source-lit/levels/advanced/level_17.rst b/docs/source-lit/levels/advanced/level_17.rst new file mode 120000 index 0000000000000..67dd7abca9456 --- /dev/null +++ b/docs/source-lit/levels/advanced/level_17.rst @@ -0,0 +1 @@ +../../../source-app/levels/advanced/level_17.rst \ No newline at end of file diff --git a/docs/source-lit/levels/advanced/level_18.rst b/docs/source-lit/levels/advanced/level_18.rst new file mode 120000 index 0000000000000..10aab334a8742 --- /dev/null +++ b/docs/source-lit/levels/advanced/level_18.rst @@ -0,0 +1 @@ +../../../source-app/levels/advanced/level_18.rst \ No newline at end of file diff --git a/docs/source-lit/levels/advanced/level_19.rst b/docs/source-lit/levels/advanced/level_19.rst new file mode 120000 index 0000000000000..1708a795bb45e --- /dev/null +++ b/docs/source-lit/levels/advanced/level_19.rst @@ -0,0 +1 @@ +../../../source-app/levels/advanced/level_19.rst \ No newline at end of file diff --git a/docs/source-lit/levels/advanced/level_20.rst b/docs/source-lit/levels/advanced/level_20.rst new file mode 120000 index 0000000000000..bad6b2fea4900 --- /dev/null +++ b/docs/source-lit/levels/advanced/level_20.rst @@ -0,0 +1 @@ +../../../source-app/levels/advanced/level_20.rst \ No newline at end of file diff --git a/docs/source-lit/levels/advanced_level_15.rst b/docs/source-lit/levels/advanced_level_15.rst new file mode 120000 index 0000000000000..a1cc8f373d0f0 --- /dev/null +++ b/docs/source-lit/levels/advanced_level_15.rst @@ -0,0 +1 @@ +../../source-pytorch/levels/advanced_level_15.rst \ No newline at end of file diff --git a/docs/source-lit/levels/advanced_level_16.rst b/docs/source-lit/levels/advanced_level_16.rst new file mode 120000 index 0000000000000..a803f05b8190a --- /dev/null +++ b/docs/source-lit/levels/advanced_level_16.rst @@ -0,0 +1 @@ +../../source-pytorch/levels/advanced_level_16.rst \ No newline at end of file diff --git a/docs/source-lit/levels/advanced_level_17.rst b/docs/source-lit/levels/advanced_level_17.rst new file mode 120000 index 0000000000000..fa542cb0a7596 --- /dev/null +++ b/docs/source-lit/levels/advanced_level_17.rst @@ -0,0 +1 @@ +../../source-pytorch/levels/advanced_level_17.rst \ No newline at end of file diff --git a/docs/source-lit/levels/advanced_level_18.rst b/docs/source-lit/levels/advanced_level_18.rst new file mode 120000 index 0000000000000..6ee1ca80fe05d --- /dev/null +++ b/docs/source-lit/levels/advanced_level_18.rst @@ -0,0 +1 @@ +../../source-pytorch/levels/advanced_level_18.rst \ No newline at end of file diff --git a/docs/source-lit/levels/advanced_level_19.rst b/docs/source-lit/levels/advanced_level_19.rst new file mode 120000 index 0000000000000..d9a616b5acf00 --- /dev/null +++ b/docs/source-lit/levels/advanced_level_19.rst @@ -0,0 +1 @@ +../../source-pytorch/levels/advanced_level_19.rst \ No newline at end of file diff --git a/docs/source-lit/levels/advanced_level_20.rst b/docs/source-lit/levels/advanced_level_20.rst new file mode 
120000 index 0000000000000..e9d6f59c144a5 --- /dev/null +++ b/docs/source-lit/levels/advanced_level_20.rst @@ -0,0 +1 @@ +../../source-pytorch/levels/advanced_level_20.rst \ No newline at end of file diff --git a/docs/source-lit/levels/advanced_level_21.rst b/docs/source-lit/levels/advanced_level_21.rst new file mode 120000 index 0000000000000..a1a615f04a14b --- /dev/null +++ b/docs/source-lit/levels/advanced_level_21.rst @@ -0,0 +1 @@ +../../source-pytorch/levels/advanced_level_21.rst \ No newline at end of file diff --git a/docs/source-lit/levels/advanced_level_22.rst b/docs/source-lit/levels/advanced_level_22.rst new file mode 120000 index 0000000000000..e1adeaa1e319f --- /dev/null +++ b/docs/source-lit/levels/advanced_level_22.rst @@ -0,0 +1 @@ +../../source-pytorch/levels/advanced_level_22.rst \ No newline at end of file diff --git a/docs/source-lit/levels/basic/index.rst b/docs/source-lit/levels/basic/index.rst new file mode 120000 index 0000000000000..d468d049ab695 --- /dev/null +++ b/docs/source-lit/levels/basic/index.rst @@ -0,0 +1 @@ +../../../source-app/levels/basic/index.rst \ No newline at end of file diff --git a/docs/source-lit/levels/basic/level_1.rst b/docs/source-lit/levels/basic/level_1.rst new file mode 120000 index 0000000000000..76202880f4a3f --- /dev/null +++ b/docs/source-lit/levels/basic/level_1.rst @@ -0,0 +1 @@ +../../../source-app/levels/basic/level_1.rst \ No newline at end of file diff --git a/docs/source-lit/levels/basic/level_2.rst b/docs/source-lit/levels/basic/level_2.rst new file mode 120000 index 0000000000000..d7d529a201dc5 --- /dev/null +++ b/docs/source-lit/levels/basic/level_2.rst @@ -0,0 +1 @@ +../../../source-app/levels/basic/level_2.rst \ No newline at end of file diff --git a/docs/source-lit/levels/basic/level_3.rst b/docs/source-lit/levels/basic/level_3.rst new file mode 120000 index 0000000000000..f77489716585d --- /dev/null +++ b/docs/source-lit/levels/basic/level_3.rst @@ -0,0 +1 @@ +../../../source-app/levels/basic/level_3.rst \ No newline at end of file diff --git a/docs/source-lit/levels/basic/level_4.rst b/docs/source-lit/levels/basic/level_4.rst new file mode 120000 index 0000000000000..982cb503ef74f --- /dev/null +++ b/docs/source-lit/levels/basic/level_4.rst @@ -0,0 +1 @@ +../../../source-app/levels/basic/level_4.rst \ No newline at end of file diff --git a/docs/source-lit/levels/basic/level_5.rst b/docs/source-lit/levels/basic/level_5.rst new file mode 120000 index 0000000000000..e9d004eaadc46 --- /dev/null +++ b/docs/source-lit/levels/basic/level_5.rst @@ -0,0 +1 @@ +../../../source-app/levels/basic/level_5.rst \ No newline at end of file diff --git a/docs/source-lit/levels/basic/level_6.rst b/docs/source-lit/levels/basic/level_6.rst new file mode 120000 index 0000000000000..143a57162fe68 --- /dev/null +++ b/docs/source-lit/levels/basic/level_6.rst @@ -0,0 +1 @@ +../../../source-app/levels/basic/level_6.rst \ No newline at end of file diff --git a/docs/source-lit/levels/basic/level_7.rst b/docs/source-lit/levels/basic/level_7.rst new file mode 120000 index 0000000000000..91d2d40bc3126 --- /dev/null +++ b/docs/source-lit/levels/basic/level_7.rst @@ -0,0 +1 @@ +../../../source-app/levels/basic/level_7.rst \ No newline at end of file diff --git a/docs/source-lit/levels/basic_level_2.rst b/docs/source-lit/levels/basic_level_2.rst new file mode 120000 index 0000000000000..977bb5015868a --- /dev/null +++ b/docs/source-lit/levels/basic_level_2.rst @@ -0,0 +1 @@ +../../source-pytorch/levels/basic_level_2.rst \ No newline at end of file diff 
--git a/docs/source-lit/levels/basic_level_5.rst b/docs/source-lit/levels/basic_level_5.rst new file mode 120000 index 0000000000000..83ddc83f4a8fe --- /dev/null +++ b/docs/source-lit/levels/basic_level_5.rst @@ -0,0 +1 @@ +../../source-pytorch/levels/basic_level_5.rst \ No newline at end of file diff --git a/docs/source-lit/levels/core_level_3.rst b/docs/source-lit/levels/core_level_3.rst new file mode 120000 index 0000000000000..73ac40904a535 --- /dev/null +++ b/docs/source-lit/levels/core_level_3.rst @@ -0,0 +1 @@ +../../source-pytorch/levels/core_level_3.rst \ No newline at end of file diff --git a/docs/source-lit/levels/core_level_6.rst b/docs/source-lit/levels/core_level_6.rst new file mode 120000 index 0000000000000..35bb52acd06b5 --- /dev/null +++ b/docs/source-lit/levels/core_level_6.rst @@ -0,0 +1 @@ +../../source-pytorch/levels/core_level_6.rst \ No newline at end of file diff --git a/docs/source-lit/levels/core_skills.rst b/docs/source-lit/levels/core_skills.rst new file mode 120000 index 0000000000000..73c5c0667135f --- /dev/null +++ b/docs/source-lit/levels/core_skills.rst @@ -0,0 +1 @@ +../../source-pytorch/levels/core_skills.rst \ No newline at end of file diff --git a/docs/source-lit/levels/expert.rst b/docs/source-lit/levels/expert.rst new file mode 120000 index 0000000000000..704a59e4bc702 --- /dev/null +++ b/docs/source-lit/levels/expert.rst @@ -0,0 +1 @@ +../../source-pytorch/levels/expert.rst \ No newline at end of file diff --git a/docs/source-lit/levels/expert_level_23.rst b/docs/source-lit/levels/expert_level_23.rst new file mode 120000 index 0000000000000..5b1db9e0a1d44 --- /dev/null +++ b/docs/source-lit/levels/expert_level_23.rst @@ -0,0 +1 @@ +../../source-pytorch/levels/expert_level_23.rst \ No newline at end of file diff --git a/docs/source-lit/levels/expert_level_24.rst b/docs/source-lit/levels/expert_level_24.rst new file mode 120000 index 0000000000000..f0013cdf8c775 --- /dev/null +++ b/docs/source-lit/levels/expert_level_24.rst @@ -0,0 +1 @@ +../../source-pytorch/levels/expert_level_24.rst \ No newline at end of file diff --git a/docs/source-lit/levels/expert_level_27.rst b/docs/source-lit/levels/expert_level_27.rst new file mode 120000 index 0000000000000..bc0624e067fcd --- /dev/null +++ b/docs/source-lit/levels/expert_level_27.rst @@ -0,0 +1 @@ +../../source-pytorch/levels/expert_level_27.rst \ No newline at end of file diff --git a/docs/source-lit/levels/intermediate.rst b/docs/source-lit/levels/intermediate.rst new file mode 120000 index 0000000000000..1eb082a022363 --- /dev/null +++ b/docs/source-lit/levels/intermediate.rst @@ -0,0 +1 @@ +../../source-pytorch/levels/intermediate.rst \ No newline at end of file diff --git a/docs/source-lit/levels/intermediate/index.rst b/docs/source-lit/levels/intermediate/index.rst new file mode 120000 index 0000000000000..beec0d00ecc67 --- /dev/null +++ b/docs/source-lit/levels/intermediate/index.rst @@ -0,0 +1 @@ +../../../source-app/levels/intermediate/index.rst \ No newline at end of file diff --git a/docs/source-lit/levels/intermediate/level_10.rst b/docs/source-lit/levels/intermediate/level_10.rst new file mode 120000 index 0000000000000..356edd6156718 --- /dev/null +++ b/docs/source-lit/levels/intermediate/level_10.rst @@ -0,0 +1 @@ +../../../source-app/levels/intermediate/level_10.rst \ No newline at end of file diff --git a/docs/source-lit/levels/intermediate/level_11.rst b/docs/source-lit/levels/intermediate/level_11.rst new file mode 120000 index 0000000000000..7849803acf6a0 --- /dev/null +++ 
b/docs/source-lit/levels/intermediate/level_11.rst @@ -0,0 +1 @@ +../../../source-app/levels/intermediate/level_11.rst \ No newline at end of file diff --git a/docs/source-lit/levels/intermediate/level_12.rst b/docs/source-lit/levels/intermediate/level_12.rst new file mode 120000 index 0000000000000..04f106d4e1034 --- /dev/null +++ b/docs/source-lit/levels/intermediate/level_12.rst @@ -0,0 +1 @@ +../../../source-app/levels/intermediate/level_12.rst \ No newline at end of file diff --git a/docs/source-lit/levels/intermediate/level_13.rst b/docs/source-lit/levels/intermediate/level_13.rst new file mode 120000 index 0000000000000..ecfd386d4317f --- /dev/null +++ b/docs/source-lit/levels/intermediate/level_13.rst @@ -0,0 +1 @@ +../../../source-app/levels/intermediate/level_13.rst \ No newline at end of file diff --git a/docs/source-lit/levels/intermediate/level_14.rst b/docs/source-lit/levels/intermediate/level_14.rst new file mode 120000 index 0000000000000..da03ce2702a37 --- /dev/null +++ b/docs/source-lit/levels/intermediate/level_14.rst @@ -0,0 +1 @@ +../../../source-app/levels/intermediate/level_14.rst \ No newline at end of file diff --git a/docs/source-lit/levels/intermediate/level_15.rst b/docs/source-lit/levels/intermediate/level_15.rst new file mode 120000 index 0000000000000..2272e2475237f --- /dev/null +++ b/docs/source-lit/levels/intermediate/level_15.rst @@ -0,0 +1 @@ +../../../source-app/levels/intermediate/level_15.rst \ No newline at end of file diff --git a/docs/source-lit/levels/intermediate/level_8.rst b/docs/source-lit/levels/intermediate/level_8.rst new file mode 120000 index 0000000000000..695c726bf5f45 --- /dev/null +++ b/docs/source-lit/levels/intermediate/level_8.rst @@ -0,0 +1 @@ +../../../source-app/levels/intermediate/level_8.rst \ No newline at end of file diff --git a/docs/source-lit/levels/intermediate/level_9.rst b/docs/source-lit/levels/intermediate/level_9.rst new file mode 120000 index 0000000000000..a6d86ce90e671 --- /dev/null +++ b/docs/source-lit/levels/intermediate/level_9.rst @@ -0,0 +1 @@ +../../../source-app/levels/intermediate/level_9.rst \ No newline at end of file diff --git a/docs/source-lit/levels/intermediate_level_10.rst b/docs/source-lit/levels/intermediate_level_10.rst new file mode 120000 index 0000000000000..fcdbbda7988d1 --- /dev/null +++ b/docs/source-lit/levels/intermediate_level_10.rst @@ -0,0 +1 @@ +../../source-pytorch/levels/intermediate_level_10.rst \ No newline at end of file diff --git a/docs/source-lit/levels/intermediate_level_11.rst b/docs/source-lit/levels/intermediate_level_11.rst new file mode 120000 index 0000000000000..aed6ec5b0e84b --- /dev/null +++ b/docs/source-lit/levels/intermediate_level_11.rst @@ -0,0 +1 @@ +../../source-pytorch/levels/intermediate_level_11.rst \ No newline at end of file diff --git a/docs/source-lit/levels/intermediate_level_12.rst b/docs/source-lit/levels/intermediate_level_12.rst new file mode 120000 index 0000000000000..ce34f21ce4afd --- /dev/null +++ b/docs/source-lit/levels/intermediate_level_12.rst @@ -0,0 +1 @@ +../../source-pytorch/levels/intermediate_level_12.rst \ No newline at end of file diff --git a/docs/source-lit/levels/intermediate_level_13.rst b/docs/source-lit/levels/intermediate_level_13.rst new file mode 120000 index 0000000000000..3f13a263b5f7f --- /dev/null +++ b/docs/source-lit/levels/intermediate_level_13.rst @@ -0,0 +1 @@ +../../source-pytorch/levels/intermediate_level_13.rst \ No newline at end of file diff --git a/docs/source-lit/levels/intermediate_level_14.rst 
b/docs/source-lit/levels/intermediate_level_14.rst new file mode 120000 index 0000000000000..0660de09b5758 --- /dev/null +++ b/docs/source-lit/levels/intermediate_level_14.rst @@ -0,0 +1 @@ +../../source-pytorch/levels/intermediate_level_14.rst \ No newline at end of file diff --git a/docs/source-lit/levels/intermediate_level_7.rst b/docs/source-lit/levels/intermediate_level_7.rst new file mode 120000 index 0000000000000..3fdd9936118fd --- /dev/null +++ b/docs/source-lit/levels/intermediate_level_7.rst @@ -0,0 +1 @@ +../../source-pytorch/levels/intermediate_level_7.rst \ No newline at end of file diff --git a/docs/source-lit/levels/intermediate_level_8.rst b/docs/source-lit/levels/intermediate_level_8.rst new file mode 120000 index 0000000000000..788aea18e17a5 --- /dev/null +++ b/docs/source-lit/levels/intermediate_level_8.rst @@ -0,0 +1 @@ +../../source-pytorch/levels/intermediate_level_8.rst \ No newline at end of file diff --git a/docs/source-lit/levels/intermediate_level_9.rst b/docs/source-lit/levels/intermediate_level_9.rst new file mode 120000 index 0000000000000..ed7fa6971a9ea --- /dev/null +++ b/docs/source-lit/levels/intermediate_level_9.rst @@ -0,0 +1 @@ +../../source-pytorch/levels/intermediate_level_9.rst \ No newline at end of file diff --git a/docs/source-lit/links.rst b/docs/source-lit/links.rst new file mode 120000 index 0000000000000..2190c5680f9de --- /dev/null +++ b/docs/source-lit/links.rst @@ -0,0 +1 @@ +../source-pytorch/links.rst \ No newline at end of file diff --git a/docs/source-lit/make.bat b/docs/source-lit/make.bat new file mode 100644 index 0000000000000..9b565142aecbf --- /dev/null +++ b/docs/source-lit/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=../build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/docs/source-lit/model/build_model.rst b/docs/source-lit/model/build_model.rst new file mode 120000 index 0000000000000..020746c5a920c --- /dev/null +++ b/docs/source-lit/model/build_model.rst @@ -0,0 +1 @@ +../../source-pytorch/model/build_model.rst \ No newline at end of file diff --git a/docs/source-lit/model/build_model_advanced.rst b/docs/source-lit/model/build_model_advanced.rst new file mode 120000 index 0000000000000..cdb3200b09d6c --- /dev/null +++ b/docs/source-lit/model/build_model_advanced.rst @@ -0,0 +1 @@ +../../source-pytorch/model/build_model_advanced.rst \ No newline at end of file diff --git a/docs/source-lit/model/build_model_expert.rst b/docs/source-lit/model/build_model_expert.rst new file mode 120000 index 0000000000000..92a95eaa2d112 --- /dev/null +++ b/docs/source-lit/model/build_model_expert.rst @@ -0,0 +1 @@ +../../source-pytorch/model/build_model_expert.rst \ No newline at end of file diff --git a/docs/source-lit/model/build_model_intermediate.rst b/docs/source-lit/model/build_model_intermediate.rst new file mode 120000 index 0000000000000..597ca49b96b8a --- /dev/null +++ b/docs/source-lit/model/build_model_intermediate.rst @@ -0,0 +1 @@ +../../source-pytorch/model/build_model_intermediate.rst \ No newline at end of file diff --git a/docs/source-lit/model/manual_optimization.rst b/docs/source-lit/model/manual_optimization.rst new file mode 120000 index 0000000000000..21b789872dba3 --- /dev/null +++ b/docs/source-lit/model/manual_optimization.rst @@ -0,0 +1 @@ +../../source-pytorch/model/manual_optimization.rst \ No newline at end of file diff --git a/docs/source-lit/model/own_your_loop.rst b/docs/source-lit/model/own_your_loop.rst new file mode 120000 index 0000000000000..ceedfc67c2d68 --- /dev/null +++ b/docs/source-lit/model/own_your_loop.rst @@ -0,0 +1 @@ +../../source-pytorch/model/own_your_loop.rst \ No newline at end of file diff --git a/docs/source-lit/model/train_model_basic.rst b/docs/source-lit/model/train_model_basic.rst new file mode 120000 index 0000000000000..d07dd9756a5e1 --- /dev/null +++ b/docs/source-lit/model/train_model_basic.rst @@ -0,0 +1 @@ +../../source-pytorch/model/train_model_basic.rst \ No newline at end of file diff --git a/docs/source-lit/moving_to_the_cloud.rst b/docs/source-lit/moving_to_the_cloud.rst new file mode 120000 index 0000000000000..3280db3e69bab --- /dev/null +++ b/docs/source-lit/moving_to_the_cloud.rst @@ -0,0 +1 @@ +../source-app/moving_to_the_cloud.rst \ No newline at end of file diff --git a/docs/source-lit/quickstart.rst b/docs/source-lit/quickstart.rst new file mode 120000 index 0000000000000..3360068706b57 --- /dev/null +++ b/docs/source-lit/quickstart.rst @@ -0,0 +1 @@ +../source-app/quickstart.rst \ No newline at end of file diff --git a/docs/source-lit/starter/converting.rst b/docs/source-lit/starter/converting.rst new file mode 120000 index 0000000000000..8c53679be3c2b --- /dev/null +++ b/docs/source-lit/starter/converting.rst @@ -0,0 +1 @@ +../../source-pytorch/starter/converting.rst \ No newline at end of file diff --git a/docs/source-lit/starter/installation.rst b/docs/source-lit/starter/installation.rst new file mode 120000 index 0000000000000..6ac949b87d42d --- /dev/null +++ b/docs/source-lit/starter/installation.rst @@ -0,0 +1 @@ 
+../../source-pytorch/starter/installation.rst \ No newline at end of file diff --git a/docs/source-lit/starter/installation_mac.rst b/docs/source-lit/starter/installation_mac.rst new file mode 120000 index 0000000000000..a8b7d661776b1 --- /dev/null +++ b/docs/source-lit/starter/installation_mac.rst @@ -0,0 +1 @@ +../../source-pytorch/starter/installation_mac.rst \ No newline at end of file diff --git a/docs/source-lit/starter/introduction.rst b/docs/source-lit/starter/introduction.rst new file mode 120000 index 0000000000000..2464f783208b3 --- /dev/null +++ b/docs/source-lit/starter/introduction.rst @@ -0,0 +1 @@ +../../source-pytorch/starter/introduction.rst \ No newline at end of file diff --git a/docs/source-lit/starter/lightning_lite.rst b/docs/source-lit/starter/lightning_lite.rst new file mode 120000 index 0000000000000..382396dd915c3 --- /dev/null +++ b/docs/source-lit/starter/lightning_lite.rst @@ -0,0 +1 @@ +../../source-pytorch/starter/lightning_lite.rst \ No newline at end of file diff --git a/docs/source-lit/starter/style_guide.rst b/docs/source-lit/starter/style_guide.rst new file mode 120000 index 0000000000000..e0d531ef96c02 --- /dev/null +++ b/docs/source-lit/starter/style_guide.rst @@ -0,0 +1 @@ +../../source-pytorch/starter/style_guide.rst \ No newline at end of file diff --git a/docs/source-lit/strategies/hivemind.rst b/docs/source-lit/strategies/hivemind.rst new file mode 120000 index 0000000000000..946d1075aa1ee --- /dev/null +++ b/docs/source-lit/strategies/hivemind.rst @@ -0,0 +1 @@ +../../source-pytorch/strategies/hivemind.rst \ No newline at end of file diff --git a/docs/source-lit/strategies/hivemind_basic.rst b/docs/source-lit/strategies/hivemind_basic.rst new file mode 120000 index 0000000000000..54c9177211be0 --- /dev/null +++ b/docs/source-lit/strategies/hivemind_basic.rst @@ -0,0 +1 @@ +../../source-pytorch/strategies/hivemind_basic.rst \ No newline at end of file diff --git a/docs/source-lit/strategies/hivemind_expert.rst b/docs/source-lit/strategies/hivemind_expert.rst new file mode 120000 index 0000000000000..89eb74de541a8 --- /dev/null +++ b/docs/source-lit/strategies/hivemind_expert.rst @@ -0,0 +1 @@ +../../source-pytorch/strategies/hivemind_expert.rst \ No newline at end of file diff --git a/docs/source-lit/strategies/hivemind_intermediate.rst b/docs/source-lit/strategies/hivemind_intermediate.rst new file mode 120000 index 0000000000000..29045adfae55d --- /dev/null +++ b/docs/source-lit/strategies/hivemind_intermediate.rst @@ -0,0 +1 @@ +../../source-pytorch/strategies/hivemind_intermediate.rst \ No newline at end of file diff --git a/docs/source-lit/testing.rst b/docs/source-lit/testing.rst new file mode 120000 index 0000000000000..9aa53a233d2b3 --- /dev/null +++ b/docs/source-lit/testing.rst @@ -0,0 +1 @@ +../source-app/testing.rst \ No newline at end of file diff --git a/docs/source-lit/tuning/profiler.rst b/docs/source-lit/tuning/profiler.rst new file mode 120000 index 0000000000000..f2c584093b951 --- /dev/null +++ b/docs/source-lit/tuning/profiler.rst @@ -0,0 +1 @@ +../../source-pytorch/tuning/profiler.rst \ No newline at end of file diff --git a/docs/source-lit/tuning/profiler_advanced.rst b/docs/source-lit/tuning/profiler_advanced.rst new file mode 120000 index 0000000000000..3039766ef2cef --- /dev/null +++ b/docs/source-lit/tuning/profiler_advanced.rst @@ -0,0 +1 @@ +../../source-pytorch/tuning/profiler_advanced.rst \ No newline at end of file diff --git a/docs/source-lit/tuning/profiler_basic.rst b/docs/source-lit/tuning/profiler_basic.rst 
new file mode 120000 index 0000000000000..bf7547cfe2450 --- /dev/null +++ b/docs/source-lit/tuning/profiler_basic.rst @@ -0,0 +1 @@ +../../source-pytorch/tuning/profiler_basic.rst \ No newline at end of file diff --git a/docs/source-lit/tuning/profiler_expert.rst b/docs/source-lit/tuning/profiler_expert.rst new file mode 120000 index 0000000000000..08f95453816c6 --- /dev/null +++ b/docs/source-lit/tuning/profiler_expert.rst @@ -0,0 +1 @@ +../../source-pytorch/tuning/profiler_expert.rst \ No newline at end of file diff --git a/docs/source-lit/tuning/profiler_intermediate.rst b/docs/source-lit/tuning/profiler_intermediate.rst new file mode 120000 index 0000000000000..55df6d239374a --- /dev/null +++ b/docs/source-lit/tuning/profiler_intermediate.rst @@ -0,0 +1 @@ +../../source-pytorch/tuning/profiler_intermediate.rst \ No newline at end of file diff --git a/docs/source-lit/ui_and_frontends.rst b/docs/source-lit/ui_and_frontends.rst new file mode 120000 index 0000000000000..0bec96e856f21 --- /dev/null +++ b/docs/source-lit/ui_and_frontends.rst @@ -0,0 +1 @@ +../source-app/ui_and_frontends.rst \ No newline at end of file diff --git a/docs/source-lit/visualize/experiment_managers.rst b/docs/source-lit/visualize/experiment_managers.rst new file mode 120000 index 0000000000000..32be6b12ae075 --- /dev/null +++ b/docs/source-lit/visualize/experiment_managers.rst @@ -0,0 +1 @@ +../../source-pytorch/visualize/experiment_managers.rst \ No newline at end of file diff --git a/docs/source-lit/visualize/loggers.rst b/docs/source-lit/visualize/loggers.rst new file mode 120000 index 0000000000000..3ba2d049a5d8a --- /dev/null +++ b/docs/source-lit/visualize/loggers.rst @@ -0,0 +1 @@ +../../source-pytorch/visualize/loggers.rst \ No newline at end of file diff --git a/docs/source-lit/visualize/logging_advanced.rst b/docs/source-lit/visualize/logging_advanced.rst new file mode 120000 index 0000000000000..5859c6cb403df --- /dev/null +++ b/docs/source-lit/visualize/logging_advanced.rst @@ -0,0 +1 @@ +../../source-pytorch/visualize/logging_advanced.rst \ No newline at end of file diff --git a/docs/source-lit/visualize/logging_basic.rst b/docs/source-lit/visualize/logging_basic.rst new file mode 120000 index 0000000000000..03cd768b384e7 --- /dev/null +++ b/docs/source-lit/visualize/logging_basic.rst @@ -0,0 +1 @@ +../../source-pytorch/visualize/logging_basic.rst \ No newline at end of file diff --git a/docs/source-lit/visualize/logging_expert.rst b/docs/source-lit/visualize/logging_expert.rst new file mode 120000 index 0000000000000..754fdd23914fc --- /dev/null +++ b/docs/source-lit/visualize/logging_expert.rst @@ -0,0 +1 @@ +../../source-pytorch/visualize/logging_expert.rst \ No newline at end of file diff --git a/docs/source-lit/visualize/logging_intermediate.rst b/docs/source-lit/visualize/logging_intermediate.rst new file mode 120000 index 0000000000000..8629befaf2948 --- /dev/null +++ b/docs/source-lit/visualize/logging_intermediate.rst @@ -0,0 +1 @@ +../../source-pytorch/visualize/logging_intermediate.rst \ No newline at end of file diff --git a/docs/source-lit/visualize/supported_exp_managers.rst b/docs/source-lit/visualize/supported_exp_managers.rst new file mode 120000 index 0000000000000..fe9b04c094cd1 --- /dev/null +++ b/docs/source-lit/visualize/supported_exp_managers.rst @@ -0,0 +1 @@ +../../source-pytorch/visualize/supported_exp_managers.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/access_app_state/access_app_state.rst 
b/docs/source-lit/workflows/access_app_state/access_app_state.rst new file mode 120000 index 0000000000000..9dad32f4f366e --- /dev/null +++ b/docs/source-lit/workflows/access_app_state/access_app_state.rst @@ -0,0 +1 @@ +../../../source-app/workflows/access_app_state/access_app_state.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/add_components/index.rst b/docs/source-lit/workflows/add_components/index.rst new file mode 120000 index 0000000000000..194b19e2e2abc --- /dev/null +++ b/docs/source-lit/workflows/add_components/index.rst @@ -0,0 +1 @@ +../../../source-app/workflows/add_components/index.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/add_server/any_server.rst b/docs/source-lit/workflows/add_server/any_server.rst new file mode 120000 index 0000000000000..bd0e10f00bc7f --- /dev/null +++ b/docs/source-lit/workflows/add_server/any_server.rst @@ -0,0 +1 @@ +../../../source-app/workflows/add_server/any_server.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/add_server/flask_basic.rst b/docs/source-lit/workflows/add_server/flask_basic.rst new file mode 120000 index 0000000000000..978951a5eac8b --- /dev/null +++ b/docs/source-lit/workflows/add_server/flask_basic.rst @@ -0,0 +1 @@ +../../../source-app/workflows/add_server/flask_basic.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/add_server/index.rst b/docs/source-lit/workflows/add_server/index.rst new file mode 120000 index 0000000000000..0cf8c1f4fc85d --- /dev/null +++ b/docs/source-lit/workflows/add_server/index.rst @@ -0,0 +1 @@ +../../../source-app/workflows/add_server/index.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/add_server/index_content.rst b/docs/source-lit/workflows/add_server/index_content.rst new file mode 120000 index 0000000000000..083d234f3a47a --- /dev/null +++ b/docs/source-lit/workflows/add_server/index_content.rst @@ -0,0 +1 @@ +../../../source-app/workflows/add_server/index_content.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/add_web_link.rst b/docs/source-lit/workflows/add_web_link.rst new file mode 120000 index 0000000000000..5a7240f158fc7 --- /dev/null +++ b/docs/source-lit/workflows/add_web_link.rst @@ -0,0 +1 @@ +../../source-app/workflows/add_web_link.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/add_web_ui/angular_js_intermediate.rst b/docs/source-lit/workflows/add_web_ui/angular_js_intermediate.rst new file mode 120000 index 0000000000000..c02e7914c6163 --- /dev/null +++ b/docs/source-lit/workflows/add_web_ui/angular_js_intermediate.rst @@ -0,0 +1 @@ +../../../source-app/workflows/add_web_ui/angular_js_intermediate.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/add_web_ui/dash/basic.rst b/docs/source-lit/workflows/add_web_ui/dash/basic.rst new file mode 120000 index 0000000000000..2754abb06ad3c --- /dev/null +++ b/docs/source-lit/workflows/add_web_ui/dash/basic.rst @@ -0,0 +1 @@ +../../../../source-app/workflows/add_web_ui/dash/basic.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/add_web_ui/dash/index.rst b/docs/source-lit/workflows/add_web_ui/dash/index.rst new file mode 120000 index 0000000000000..1b6bccdec8933 --- /dev/null +++ b/docs/source-lit/workflows/add_web_ui/dash/index.rst @@ -0,0 +1 @@ +../../../../source-app/workflows/add_web_ui/dash/index.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/add_web_ui/dash/intermediate.rst 
b/docs/source-lit/workflows/add_web_ui/dash/intermediate.rst new file mode 120000 index 0000000000000..74c69b4c51b72 --- /dev/null +++ b/docs/source-lit/workflows/add_web_ui/dash/intermediate.rst @@ -0,0 +1 @@ +../../../../source-app/workflows/add_web_ui/dash/intermediate.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/add_web_ui/dash/intermediate_plot.py b/docs/source-lit/workflows/add_web_ui/dash/intermediate_plot.py new file mode 120000 index 0000000000000..a9663a033a633 --- /dev/null +++ b/docs/source-lit/workflows/add_web_ui/dash/intermediate_plot.py @@ -0,0 +1 @@ +../../../../source-app/workflows/add_web_ui/dash/intermediate_plot.py \ No newline at end of file diff --git a/docs/source-lit/workflows/add_web_ui/dash/intermediate_state.py b/docs/source-lit/workflows/add_web_ui/dash/intermediate_state.py new file mode 120000 index 0000000000000..8de4adddbcd85 --- /dev/null +++ b/docs/source-lit/workflows/add_web_ui/dash/intermediate_state.py @@ -0,0 +1 @@ +../../../../source-app/workflows/add_web_ui/dash/intermediate_state.py \ No newline at end of file diff --git a/docs/source-lit/workflows/add_web_ui/example_app.rst b/docs/source-lit/workflows/add_web_ui/example_app.rst new file mode 120000 index 0000000000000..9529d6a37c22b --- /dev/null +++ b/docs/source-lit/workflows/add_web_ui/example_app.rst @@ -0,0 +1 @@ +../../../source-app/workflows/add_web_ui/example_app.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/add_web_ui/glossary_front_end.rst b/docs/source-lit/workflows/add_web_ui/glossary_front_end.rst new file mode 120000 index 0000000000000..d89a9b9edc742 --- /dev/null +++ b/docs/source-lit/workflows/add_web_ui/glossary_front_end.rst @@ -0,0 +1 @@ +../../../source-app/workflows/add_web_ui/glossary_front_end.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/add_web_ui/glossary_ui.rst b/docs/source-lit/workflows/add_web_ui/glossary_ui.rst new file mode 120000 index 0000000000000..fcbb155462fcd --- /dev/null +++ b/docs/source-lit/workflows/add_web_ui/glossary_ui.rst @@ -0,0 +1 @@ +../../../source-app/workflows/add_web_ui/glossary_ui.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/add_web_ui/gradio/basic.rst b/docs/source-lit/workflows/add_web_ui/gradio/basic.rst new file mode 120000 index 0000000000000..17dc184026e1d --- /dev/null +++ b/docs/source-lit/workflows/add_web_ui/gradio/basic.rst @@ -0,0 +1 @@ +../../../../source-app/workflows/add_web_ui/gradio/basic.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/add_web_ui/gradio/index.rst b/docs/source-lit/workflows/add_web_ui/gradio/index.rst new file mode 120000 index 0000000000000..9e02ee9e1beb1 --- /dev/null +++ b/docs/source-lit/workflows/add_web_ui/gradio/index.rst @@ -0,0 +1 @@ +../../../../source-app/workflows/add_web_ui/gradio/index.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/add_web_ui/gradio/intermediate.rst b/docs/source-lit/workflows/add_web_ui/gradio/intermediate.rst new file mode 120000 index 0000000000000..e3d797f9b6ec2 --- /dev/null +++ b/docs/source-lit/workflows/add_web_ui/gradio/intermediate.rst @@ -0,0 +1 @@ +../../../../source-app/workflows/add_web_ui/gradio/intermediate.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/add_web_ui/html/basic.rst b/docs/source-lit/workflows/add_web_ui/html/basic.rst new file mode 120000 index 0000000000000..4011ea5678fd5 --- /dev/null +++ b/docs/source-lit/workflows/add_web_ui/html/basic.rst @@ -0,0 +1 @@ 
+../../../../source-app/workflows/add_web_ui/html/basic.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/add_web_ui/html/index.rst b/docs/source-lit/workflows/add_web_ui/html/index.rst new file mode 120000 index 0000000000000..77f403f30f8b5 --- /dev/null +++ b/docs/source-lit/workflows/add_web_ui/html/index.rst @@ -0,0 +1 @@ +../../../../source-app/workflows/add_web_ui/html/index.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/add_web_ui/html/intermediate.rst b/docs/source-lit/workflows/add_web_ui/html/intermediate.rst new file mode 120000 index 0000000000000..4ae32fde35fd0 --- /dev/null +++ b/docs/source-lit/workflows/add_web_ui/html/intermediate.rst @@ -0,0 +1 @@ +../../../../source-app/workflows/add_web_ui/html/intermediate.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/add_web_ui/index.rst b/docs/source-lit/workflows/add_web_ui/index.rst new file mode 120000 index 0000000000000..fd1494e159dc9 --- /dev/null +++ b/docs/source-lit/workflows/add_web_ui/index.rst @@ -0,0 +1 @@ +../../../source-app/workflows/add_web_ui/index.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/add_web_ui/index_content.rst b/docs/source-lit/workflows/add_web_ui/index_content.rst new file mode 120000 index 0000000000000..08e2a00de9826 --- /dev/null +++ b/docs/source-lit/workflows/add_web_ui/index_content.rst @@ -0,0 +1 @@ +../../../source-app/workflows/add_web_ui/index_content.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/add_web_ui/integrate_any_javascript_framework.rst b/docs/source-lit/workflows/add_web_ui/integrate_any_javascript_framework.rst new file mode 120000 index 0000000000000..d59c56faff324 --- /dev/null +++ b/docs/source-lit/workflows/add_web_ui/integrate_any_javascript_framework.rst @@ -0,0 +1 @@ +../../../source-app/workflows/add_web_ui/integrate_any_javascript_framework.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/add_web_ui/jupyter_basic.rst b/docs/source-lit/workflows/add_web_ui/jupyter_basic.rst new file mode 120000 index 0000000000000..0d34a5a6ce041 --- /dev/null +++ b/docs/source-lit/workflows/add_web_ui/jupyter_basic.rst @@ -0,0 +1 @@ +../../../source-app/workflows/add_web_ui/jupyter_basic.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/add_web_ui/react/communicate_between_react_and_lightning.rst b/docs/source-lit/workflows/add_web_ui/react/communicate_between_react_and_lightning.rst new file mode 120000 index 0000000000000..d1b17c9f218db --- /dev/null +++ b/docs/source-lit/workflows/add_web_ui/react/communicate_between_react_and_lightning.rst @@ -0,0 +1 @@ +../../../../source-app/workflows/add_web_ui/react/communicate_between_react_and_lightning.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/add_web_ui/react/connect_react_and_lightning.rst b/docs/source-lit/workflows/add_web_ui/react/connect_react_and_lightning.rst new file mode 120000 index 0000000000000..8baba5e5a07f1 --- /dev/null +++ b/docs/source-lit/workflows/add_web_ui/react/connect_react_and_lightning.rst @@ -0,0 +1 @@ +../../../../source-app/workflows/add_web_ui/react/connect_react_and_lightning.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/add_web_ui/react/create_react_template.rst b/docs/source-lit/workflows/add_web_ui/react/create_react_template.rst new file mode 120000 index 0000000000000..838b16111a682 --- /dev/null +++ b/docs/source-lit/workflows/add_web_ui/react/create_react_template.rst @@ -0,0 +1 @@ 
+../../../../source-app/workflows/add_web_ui/react/create_react_template.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/add_web_ui/react/index.rst b/docs/source-lit/workflows/add_web_ui/react/index.rst new file mode 120000 index 0000000000000..d729777b6d184 --- /dev/null +++ b/docs/source-lit/workflows/add_web_ui/react/index.rst @@ -0,0 +1 @@ +../../../../source-app/workflows/add_web_ui/react/index.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/add_web_ui/react/react_development_workflow.rst b/docs/source-lit/workflows/add_web_ui/react/react_development_workflow.rst new file mode 120000 index 0000000000000..f55ce3d6c57e6 --- /dev/null +++ b/docs/source-lit/workflows/add_web_ui/react/react_development_workflow.rst @@ -0,0 +1 @@ +../../../../source-app/workflows/add_web_ui/react/react_development_workflow.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/add_web_ui/streamlit/basic.rst b/docs/source-lit/workflows/add_web_ui/streamlit/basic.rst new file mode 120000 index 0000000000000..c7257ab7f6212 --- /dev/null +++ b/docs/source-lit/workflows/add_web_ui/streamlit/basic.rst @@ -0,0 +1 @@ +../../../../source-app/workflows/add_web_ui/streamlit/basic.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/add_web_ui/streamlit/index.rst b/docs/source-lit/workflows/add_web_ui/streamlit/index.rst new file mode 120000 index 0000000000000..8367be3b7dced --- /dev/null +++ b/docs/source-lit/workflows/add_web_ui/streamlit/index.rst @@ -0,0 +1 @@ +../../../../source-app/workflows/add_web_ui/streamlit/index.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/add_web_ui/streamlit/intermediate.rst b/docs/source-lit/workflows/add_web_ui/streamlit/intermediate.rst new file mode 120000 index 0000000000000..0104b5a777737 --- /dev/null +++ b/docs/source-lit/workflows/add_web_ui/streamlit/intermediate.rst @@ -0,0 +1 @@ +../../../../source-app/workflows/add_web_ui/streamlit/intermediate.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/add_web_ui/vue_js_intermediate.rst b/docs/source-lit/workflows/add_web_ui/vue_js_intermediate.rst new file mode 120000 index 0000000000000..06e9270fe719f --- /dev/null +++ b/docs/source-lit/workflows/add_web_ui/vue_js_intermediate.rst @@ -0,0 +1 @@ +../../../source-app/workflows/add_web_ui/vue_js_intermediate.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/arrange_tabs/arrange_app_basic.rst b/docs/source-lit/workflows/arrange_tabs/arrange_app_basic.rst new file mode 120000 index 0000000000000..d7b7c6c4c4983 --- /dev/null +++ b/docs/source-lit/workflows/arrange_tabs/arrange_app_basic.rst @@ -0,0 +1 @@ +../../../source-app/workflows/arrange_tabs/arrange_app_basic.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/arrange_tabs/arrange_app_intermediate.rst b/docs/source-lit/workflows/arrange_tabs/arrange_app_intermediate.rst new file mode 120000 index 0000000000000..61b8311ff88e9 --- /dev/null +++ b/docs/source-lit/workflows/arrange_tabs/arrange_app_intermediate.rst @@ -0,0 +1 @@ +../../../source-app/workflows/arrange_tabs/arrange_app_intermediate.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/arrange_tabs/index.rst b/docs/source-lit/workflows/arrange_tabs/index.rst new file mode 120000 index 0000000000000..b90f092c6031b --- /dev/null +++ b/docs/source-lit/workflows/arrange_tabs/index.rst @@ -0,0 +1 @@ +../../../source-app/workflows/arrange_tabs/index.rst \ No newline at end of file diff --git 
a/docs/source-lit/workflows/arrange_tabs/index_content.rst b/docs/source-lit/workflows/arrange_tabs/index_content.rst new file mode 120000 index 0000000000000..b40e94f4edda8 --- /dev/null +++ b/docs/source-lit/workflows/arrange_tabs/index_content.rst @@ -0,0 +1 @@ +../../../source-app/workflows/arrange_tabs/index_content.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/build_lightning_app/from_pytorch_lightning_script.rst b/docs/source-lit/workflows/build_lightning_app/from_pytorch_lightning_script.rst new file mode 120000 index 0000000000000..753eb403cfc3b --- /dev/null +++ b/docs/source-lit/workflows/build_lightning_app/from_pytorch_lightning_script.rst @@ -0,0 +1 @@ +../../../source-app/workflows/build_lightning_app/from_pytorch_lightning_script.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/build_lightning_app/from_scratch.rst b/docs/source-lit/workflows/build_lightning_app/from_scratch.rst new file mode 120000 index 0000000000000..78acacc851fd9 --- /dev/null +++ b/docs/source-lit/workflows/build_lightning_app/from_scratch.rst @@ -0,0 +1 @@ +../../../source-app/workflows/build_lightning_app/from_scratch.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/build_lightning_app/from_scratch_content.rst b/docs/source-lit/workflows/build_lightning_app/from_scratch_content.rst new file mode 120000 index 0000000000000..ee07378c89324 --- /dev/null +++ b/docs/source-lit/workflows/build_lightning_app/from_scratch_content.rst @@ -0,0 +1 @@ +../../../source-app/workflows/build_lightning_app/from_scratch_content.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/build_lightning_app/index.rst b/docs/source-lit/workflows/build_lightning_app/index.rst new file mode 120000 index 0000000000000..c5472b5badd18 --- /dev/null +++ b/docs/source-lit/workflows/build_lightning_app/index.rst @@ -0,0 +1 @@ +../../../source-app/workflows/build_lightning_app/index.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/build_lightning_app/index_content.rst b/docs/source-lit/workflows/build_lightning_app/index_content.rst new file mode 120000 index 0000000000000..8062cb5e8b345 --- /dev/null +++ b/docs/source-lit/workflows/build_lightning_app/index_content.rst @@ -0,0 +1 @@ +../../../source-app/workflows/build_lightning_app/index_content.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/build_lightning_component/basic.rst b/docs/source-lit/workflows/build_lightning_component/basic.rst new file mode 120000 index 0000000000000..7783eaf87e1e8 --- /dev/null +++ b/docs/source-lit/workflows/build_lightning_component/basic.rst @@ -0,0 +1 @@ +../../../source-app/workflows/build_lightning_component/basic.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/build_lightning_component/from_scratch_component_content.rst b/docs/source-lit/workflows/build_lightning_component/from_scratch_component_content.rst new file mode 120000 index 0000000000000..7074239386319 --- /dev/null +++ b/docs/source-lit/workflows/build_lightning_component/from_scratch_component_content.rst @@ -0,0 +1 @@ +../../../source-app/workflows/build_lightning_component/from_scratch_component_content.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/build_lightning_component/index.rst b/docs/source-lit/workflows/build_lightning_component/index.rst new file mode 120000 index 0000000000000..f2576ad563d4d --- /dev/null +++ b/docs/source-lit/workflows/build_lightning_component/index.rst @@ -0,0 +1 @@ 
+../../../source-app/workflows/build_lightning_component/index.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/build_lightning_component/index_content.rst b/docs/source-lit/workflows/build_lightning_component/index_content.rst new file mode 120000 index 0000000000000..34bc275c55ce9 --- /dev/null +++ b/docs/source-lit/workflows/build_lightning_component/index_content.rst @@ -0,0 +1 @@ +../../../source-app/workflows/build_lightning_component/index_content.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/build_lightning_component/intermediate.rst b/docs/source-lit/workflows/build_lightning_component/intermediate.rst new file mode 120000 index 0000000000000..f72db7929b4a4 --- /dev/null +++ b/docs/source-lit/workflows/build_lightning_component/intermediate.rst @@ -0,0 +1 @@ +../../../source-app/workflows/build_lightning_component/intermediate.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/build_lightning_component/publish_a_component.rst b/docs/source-lit/workflows/build_lightning_component/publish_a_component.rst new file mode 120000 index 0000000000000..793b22f08daa9 --- /dev/null +++ b/docs/source-lit/workflows/build_lightning_component/publish_a_component.rst @@ -0,0 +1 @@ +../../../source-app/workflows/build_lightning_component/publish_a_component.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/byoc/index.rst b/docs/source-lit/workflows/byoc/index.rst new file mode 120000 index 0000000000000..c97ffd8b754ca --- /dev/null +++ b/docs/source-lit/workflows/byoc/index.rst @@ -0,0 +1 @@ +../../../source-app/workflows/byoc/index.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/debug_locally.rst b/docs/source-lit/workflows/debug_locally.rst new file mode 120000 index 0000000000000..33ef4444ead73 --- /dev/null +++ b/docs/source-lit/workflows/debug_locally.rst @@ -0,0 +1 @@ +../../source-app/workflows/debug_locally.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/enable_fault_tolerance.rst b/docs/source-lit/workflows/enable_fault_tolerance.rst new file mode 120000 index 0000000000000..a401217cc5c90 --- /dev/null +++ b/docs/source-lit/workflows/enable_fault_tolerance.rst @@ -0,0 +1 @@ +../../source-app/workflows/enable_fault_tolerance.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/extend_app.rst b/docs/source-lit/workflows/extend_app.rst new file mode 120000 index 0000000000000..bad57f7530a3b --- /dev/null +++ b/docs/source-lit/workflows/extend_app.rst @@ -0,0 +1 @@ +../../source-app/workflows/extend_app.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/index.rst b/docs/source-lit/workflows/index.rst new file mode 120000 index 0000000000000..04d00f63c020c --- /dev/null +++ b/docs/source-lit/workflows/index.rst @@ -0,0 +1 @@ +../../source-app/workflows/index.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/run_app_on_cloud/cloud_files.rst b/docs/source-lit/workflows/run_app_on_cloud/cloud_files.rst new file mode 120000 index 0000000000000..ae069a99cdc86 --- /dev/null +++ b/docs/source-lit/workflows/run_app_on_cloud/cloud_files.rst @@ -0,0 +1 @@ +../../../source-app/workflows/run_app_on_cloud/cloud_files.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/run_app_on_cloud/index.rst b/docs/source-lit/workflows/run_app_on_cloud/index.rst new file mode 120000 index 0000000000000..fb47d8aad42ee --- /dev/null +++ b/docs/source-lit/workflows/run_app_on_cloud/index.rst @@ -0,0 +1 @@ 
+../../../source-app/workflows/run_app_on_cloud/index.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/run_app_on_cloud/index_content.rst b/docs/source-lit/workflows/run_app_on_cloud/index_content.rst new file mode 120000 index 0000000000000..9cc40dd675018 --- /dev/null +++ b/docs/source-lit/workflows/run_app_on_cloud/index_content.rst @@ -0,0 +1 @@ +../../../source-app/workflows/run_app_on_cloud/index_content.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/run_app_on_cloud/lightning_cloud.rst b/docs/source-lit/workflows/run_app_on_cloud/lightning_cloud.rst new file mode 120000 index 0000000000000..f0c25e59df0fd --- /dev/null +++ b/docs/source-lit/workflows/run_app_on_cloud/lightning_cloud.rst @@ -0,0 +1 @@ +../../../source-app/workflows/run_app_on_cloud/lightning_cloud.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/run_app_on_cloud/on_prem.rst b/docs/source-lit/workflows/run_app_on_cloud/on_prem.rst new file mode 120000 index 0000000000000..158efda8a966d --- /dev/null +++ b/docs/source-lit/workflows/run_app_on_cloud/on_prem.rst @@ -0,0 +1 @@ +../../../source-app/workflows/run_app_on_cloud/on_prem.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/run_app_on_cloud/on_your_own_machine.rst b/docs/source-lit/workflows/run_app_on_cloud/on_your_own_machine.rst new file mode 120000 index 0000000000000..577f772047c56 --- /dev/null +++ b/docs/source-lit/workflows/run_app_on_cloud/on_your_own_machine.rst @@ -0,0 +1 @@ +../../../source-app/workflows/run_app_on_cloud/on_your_own_machine.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/run_app_snippet.rst b/docs/source-lit/workflows/run_app_snippet.rst new file mode 120000 index 0000000000000..aaed508223495 --- /dev/null +++ b/docs/source-lit/workflows/run_app_snippet.rst @@ -0,0 +1 @@ +../../source-app/workflows/run_app_snippet.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/run_components_on_different_hardware.rst b/docs/source-lit/workflows/run_components_on_different_hardware.rst new file mode 120000 index 0000000000000..179127ae3b36f --- /dev/null +++ b/docs/source-lit/workflows/run_components_on_different_hardware.rst @@ -0,0 +1 @@ +../../source-app/workflows/run_components_on_different_hardware.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/run_on_private_cloud.rst b/docs/source-lit/workflows/run_on_private_cloud.rst new file mode 120000 index 0000000000000..165997a93017d --- /dev/null +++ b/docs/source-lit/workflows/run_on_private_cloud.rst @@ -0,0 +1 @@ +../../source-app/workflows/run_on_private_cloud.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/run_work_in_parallel.rst b/docs/source-lit/workflows/run_work_in_parallel.rst new file mode 120000 index 0000000000000..b6aa359918198 --- /dev/null +++ b/docs/source-lit/workflows/run_work_in_parallel.rst @@ -0,0 +1 @@ +../../source-app/workflows/run_work_in_parallel.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/run_work_in_parallel_content.rst b/docs/source-lit/workflows/run_work_in_parallel_content.rst new file mode 120000 index 0000000000000..19fdfd5fafe64 --- /dev/null +++ b/docs/source-lit/workflows/run_work_in_parallel_content.rst @@ -0,0 +1 @@ +../../source-app/workflows/run_work_in_parallel_content.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/run_work_once.rst b/docs/source-lit/workflows/run_work_once.rst new file mode 120000 index 0000000000000..172affc671be5 
--- /dev/null +++ b/docs/source-lit/workflows/run_work_once.rst @@ -0,0 +1 @@ +../../source-app/workflows/run_work_once.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/run_work_once_content.rst b/docs/source-lit/workflows/run_work_once_content.rst new file mode 120000 index 0000000000000..c0dae9c7e0cd8 --- /dev/null +++ b/docs/source-lit/workflows/run_work_once_content.rst @@ -0,0 +1 @@ +../../source-app/workflows/run_work_once_content.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/schedule_apps.rst b/docs/source-lit/workflows/schedule_apps.rst new file mode 120000 index 0000000000000..501cf0f686df4 --- /dev/null +++ b/docs/source-lit/workflows/schedule_apps.rst @@ -0,0 +1 @@ +../../source-app/workflows/schedule_apps.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/share_app.rst b/docs/source-lit/workflows/share_app.rst new file mode 120000 index 0000000000000..6ba2f6500b48c --- /dev/null +++ b/docs/source-lit/workflows/share_app.rst @@ -0,0 +1 @@ +../../source-app/workflows/share_app.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/share_files_between_components.rst b/docs/source-lit/workflows/share_files_between_components.rst new file mode 120000 index 0000000000000..077f33c5da813 --- /dev/null +++ b/docs/source-lit/workflows/share_files_between_components.rst @@ -0,0 +1 @@ +../../source-app/workflows/share_files_between_components.rst \ No newline at end of file diff --git a/docs/source-lit/workflows/share_files_between_components/app.py b/docs/source-lit/workflows/share_files_between_components/app.py new file mode 120000 index 0000000000000..f40d6fb884885 --- /dev/null +++ b/docs/source-lit/workflows/share_files_between_components/app.py @@ -0,0 +1 @@ +../../../source-app/workflows/share_files_between_components/app.py \ No newline at end of file diff --git a/docs/source-lit/workflows/test_an_app.rst b/docs/source-lit/workflows/test_an_app.rst new file mode 120000 index 0000000000000..183bf951ae941 --- /dev/null +++ b/docs/source-lit/workflows/test_an_app.rst @@ -0,0 +1 @@ +../../source-app/workflows/test_an_app.rst \ No newline at end of file diff --git a/requirements/lit/base.txt b/requirements/lit/base.txt new file mode 100644 index 0000000000000..51c5f41437d7b --- /dev/null +++ b/requirements/lit/base.txt @@ -0,0 +1 @@ +# this is just compatibility placeholder for the future fused docs diff --git a/requirements/lit/devel.txt b/requirements/lit/devel.txt new file mode 100644 index 0000000000000..19ff73575415e --- /dev/null +++ b/requirements/lit/devel.txt @@ -0,0 +1 @@ +-r ../pytorch/examples.txt diff --git a/requirements/lit/docs.txt b/requirements/lit/docs.txt new file mode 100644 index 0000000000000..54dd258e9620e --- /dev/null +++ b/requirements/lit/docs.txt @@ -0,0 +1,9 @@ +-r ../docs.txt + +ipython[notebook] +ipython_genutils + +sphinx-autobuild +pt-lightning-sphinx-theme @ https://github.com/Lightning-AI/lightning_sphinx_theme/archive/master.zip + +-r ../../_notebooks/.actions/requirements.txt diff --git a/setup.py b/setup.py index 7d4084960d450..82a4a969ec80b 100755 --- a/setup.py +++ b/setup.py @@ -61,7 +61,9 @@ _PATH_ROOT = os.path.dirname(__file__) _PATH_SRC = os.path.join(_PATH_ROOT, "src") _PATH_REQUIRE = os.path.join(_PATH_ROOT, "requirements") -_PATH_SETUP = os.path.join(_PATH_SRC, _REAL_PKG_NAME or "lightning", "__setup__.py") +_PATH_SETUP = os.path.join(_PATH_SRC, _REAL_PKG_NAME, "__setup__.py") +if not os.path.isfile(_PATH_SETUP): + _PATH_SETUP = os.path.join(_PATH_SRC, 
"lightning", "__setup__.py") _FREEZE_REQUIREMENTS = bool(int(os.environ.get("FREEZE_REQUIREMENTS", 0))) From 255b46941e2a347cd79f60e12299145095cd33d8 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Tue, 30 Aug 2022 12:39:15 -0400 Subject: [PATCH 016/193] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 534a598b59446..0d109db3e45e2 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@
-**Build PyTorch models and lightning apps that glue together everything around the models, without the pain of infrastructure, cost management, scaling and everything else.** +**Build and train PyTorch models and connect them to the ML lifecycle using Lightning App templates, without handling DIY infrastructure, cost management, scaling, and other headaches.** ______________________________________________________________________ From 36aefdab90edd49b052354eb75552e09a3d58983 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 30 Aug 2022 19:03:32 +0200 Subject: [PATCH 017/193] CI: set probot timeout (#14455) --- .github/workflows/probot-auto-cc.yml | 1 + .github/workflows/probot-check-group.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/probot-auto-cc.yml b/.github/workflows/probot-auto-cc.yml index 5c6de911cd00e..5bebcf9667b75 100644 --- a/.github/workflows/probot-auto-cc.yml +++ b/.github/workflows/probot-auto-cc.yml @@ -10,6 +10,7 @@ jobs: auto-cc: runs-on: ubuntu-latest if: github.event_name == 'issue' || github.event.pull_request.draft == false + timeout-minutes: 5 steps: - uses: carmocca/probot@v1 env: diff --git a/.github/workflows/probot-check-group.yml b/.github/workflows/probot-check-group.yml index 1b37e19c819b6..50413dd656c92 100644 --- a/.github/workflows/probot-check-group.yml +++ b/.github/workflows/probot-check-group.yml @@ -9,6 +9,7 @@ jobs: required-jobs: runs-on: ubuntu-latest if: github.event_name != 'issue_comment' || contains(github.event.comment.body, '@probot pls') + timeout-minutes: 5 steps: - uses: carmocca/probot@v2 env: From 6188593abd7127fac85123c58a2f2414530d7c07 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 30 Aug 2022 19:38:51 +0200 Subject: [PATCH 018/193] ci: drop group probot (#14456) --- .github/workflows/probot-check-group.yml | 16 ---------------- 1 file changed, 16 deletions(-) delete mode 100644 .github/workflows/probot-check-group.yml diff --git a/.github/workflows/probot-check-group.yml b/.github/workflows/probot-check-group.yml deleted file mode 100644 index 50413dd656c92..0000000000000 --- a/.github/workflows/probot-check-group.yml +++ /dev/null @@ -1,16 +0,0 @@ -name: Probot - -on: - check_run: {} - pull_request: {} - issue_comment: {types: [created]} - -jobs: - required-jobs: - runs-on: ubuntu-latest - if: github.event_name != 'issue_comment' || contains(github.event.comment.body, '@probot pls') - timeout-minutes: 5 - steps: - - uses: carmocca/probot@v2 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 74304db6f83d89e33a82360c169813a8c2a171cc Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 30 Aug 2022 21:17:38 +0200 Subject: [PATCH 019/193] CI: update TPU docker (#14448) --- dockers/tpu-tests/Dockerfile | 6 +++--- dockers/tpu-tests/tpu_test_cases.jsonnet | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dockers/tpu-tests/Dockerfile b/dockers/tpu-tests/Dockerfile index 530cc9ec6fa2c..e23db55bb28e9 100644 --- a/dockers/tpu-tests/Dockerfile +++ b/dockers/tpu-tests/Dockerfile @@ -19,16 +19,16 @@ FROM pytorchlightning/pytorch_lightning:base-xla-py${PYTHON_VERSION}-torch${PYTO LABEL maintainer="Lightning-AI " -COPY ./ ./pytorch-lightning/ +COPY ./ ./lightning/ # Pull the legacy checkpoints -RUN cd pytorch-lightning && \ +RUN cd lightning && \ bash .actions/pull_legacy_checkpoints.sh RUN \ pip install -q fire && \ # drop unnecessary packages - pip install -r pytorch-lightning/requirements/pytorch/devel.txt --no-cache-dir + pip install -r lightning/requirements/pytorch/devel.txt 
--no-cache-dir COPY ./dockers/tpu-tests/docker-entrypoint.sh /usr/local/bin/ RUN chmod +x /usr/local/bin/docker-entrypoint.sh diff --git a/dockers/tpu-tests/tpu_test_cases.jsonnet b/dockers/tpu-tests/tpu_test_cases.jsonnet index 98056674cdc0b..a73b25277f3e2 100644 --- a/dockers/tpu-tests/tpu_test_cases.jsonnet +++ b/dockers/tpu-tests/tpu_test_cases.jsonnet @@ -22,7 +22,7 @@ local tputests = base.BaseTest { ||| source ~/.bashrc conda activate lightning - mkdir -p /home/runner/work/pytorch-lightning && cd /home/runner/work/pytorch-lightning + mkdir -p /home/runner/work/lightning && cd /home/runner/work/lightning git clone https://github.com/Lightning-AI/lightning.git cd lightning echo $PWD From e8121610a4917b2d8ec57800c2c67ca1400856ca Mon Sep 17 00:00:00 2001 From: Adam Bobowski <100693297+adam-lightning@users.noreply.github.com> Date: Tue, 30 Aug 2022 22:13:43 +0200 Subject: [PATCH 020/193] [App] fix panel requirements (#14450) * update base requirements * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * try main * Apply suggestions from code review * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * extract into separate function * drop * up * up * optional * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * . Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Jirka Co-authored-by: Jirka Borovec --- src/lightning_app/frontend/panel/app_state_comm.py | 6 +++--- src/lightning_app/frontend/panel/panel_serve_render_fn.py | 6 +++++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/lightning_app/frontend/panel/app_state_comm.py b/src/lightning_app/frontend/panel/app_state_comm.py index f7d9c01e7dc2d..d9092628dd67f 100644 --- a/src/lightning_app/frontend/panel/app_state_comm.py +++ b/src/lightning_app/frontend/panel/app_state_comm.py @@ -6,7 +6,7 @@ import asyncio import logging import os -import threading +from threading import Thread from typing import Callable import websockets @@ -16,7 +16,7 @@ _logger = logging.getLogger(__name__) _CALLBACKS = [] -_THREAD: None | threading.Thread = None +_THREAD: Thread = None def _get_ws_port(): @@ -62,7 +62,7 @@ def _start_websocket(): global _THREAD # pylint: disable=global-statement if not _THREAD: _logger.debug("Starting the watch_app_state thread.") - _THREAD = threading.Thread(target=_target_fn) + _THREAD = Thread(target=_target_fn) _THREAD.setDaemon(True) _THREAD.start() _logger.debug("thread started") diff --git a/src/lightning_app/frontend/panel/panel_serve_render_fn.py b/src/lightning_app/frontend/panel/panel_serve_render_fn.py index 7aff3d5c3e601..88c8d6cb980d5 100644 --- a/src/lightning_app/frontend/panel/panel_serve_render_fn.py +++ b/src/lightning_app/frontend/panel/panel_serve_render_fn.py @@ -42,7 +42,7 @@ def _render_fn_wrapper(): return render_fn -if __name__.startswith("bokeh"): +def main(): import panel as pn # I use caching for efficiency reasons. 
It shaves off 10ms from having @@ -50,3 +50,7 @@ def _render_fn_wrapper(): if "lightning_render_fn" not in pn.state.cache: pn.state.cache["lightning_render_fn"] = _get_render_fn() pn.state.cache["lightning_render_fn"]() + + +if __name__.startswith("bokeh"): + main() From 74538f797cc558f71e69fffa38905935cb8ea409 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 31 Aug 2022 01:20:06 +0200 Subject: [PATCH 021/193] CI: skip examples with draft (#14453) --- .github/workflows/ci-app-examples.yml | 1 + .github/workflows/docs-checks.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/ci-app-examples.yml b/.github/workflows/ci-app-examples.yml index ecd2a746412f1..22e8fc4454a37 100644 --- a/.github/workflows/ci-app-examples.yml +++ b/.github/workflows/ci-app-examples.yml @@ -20,6 +20,7 @@ concurrency: jobs: app-examples: + if: github.event.pull_request.draft == false runs-on: ${{ matrix.os }} strategy: fail-fast: false diff --git a/.github/workflows/docs-checks.yml b/.github/workflows/docs-checks.yml index 0e616b17d3598..1de92849ed0c0 100644 --- a/.github/workflows/docs-checks.yml +++ b/.github/workflows/docs-checks.yml @@ -14,6 +14,7 @@ concurrency: jobs: make-doctest: runs-on: ubuntu-20.04 + needs: make-html # make it depending on build docs to reduce load strategy: fail-fast: false matrix: From 937298126365f5fdf26e648678b5c7bfb106d373 Mon Sep 17 00:00:00 2001 From: Dmitry Frolov Date: Tue, 30 Aug 2022 22:22:01 -0400 Subject: [PATCH 022/193] [CLI] Cluster logs CLI improvements: new log labels + test coverage increasing (#14459) * Cluster logs improvements * Unit tests added * Labels for processing deletion errors --- src/lightning_app/utilities/cluster_logs.py | 4 ++ .../cli/test_cmd_show_cluster_logs.py | 40 +++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 tests/tests_app/cli/test_cmd_show_cluster_logs.py diff --git a/src/lightning_app/utilities/cluster_logs.py b/src/lightning_app/utilities/cluster_logs.py index 4a9bf1ba9148b..76eb45df8ab43 100644 --- a/src/lightning_app/utilities/cluster_logs.py +++ b/src/lightning_app/utilities/cluster_logs.py @@ -26,6 +26,10 @@ class _ClusterLogEventLabels: issuer: Optional[str] = None error: Optional[str] = None errorVerbose: Optional[str] = None + dir: Optional[str] = None + bucket: Optional[str] = None + prefix: Optional[str] = None + loki_s3: Optional[str] = None @dataclass diff --git a/tests/tests_app/cli/test_cmd_show_cluster_logs.py b/tests/tests_app/cli/test_cmd_show_cluster_logs.py new file mode 100644 index 0000000000000..a8e3c6d18e6aa --- /dev/null +++ b/tests/tests_app/cli/test_cmd_show_cluster_logs.py @@ -0,0 +1,40 @@ +from unittest import mock +from unittest.mock import MagicMock + +from click.testing import CliRunner +from lightning_cloud.openapi import Externalv1Cluster + +from lightning_app.cli.cmd_clusters import ClusterList +from lightning_app.cli.lightning_cli import cluster_logs + + +@mock.patch("lightning_app.cli.lightning_cli.LightningClient", MagicMock()) +@mock.patch("lightning_app.cli.cmd_clusters.LightningClient", MagicMock()) +@mock.patch("lightning_app.cli.lightning_cli.AWSClusterManager.get_clusters") +def test_show_logs_errors(get_clusters): + """Test that the CLI prints the errors for the show logs command.""" + + runner = CliRunner() + + # Run without arguments + get_clusters.return_value = ClusterList([]) + result = runner.invoke(cluster_logs, []) + + assert result.exit_code == 2 + assert "Usage: logs" in result.output + + # No clusters + get_clusters.return_value = 
ClusterList([]) + result = runner.invoke(cluster_logs, ["NonExistentCluster"]) + + assert result.exit_code == 1 + assert "Error: You don't have any clusters" in result.output + + # One cluster + clusters = ClusterList([Externalv1Cluster(name="MyFakeCluster", id="MyFakeCluster")]) + get_clusters.return_value = clusters + + result = runner.invoke(cluster_logs, ["MyFakeClusterTwo"]) + + assert result.exit_code == 1 + assert "Please select one of the following: [MyFakeCluster]" in str(result.output) From 00aefa82b785bd6e6a17674146aa84564b55a084 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 31 Aug 2022 13:38:54 +0200 Subject: [PATCH 023/193] Cleanup TPU CI script error management (#14389) --- dockers/tpu-tests/tpu_test_cases.jsonnet | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/dockers/tpu-tests/tpu_test_cases.jsonnet b/dockers/tpu-tests/tpu_test_cases.jsonnet index a73b25277f3e2..f2c106f220b60 100644 --- a/dockers/tpu-tests/tpu_test_cases.jsonnet +++ b/dockers/tpu-tests/tpu_test_cases.jsonnet @@ -21,6 +21,7 @@ local tputests = base.BaseTest { command: utils.scriptCommand( ||| source ~/.bashrc + set -e conda activate lightning mkdir -p /home/runner/work/lightning && cd /home/runner/work/lightning git clone https://github.com/Lightning-AI/lightning.git @@ -31,21 +32,18 @@ local tputests = base.BaseTest { git checkout {SHA} export PACKAGE_NAME=pytorch export FREEZE_REQUIREMENTS=1 - export PL_STANDALONE_TESTS_BATCH_SIZE=1 pip install -e .[test] echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}" export PL_RUN_TPU_TESTS=1 cd tests/tests_pytorch - set -e coverage run --source=pytorch_lightning -m pytest -vv --durations=0 ./ echo "\n||| Running standalone tests |||\n" + export PL_STANDALONE_TESTS_BATCH_SIZE=1 bash run_standalone_tests.sh - test_exit_code=$? 
echo "\n||| END PYTEST LOGS |||\n" coverage xml cat coverage.xml | tr -d '\t' - test $test_exit_code -eq 0 ||| ), }; From ebc650bd95a4d09a8d77dc7e0379a6b9a909bb69 Mon Sep 17 00:00:00 2001 From: Sherin Thomas Date: Wed, 31 Aug 2022 18:57:21 +0530 Subject: [PATCH 024/193] E2E fix for custom base image (#14468) * new custom base image * image tag --- tests/tests_app_examples/custom_work_dependencies/app.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/tests_app_examples/custom_work_dependencies/app.py b/tests/tests_app_examples/custom_work_dependencies/app.py index fb85c911906bb..a7dbb7ff70062 100644 --- a/tests/tests_app_examples/custom_work_dependencies/app.py +++ b/tests/tests_app_examples/custom_work_dependencies/app.py @@ -28,13 +28,13 @@ class WorkWithCustomBaseImage(LightningWork): def __init__(self, cloud_compute: CloudCompute = CloudCompute(), **kwargs): # this image has been created from ghcr.io/gridai/base-images:v1.8-cpu # by just adding an empty file at /content/.e2e_test - custom_image = "ghcr.io/gridai/image-for-testing-custom-images-in-e2e" + custom_image = "ghcr.io/gridai/image-for-testing-custom-images-in-e2e:v0.0.1" build_config = BuildConfig(image=custom_image) super().__init__(parallel=True, **kwargs, cloud_compute=cloud_compute, cloud_build_config=build_config) def run(self): # checking the existence of the file - this file had been added to the custom base image - assert ".e2e_test" in os.listdir("/content/"), "file not found" + assert ".e2e_test" in os.listdir("/testdir/"), "file not found" class CustomWorkBuildConfigChecker(LightningFlow): From a1dd718781581a2698faf26922eae286def206a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 31 Aug 2022 17:34:56 +0200 Subject: [PATCH 025/193] Remove deprecated support for passing the warning category positionally (#14470) --- src/pytorch_lightning/CHANGELOG.md | 3 +++ src/pytorch_lightning/utilities/rank_zero.py | 7 ------- tests/tests_pytorch/deprecated_api/test_remove_1-8.py | 7 +------ 3 files changed, 4 insertions(+), 13 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index bad937458c36f..feaf4b3610f0a 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -86,6 +86,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Removed the deprecated `DistributedType` and `DeviceType` enum classes ([#14045](https://github.com/Lightning-AI/lightning/pull/14045)) +- Removed deprecated support for passing the `rank_zero_warn` warning category positionally ([#14470](https://github.com/Lightning-AI/lightning/pull/14470)) + + - Removed the legacy and unused `Trainer.get_deprecated_arg_names()` ([#14415](https://github.com/Lightning-AI/lightning/pull/14415)) diff --git a/src/pytorch_lightning/utilities/rank_zero.py b/src/pytorch_lightning/utilities/rank_zero.py index 55bdc08930905..21f8ca7207abd 100644 --- a/src/pytorch_lightning/utilities/rank_zero.py +++ b/src/pytorch_lightning/utilities/rank_zero.py @@ -80,13 +80,6 @@ def rank_zero_info(*args: Any, stacklevel: int = 4, **kwargs: Any) -> None: def _warn(message: Union[str, Warning], stacklevel: int = 2, **kwargs: Any) -> None: - if type(stacklevel) is type and issubclass(stacklevel, Warning): - rank_zero_deprecation( - "Support for passing the warning category positionally is deprecated in v1.6 and will be removed in v1.8" - f" Please, use `category={stacklevel.__name__}`." 
- ) - kwargs["category"] = stacklevel - stacklevel = kwargs.pop("stacklevel", 2) warnings.warn(message, stacklevel=stacklevel, **kwargs) diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py index 38178c5c8e9e9..3e3fbccc61aa1 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py @@ -34,7 +34,7 @@ from pytorch_lightning.trainer.configuration_validator import _check_datamodule_checkpoint_hooks from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities import device_parser -from pytorch_lightning.utilities.rank_zero import rank_zero_only, rank_zero_warn +from pytorch_lightning.utilities.rank_zero import rank_zero_only from tests_pytorch.helpers.runif import RunIf @@ -78,11 +78,6 @@ def test_v1_8_0_deprecated_call_hook(): trainer.call_hook("test_hook") -def test_v1_8_0_deprecated_warning_positional_category(): - with pytest.deprecated_call(match=r"use `category=FutureWarning."): - rank_zero_warn("foo", FutureWarning) - - def test_v1_8_0_deprecated_run_stage(): trainer = Trainer() trainer._run_stage = Mock() From f2cacf4b1027a2ce3977e837a7cb1f7f5162698c Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 1 Sep 2022 00:37:49 +0900 Subject: [PATCH 026/193] CI: Reuse check schema (#14469) * rm _check-shema.yml * Reuse devtools' check schema --- .github/workflows/_check-schema.yml | 37 ----------------------------- .github/workflows/ci-schema.yml | 5 ++-- 2 files changed, 3 insertions(+), 39 deletions(-) delete mode 100644 .github/workflows/_check-schema.yml diff --git a/.github/workflows/_check-schema.yml b/.github/workflows/_check-schema.yml deleted file mode 100644 index 299af83503831..0000000000000 --- a/.github/workflows/_check-schema.yml +++ /dev/null @@ -1,37 +0,0 @@ -name: Reusable Check Schema - -on: - workflow_call: - inputs: - azure-dir: - description: 'Directory containing Azure Pipelines config files. Provide an empty string to skip checking on Azure Pipelines files.' - default: './.azure/' - required: false - type: string - -jobs: - schema: - runs-on: ubuntu-20.04 - steps: - - name: Checkout - uses: actions/checkout@v3 - - - name: Install dependencies - run: pip install check-jsonschema - - - name: GitHub Actions - workflow - run: check-jsonschema $(find .github/workflows -name '*.yml' -a ! 
-name '_*.yml') --builtin-schema "github-workflows" - - - name: GitHub Actions - action - run: | - if [ -d ".github/actions" ]; then - check-jsonschema .github/actions/*/*.yml --builtin-schema "github-actions" - fi - - - name: Azure Pipelines - env: - SCHEMA_FILE: https://raw.githubusercontent.com/microsoft/azure-pipelines-vscode/v1.204.0/service-schema.json - run: | - if [ -d ${{ inputs.azure-dir }} ]; then - check-jsonschema ${{ inputs.azure-dir }}/*.yml --schemafile "$SCHEMA_FILE" - fi diff --git a/.github/workflows/ci-schema.yml b/.github/workflows/ci-schema.yml index 156334ae96043..364266d340520 100644 --- a/.github/workflows/ci-schema.yml +++ b/.github/workflows/ci-schema.yml @@ -1,10 +1,11 @@ name: Check Schema on: - push: {} + push: + branches: [master, "release/*"] pull_request: branches: [master, "release/*"] jobs: check: - uses: ./.github/workflows/_check-schema.yml + uses: Lightning-AI/devtools/.github/workflows/check-schema.yml@v0.1.0 From 626827c872fdfb743df6ff622988d59987483519 Mon Sep 17 00:00:00 2001 From: Anner Date: Wed, 31 Aug 2022 17:36:35 +0100 Subject: [PATCH 027/193] update rng state save/load test to also run on cuda gpu (#14396) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- tests/tests_pytorch/utilities/test_seed.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/tests_pytorch/utilities/test_seed.py b/tests/tests_pytorch/utilities/test_seed.py index 2c89883e3c7a1..502febcaa9223 100644 --- a/tests/tests_pytorch/utilities/test_seed.py +++ b/tests/tests_pytorch/utilities/test_seed.py @@ -8,6 +8,7 @@ import pytorch_lightning.utilities.seed as seed_utils from pytorch_lightning.utilities.seed import _collect_rng_states, _set_rng_states, isolate_rng +from tests_pytorch.helpers.runif import RunIf @mock.patch.dict(os.environ, {}, clear=True) @@ -77,7 +78,8 @@ def test_reset_seed_everything(workers): assert torch.allclose(before, after) -def test_isolate_rng(): +@pytest.mark.parametrize("with_torch_cuda", [False, pytest.param(True, marks=RunIf(min_cuda_gpus=1))]) +def test_isolate_rng(with_torch_cuda): """Test that the isolate_rng context manager isolates the random state from the outer scope.""" # torch torch.rand(1) @@ -86,7 +88,7 @@ def test_isolate_rng(): assert torch.equal(torch.rand(2), generated[0]) # torch.cuda - if torch.cuda.is_available(): + if with_torch_cuda: torch.cuda.FloatTensor(1).normal_() with isolate_rng(): generated = [torch.cuda.FloatTensor(2).normal_() for _ in range(3)] @@ -106,10 +108,7 @@ def test_isolate_rng(): def test_backward_compatibility_rng_states_dict(): - """Test that an older rng_states_dict without the "torch.cuda" key does not crash. - - This test is only relevant when torch.cuda is available. 
- """ + """Test that an older rng_states_dict without the "torch.cuda" key does not crash.""" states = _collect_rng_states() assert "torch.cuda" in states states.pop("torch.cuda") From 2e3d85af844e17049a7666f094847f486c51e087 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 31 Aug 2022 20:29:11 +0200 Subject: [PATCH 028/193] Remove deprecated rank zero utilities (#14471) Co-authored-by: Jirka Borovec --- src/pytorch_lightning/CHANGELOG.md | 11 ++++- src/pytorch_lightning/loggers/base.py | 2 +- src/pytorch_lightning/strategies/parallel.py | 2 +- .../utilities/distributed.py | 30 ++---------- src/pytorch_lightning/utilities/warnings.py | 48 +++++-------------- .../deprecated_api/test_remove_1-8.py | 34 ------------- 6 files changed, 28 insertions(+), 99 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index feaf4b3610f0a..ad6d7fdea6c28 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -110,6 +110,15 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Removed deprecated support for old torchtext versions ([#14375](https://github.com/Lightning-AI/lightning/pull/14375)) +- Remove the deprecated ([#14471](https://github.com/Lightning-AI/lightning/pull/14471)) + * `pytorch_lightning.utilities.distributed.rank_zero_only` in favor of `pytorch_lightning.utilities.rank_zero.rank_zero_only` + * `pytorch_lightning.utilities.distributed.rank_zero_debug` in favor of `pytorch_lightning.utilities.rank_zero.rank_zero_debug` + * `pytorch_lightning.utilities.distributed.rank_zero_info` in favor of `pytorch_lightning.utilities.rank_zero.rank_zero_info` + * `pytorch_lightning.utilities.warnings.rank_zero_warn` in favor of `pytorch_lightning.utilities.rank_zero.rank_zero_warn` + * `pytorch_lightning.utilities.warnings.rank_zero_deprecation` in favor of `pytorch_lightning.utilities.rank_zero.rank_zero_deprecation` + * `pytorch_lightning.utilities.warnings.LightningDeprecationWarning` in favor of `pytorch_lightning.utilities.rank_zero.LightningDeprecationWarning` + + - Removed deprecated `Trainer.num_processes` attribute in favour of `Trainer.num_devices` ([#14423](https://github.com/Lightning-AI/lightning/pull/14423)) @@ -634,7 +643,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Deprecated `pytorch_lightning.utilities.distributed.rank_zero_info` in favor of `pytorch_lightning.utilities.rank_zero.rank_zero_info` ([#11747](https://github.com/Lightning-AI/lightning/pull/11747)) - Deprecated `pytorch_lightning.utilities.warnings.rank_zero_warn` in favor of `pytorch_lightning.utilities.rank_zero.rank_zero_warn` ([#11747](https://github.com/Lightning-AI/lightning/pull/11747)) - Deprecated `pytorch_lightning.utilities.warnings.rank_zero_deprecation` in favor of `pytorch_lightning.utilities.rank_zero.rank_zero_deprecation` ([#11747](https://github.com/Lightning-AI/lightning/pull/11747)) -- Deprecated `pytorch_lightning.utilities.warnings.LightningDeprecationWarning` in favor of `pytorch_lightning.utilities.rank_zero.LightningDeprecationWarning` +- Deprecated `pytorch_lightning.utilities.warnings.LightningDeprecationWarning` in favor of `pytorch_lightning.utilities.rank_zero.LightningDeprecationWarning` ([#11747](https://github.com/Lightning-AI/lightning/pull/11747)) - Deprecated `on_pretrain_routine_start` and `on_pretrain_routine_end` callback hooks in favor of `on_fit_start` ([#11794](https://github.com/Lightning-AI/lightning/pull/11794)) - Deprecated `LightningModule.on_pretrain_routine_start` and `LightningModule.on_pretrain_routine_end` hooks in favor of `on_fit_start` ([#12122](https://github.com/Lightning-AI/lightning/pull/12122)) - Deprecated `agg_key_funcs` and `agg_default_func` parameters from `LightningLoggerBase` ([#11871](https://github.com/Lightning-AI/lightning/pull/11871)) diff --git a/src/pytorch_lightning/loggers/base.py b/src/pytorch_lightning/loggers/base.py index 43c572e3953c0..628a56609b34d 100644 --- a/src/pytorch_lightning/loggers/base.py +++ b/src/pytorch_lightning/loggers/base.py @@ -17,7 +17,7 @@ import numpy as np import pytorch_lightning.loggers.logger as logger -from pytorch_lightning.utilities.warnings import rank_zero_deprecation +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation def rank_zero_experiment(fn: Callable) -> Callable: diff --git a/src/pytorch_lightning/strategies/parallel.py b/src/pytorch_lightning/strategies/parallel.py index 9d469313103a1..0790b5e75e077 100644 --- a/src/pytorch_lightning/strategies/parallel.py +++ b/src/pytorch_lightning/strategies/parallel.py @@ -30,7 +30,7 @@ get_default_process_group_backend_for_device, ReduceOp, ) -from pytorch_lightning.utilities.warnings import rank_zero_deprecation +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation class ParallelStrategy(Strategy, ABC): diff --git a/src/pytorch_lightning/utilities/distributed.py b/src/pytorch_lightning/utilities/distributed.py index 361c6dd12beeb..7b33cb38b6709 100644 --- a/src/pytorch_lightning/utilities/distributed.py +++ b/src/pytorch_lightning/utilities/distributed.py @@ -23,10 +23,8 @@ import pytorch_lightning as pl from pytorch_lightning.utilities.imports import _HPU_AVAILABLE, _TPU_AVAILABLE -from pytorch_lightning.utilities.rank_zero import rank_zero_debug as new_rank_zero_debug from pytorch_lightning.utilities.rank_zero import rank_zero_only # noqa: F401 -from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation -from pytorch_lightning.utilities.rank_zero import rank_zero_info as new_rank_zero_info +from pytorch_lightning.utilities.rank_zero import rank_zero_debug, rank_zero_deprecation, rank_zero_info if _TPU_AVAILABLE: import torch_xla.core.xla_model as xm @@ -160,7 +158,7 @@ def sync_ddp(result: Tensor, group: Optional[Any] = None, reduce_op: Optional[Un is_hpu_backend = 
os.environ.get("HCCL_DISTRIBUTED_BACKEND") == "1" if is_hpu_backend: if (result.type() == "torch.LongTensor") or (result.type() == "torch.hpu.LongTensor"): - new_rank_zero_info("Long tensor unsupported on HPU, casting to float") + rank_zero_info("Long tensor unsupported on HPU, casting to float") result = result.float() # sync all processes before reduction @@ -312,12 +310,12 @@ def register_ddp_comm_hook( ddp_comm_hook: Callable = ddp_comm_hook if ddp_comm_wrapper is not None: - new_rank_zero_info( + rank_zero_info( f"DDP comm wrapper is provided, apply {ddp_comm_wrapper.__qualname__}({ddp_comm_hook.__qualname__})." ) ddp_comm_hook = ddp_comm_wrapper(ddp_comm_hook) - new_rank_zero_debug(f"Registering DDP comm hook: {ddp_comm_hook.__qualname__}.") + rank_zero_debug(f"Registering DDP comm hook: {ddp_comm_hook.__qualname__}.") model.register_comm_hook(state=ddp_comm_state, hook=ddp_comm_hook) # type: ignore[operator] @@ -374,7 +372,7 @@ def init_dist_connection( torch.distributed.init_process_group(torch_distributed_backend, rank=global_rank, world_size=world_size, **kwargs) # on rank=0 let everyone know training is starting - new_rank_zero_info( + rank_zero_info( f"{'-' * 100}\n" f"distributed_backend={torch_distributed_backend}\n" f"All distributed processes registered. Starting with {world_size} processes\n" @@ -402,21 +400,3 @@ def _collect_states_on_rank_zero(state: Dict[str, Any]) -> Dict[int, Any]: if not distributed_available(): return {0: state} return {rank: _broadcast_object_list(state, rank) for rank in range(torch.distributed.get_world_size())} - - -def rank_zero_info(*args: Any, **kwargs: Any) -> Any: - rank_zero_deprecation( - "pytorch_lightning.utilities.distributed.rank_zero_info has been deprecated in v1.6" - " and will be removed in v1.8." - " Use the equivalent function from the pytorch_lightning.utilities.rank_zero module instead." - ) - return new_rank_zero_info(*args, **kwargs) - - -def rank_zero_debug(*args: Any, **kwargs: Any) -> Any: - rank_zero_deprecation( - "pytorch_lightning.utilities.distributed.rank_zero_debug has been deprecated in v1.6" - " and will be removed in v1.8." - " Use the equivalent function from the pytorch_lightning.utilities.rank_zero module instead." - ) - return new_rank_zero_debug(*args, **kwargs) diff --git a/src/pytorch_lightning/utilities/warnings.py b/src/pytorch_lightning/utilities/warnings.py index 61883dad51144..45b382bc92214 100644 --- a/src/pytorch_lightning/utilities/warnings.py +++ b/src/pytorch_lightning/utilities/warnings.py @@ -12,17 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Warning-related utilities.""" - import warnings from typing import Any -from pytorch_lightning.utilities.rank_zero import LightningDeprecationWarning as NewLightningDeprecationWarning -from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation as new_rank_zero_deprecation -from pytorch_lightning.utilities.rank_zero import rank_zero_info as new_rank_zero_info -from pytorch_lightning.utilities.rank_zero import rank_zero_warn as new_rank_zero_warn +from pytorch_lightning.utilities.rank_zero import ( + LightningDeprecationWarning, + rank_zero_deprecation, + rank_zero_info, + rank_zero_warn, +) # enable our warnings -warnings.simplefilter("default", category=NewLightningDeprecationWarning) + +warnings.simplefilter("default", category=LightningDeprecationWarning) class PossibleUserWarning(UserWarning): @@ -33,42 +35,14 @@ class WarningCache(set): def warn(self, message: str, stacklevel: int = 5, **kwargs: Any) -> None: if message not in self: self.add(message) - new_rank_zero_warn(message, stacklevel=stacklevel, **kwargs) + rank_zero_warn(message, stacklevel=stacklevel, **kwargs) def deprecation(self, message: str, stacklevel: int = 5, **kwargs: Any) -> None: if message not in self: self.add(message) - new_rank_zero_deprecation(message, stacklevel=stacklevel, **kwargs) + rank_zero_deprecation(message, stacklevel=stacklevel, **kwargs) def info(self, message: str, stacklevel: int = 5, **kwargs: Any) -> None: if message not in self: self.add(message) - new_rank_zero_info(message, stacklevel=stacklevel, **kwargs) - - -def rank_zero_warn(*args: Any, **kwargs: Any) -> Any: - new_rank_zero_deprecation( - "pytorch_lightning.utilities.warnings.rank_zero_warn has been deprecated in v1.6" - " and will be removed in v1.8." - " Use the equivalent function from the pytorch_lightning.utilities.rank_zero module instead." - ) - return new_rank_zero_warn(*args, **kwargs) - - -def rank_zero_deprecation(*args: Any, **kwargs: Any) -> Any: - new_rank_zero_deprecation( - "pytorch_lightning.utilities.warnings.rank_zero_deprecation has been deprecated in v1.6" - " and will be removed in v1.8." - " Use the equivalent function from the pytorch_lightning.utilities.rank_zero module instead." - ) - return new_rank_zero_deprecation(*args, **kwargs) - - -class LightningDeprecationWarning(NewLightningDeprecationWarning): - def __init__(self, *args: Any, **kwargs: Any) -> None: - new_rank_zero_deprecation( - "pytorch_lightning.utilities.warnings.LightningDeprecationWarning has been deprecated in v1.6" - " and will be removed in v1.8." - " Use the equivalent class from the pytorch_lightning.utilities.rank_zero module instead." - ) - super().__init__(*args, **kwargs) + rank_zero_info(message, stacklevel=stacklevel, **kwargs) diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py index 3e3fbccc61aa1..489ef38f0c00f 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py @@ -275,40 +275,6 @@ def on_pretrain_routine_end(self, *args, **kwargs): trainer.fit(model) -def test_v1_8_0_rank_zero_imports(): - - import warnings - - from pytorch_lightning.utilities.distributed import rank_zero_debug, rank_zero_info - from pytorch_lightning.utilities.warnings import LightningDeprecationWarning, rank_zero_deprecation, rank_zero_warn - - with pytest.deprecated_call( - match="pytorch_lightning.utilities.distributed.rank_zero_debug has been deprecated in v1.6" - " and will be removed in v1.8." 
- ): - rank_zero_debug("foo") - with pytest.deprecated_call( - match="pytorch_lightning.utilities.distributed.rank_zero_info has been deprecated in v1.6" - " and will be removed in v1.8." - ): - rank_zero_info("foo") - with pytest.deprecated_call( - match="pytorch_lightning.utilities.warnings.rank_zero_warn has been deprecated in v1.6" - " and will be removed in v1.8." - ): - rank_zero_warn("foo") - with pytest.deprecated_call( - match="pytorch_lightning.utilities.warnings.rank_zero_deprecation has been deprecated in v1.6" - " and will be removed in v1.8." - ): - rank_zero_deprecation("foo") - with pytest.deprecated_call( - match="pytorch_lightning.utilities.warnings.LightningDeprecationWarning has been deprecated in v1.6" - " and will be removed in v1.8." - ): - warnings.warn("foo", LightningDeprecationWarning, stacklevel=5) - - def test_v1_8_0_on_before_accelerator_backend_setup(tmpdir): class TestCallback(Callback): def on_before_accelerator_backend_setup(self, *args, **kwargs): From 10adcd5d12043cb93cf1d72f7cf4cf4da387dd7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 31 Aug 2022 20:30:18 +0200 Subject: [PATCH 029/193] Remove the unused pyDeprecate dependency (#14472) * Remove the unused pyDeprecate dependency * CHANGELOG --- docs/source-pytorch/conf.py | 1 - requirements/pytorch/base.txt | 1 - src/pytorch_lightning/CHANGELOG.md | 3 +++ src/pytorch_lightning/loops/dataloader/evaluation_loop.py | 5 ----- src/pytorch_lightning/loops/dataloader/prediction_loop.py | 2 -- src/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py | 4 ---- src/pytorch_lightning/loops/epoch/prediction_epoch_loop.py | 2 -- src/pytorch_lightning/loops/loop.py | 3 --- 8 files changed, 3 insertions(+), 18 deletions(-) diff --git a/docs/source-pytorch/conf.py b/docs/source-pytorch/conf.py index e75c2ed8638b1..fefe6df85104a 100644 --- a/docs/source-pytorch/conf.py +++ b/docs/source-pytorch/conf.py @@ -336,7 +336,6 @@ def package_list_from_file(file): "comet-ml": "comet_ml", "neptune-client": "neptune", "hydra-core": "hydra", - "pyDeprecate": "deprecate", } MOCK_PACKAGES = [] if SPHINX_MOCK_REQUIREMENTS: diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index 0c72678229208..698d0ec4ad858 100644 --- a/requirements/pytorch/base.txt +++ b/requirements/pytorch/base.txt @@ -8,6 +8,5 @@ PyYAML>=5.4, <=6.0 fsspec[http]>=2021.05.0, !=2021.06.0, <2022.6.0 tensorboard>=2.9.1, <2.11.0 torchmetrics>=0.7.0, <0.9.3 # needed for using fixed compare_version -pyDeprecate>=0.3.1, <=0.3.2 packaging>=17.0, <=21.3 typing-extensions>=4.0.0, <4.3.1 diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index ad6d7fdea6c28..384e193f43b1f 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -48,6 +48,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Changed `trainer.should_stop` to not stop in between an epoch and run until `min_steps/min_epochs` only ([#13890](https://github.com/Lightning-AI/lightning/pull/13890)) +- The `pyDeprecate` dependency is no longer installed ([#14472](https://github.com/Lightning-AI/lightning/pull/14472)) + + - When using multiple loggers, by default checkpoints and profiler output now get saved to the log dir of the first logger in the list ([14325](https://github.com/Lightning-AI/lightning/pull/14325)) diff --git a/src/pytorch_lightning/loops/dataloader/evaluation_loop.py b/src/pytorch_lightning/loops/dataloader/evaluation_loop.py index c8ab1e9cc921b..5760dd63a0d57 100644 --- a/src/pytorch_lightning/loops/dataloader/evaluation_loop.py +++ b/src/pytorch_lightning/loops/dataloader/evaluation_loop.py @@ -17,7 +17,6 @@ from collections import ChainMap, OrderedDict from typing import Any, Iterable, List, Optional, Sequence, Tuple, Type, Union -from deprecate.utils import void from torch import Tensor from torch.utils.data.dataloader import DataLoader @@ -124,8 +123,6 @@ def on_skip(self) -> List: def on_run_start(self, *args: Any, **kwargs: Any) -> None: """Runs the ``_on_evaluation_model_eval``, ``_on_evaluation_start`` and ``_on_evaluation_epoch_start`` hooks.""" - void(*args, **kwargs) - data_fetcher_cls = _select_data_fetcher_type(self.trainer) self._data_fetcher = data_fetcher_cls(prefetch_batches=self.prefetch_batches) @@ -137,8 +134,6 @@ def on_run_start(self, *args: Any, **kwargs: Any) -> None: def advance(self, *args: Any, **kwargs: Any) -> None: """Performs evaluation on one single dataloader.""" - void(*args, **kwargs) - dataloader_idx = self.current_dataloader_idx dataloader = self.current_dataloader diff --git a/src/pytorch_lightning/loops/dataloader/prediction_loop.py b/src/pytorch_lightning/loops/dataloader/prediction_loop.py index 2faf66a3e0c3e..dcd91ef058919 100644 --- a/src/pytorch_lightning/loops/dataloader/prediction_loop.py +++ b/src/pytorch_lightning/loops/dataloader/prediction_loop.py @@ -1,6 +1,5 @@ from typing import Any, List, Optional, Sequence -from deprecate.utils import void from torch.utils.data import DataLoader from pytorch_lightning.loops.dataloader.dataloader_loop import DataLoaderLoop @@ -91,7 +90,6 @@ def on_run_start(self) -> None: # type: ignore[override] def advance(self, *args: Any, **kwargs: Any) -> None: """Predicts one entire dataloader.""" - void(*args, **kwargs) dataloader = self.current_dataloader if dataloader is not None: _set_sampler_epoch(dataloader, self.trainer.fit_loop.epoch_progress.current.processed) diff --git a/src/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py b/src/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py index ad90de1f03a97..b9e6ec78d47b0 100644 --- a/src/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py +++ b/src/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py @@ -16,7 +16,6 @@ from functools import lru_cache from typing import Any, Dict, Optional -from deprecate import void from torch.utils.data import DataLoader from pytorch_lightning.loops.loop import Loop @@ -81,7 +80,6 @@ def on_run_start( # type: ignore[override] dl_max_batches: maximum number of batches the dataloader can produce kwargs: the kwargs passed down to the hooks. 
""" - void(kwargs) self._dl_max_batches = dl_max_batches self._reload_dataloader_state_dict(data_fetcher) # creates the iterator inside the fetcher but returns `self` @@ -120,8 +118,6 @@ def advance( # type: ignore[override] Raises: StopIteration: If the current batch is None """ - void(dl_max_batches) - if not isinstance(data_fetcher, DataLoaderIterDataFetcher): batch_idx = self.batch_progress.current.ready batch = next(data_fetcher) diff --git a/src/pytorch_lightning/loops/epoch/prediction_epoch_loop.py b/src/pytorch_lightning/loops/epoch/prediction_epoch_loop.py index 64f4523d914f3..39717929787bb 100644 --- a/src/pytorch_lightning/loops/epoch/prediction_epoch_loop.py +++ b/src/pytorch_lightning/loops/epoch/prediction_epoch_loop.py @@ -2,7 +2,6 @@ from typing import Any, Dict, Iterator, List, Tuple import torch -from deprecate import void from pytorch_lightning.loops.loop import Loop from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper @@ -63,7 +62,6 @@ def on_run_start( # type: ignore[override] dl_max_batches: the maximum number of batches the current loader can produce num_dataloaders: the total number of dataloaders """ - void(dataloader_iter, dataloader_idx) self._dl_max_batches = dl_max_batches self._num_dataloaders = num_dataloaders # this call requires that `self.return_predictions` is set diff --git a/src/pytorch_lightning/loops/loop.py b/src/pytorch_lightning/loops/loop.py index 8b608b36de297..d1439870688c9 100644 --- a/src/pytorch_lightning/loops/loop.py +++ b/src/pytorch_lightning/loops/loop.py @@ -15,7 +15,6 @@ from abc import ABC, abstractmethod from typing import Any, Dict, Generic, Optional, Type, TypeVar, Union -from deprecate import void from torchmetrics import Metric import pytorch_lightning as pl @@ -225,14 +224,12 @@ def on_run_start(self, *args: Any, **kwargs: Any) -> None: Accepts all arguments passed to :attr:`run`. """ - void(*args, **kwargs) def on_advance_start(self, *args: Any, **kwargs: Any) -> None: """Hook to be called each time before :attr:`advance` is called. Accepts all arguments passed to :attr`run`. 
""" - void(*args, **kwargs) @abstractmethod def advance(self, *args: Any, **kwargs: Any) -> None: From 314d95180629c7fb13efecd13fd948497a13515b Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 31 Aug 2022 21:11:06 +0200 Subject: [PATCH 030/193] CI: Azure clear workspace (#14460) --- .azure/app-cloud-e2e.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.azure/app-cloud-e2e.yml b/.azure/app-cloud-e2e.yml index b4225e59138c8..3fd92512b7d40 100644 --- a/.azure/app-cloud-e2e.yml +++ b/.azure/app-cloud-e2e.yml @@ -56,6 +56,9 @@ jobs: name: "commands_and_api" timeoutInMinutes: "30" cancelTimeoutInMinutes: "2" + # values: https://docs.microsoft.com/en-us/azure/devops/pipelines/process/phases?view=azure-devops&tabs=yaml#workspace + workspace: + clean: all steps: - script: echo '##vso[task.setvariable variable=local_id]$(System.PullRequest.PullRequestNumber)' From a7c01c47549fcdf592945f98bba6996548a8b64e Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Wed, 31 Aug 2022 21:59:03 +0200 Subject: [PATCH 031/193] Introduce lightning connect (#14452) * update * update * update * update * Review of content * Formatting updates * Fomatting updates * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Updates based on new commits * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update * update * update * update * Introduce lightning connect (#14183) Co-authored-by: Luca Antiga Co-authored-by: Felonious-Spellfire Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Luca Antiga --- .../glossary/command_lines/command_lines.rst | 76 +++++++ .../glossary/restful_api/restful_api.rst | 53 +++++ docs/source-app/index.rst | 4 + .../build_command_line_interface/app.py | 36 ++++ .../build_command_line_interface/cli.rst | 144 +++++++++++++ .../cli_client.rst | 176 ++++++++++++++++ .../commands/__init__.py | 0 .../commands/notebook/__init__.py | 0 .../commands/notebook/run.py | 32 +++ .../example_command.py | 24 +++ .../build_command_line_interface/index.rst | 55 +++++ .../index_content.rst | 51 +++++ .../post_example.py | 27 +++ .../workflows/build_rest_api/add_api.rst | 102 +++++++++ .../workflows/build_rest_api/index.rst | 32 +++ .../build_rest_api/index_content.rst | 50 +++++ .../workflows/build_rest_api/models.py | 6 + .../workflows/build_rest_api/post_example.py | 26 +++ .../build_rest_api/post_example_pydantic.py | 33 +++ .../build_rest_api/request_validation.rst | 69 ++++++ src/lightning_app/cli/commands/__init__.py | 0 .../cli/commands/app_commands.py | 93 +++++++++ src/lightning_app/cli/commands/connection.py | 197 ++++++++++++++++++ src/lightning_app/cli/lightning_cli.py | 99 +++------ src/lightning_app/testing/testing.py | 13 +- src/lightning_app/utilities/cli_helpers.py | 14 +- src/lightning_app/utilities/commands/base.py | 48 +++-- tests/tests_app/cli/jsons/connect_1.json | 1 + tests/tests_app/cli/test_connect.py | 190 +++++++++++++++++ tests/tests_app/utilities/test_commands.py | 14 +- tests/tests_app_examples/idle_timeout/app.py | 2 +- .../test_commands_and_api.py | 16 +- 32 files changed, 1578 insertions(+), 105 deletions(-) create mode 100644 docs/source-app/glossary/command_lines/command_lines.rst create mode 100644 docs/source-app/glossary/restful_api/restful_api.rst create mode 100644 docs/source-app/workflows/build_command_line_interface/app.py create mode 100644 docs/source-app/workflows/build_command_line_interface/cli.rst 
create mode 100644 docs/source-app/workflows/build_command_line_interface/cli_client.rst create mode 100644 docs/source-app/workflows/build_command_line_interface/commands/__init__.py create mode 100644 docs/source-app/workflows/build_command_line_interface/commands/notebook/__init__.py create mode 100644 docs/source-app/workflows/build_command_line_interface/commands/notebook/run.py create mode 100644 docs/source-app/workflows/build_command_line_interface/example_command.py create mode 100644 docs/source-app/workflows/build_command_line_interface/index.rst create mode 100644 docs/source-app/workflows/build_command_line_interface/index_content.rst create mode 100644 docs/source-app/workflows/build_command_line_interface/post_example.py create mode 100644 docs/source-app/workflows/build_rest_api/add_api.rst create mode 100644 docs/source-app/workflows/build_rest_api/index.rst create mode 100644 docs/source-app/workflows/build_rest_api/index_content.rst create mode 100644 docs/source-app/workflows/build_rest_api/models.py create mode 100644 docs/source-app/workflows/build_rest_api/post_example.py create mode 100644 docs/source-app/workflows/build_rest_api/post_example_pydantic.py create mode 100644 docs/source-app/workflows/build_rest_api/request_validation.rst create mode 100644 src/lightning_app/cli/commands/__init__.py create mode 100644 src/lightning_app/cli/commands/app_commands.py create mode 100644 src/lightning_app/cli/commands/connection.py create mode 100644 tests/tests_app/cli/jsons/connect_1.json create mode 100644 tests/tests_app/cli/test_connect.py diff --git a/docs/source-app/glossary/command_lines/command_lines.rst b/docs/source-app/glossary/command_lines/command_lines.rst new file mode 100644 index 0000000000000..1ad4cdfeefab0 --- /dev/null +++ b/docs/source-app/glossary/command_lines/command_lines.rst @@ -0,0 +1,76 @@ +:orphan: + +############################ +Command-line Interface (CLI) +############################ + +**Audience:** Users looking to create a command line interface (CLI) for their application. + +---- + +************** +What is a CLI? +************** + +A Command-line Interface (CLI) is an user interface (UI) in a terminal to interact with a specific program. + +.. note:: + + The Lightning guideline to build CLI is `lightning ...` or ` ...`. + +As an example, Lightning provides a CLI to interact with your Lightning Apps and the `lightning.ai `_ platform as follows: + +.. code-block:: bash + + main + ├── create - Creates Lightning AI self-managed resources (clusters, etc…) + │ └── cluster - Creates a Lightning AI BYOC compute cluster with your cloud provider credentials. + ├── delete - Deletes Lightning AI self-managed resources (clusters, etc…) + │ └── cluster - Deletes a Lightning AI BYOC compute cluster and all associated cloud provider resources. + ├── fork - Forks an App. + ├── init - Initializes a Lightning App and/or Component. + │ ├── app + │ ├── component + │ ├── pl-app - Creates an App from your PyTorch Lightning source files. + │ └── react-ui - Creates a React UI to give a Lightning Component a React.js web UI + ├── install - Installs a Lightning App and/or Component. + │ ├── app + │ └── component + ├── list - Lists Lightning AI self-managed resources (clusters, etc…) + │ ├── apps - Lists your Lightning AI Apps. + │ └── clusters - Lists your Lightning AI BYOC compute clusters. + ├── login - Logs in to your lightning.ai account. + ├── logout - Logs out of your lightning.ai account. + ├── run - Runs a Lightning App locally or on the cloud. 
+ │ └── app - Runs an App from a file. + ├── show - Shows given resource. + │ ├── cluster - Groups cluster commands inside show. + │ │ └── logs - Shows cluster logs. + │ └── logs - Shows cloud application logs. By default prints logs for all currently available Components. + ├── stop - Stops your App. + └── tree - Shows the command tree of your CLI. + +Learn more about `Command-line interfaces here `_. + +---- + +********** +Learn more +********** + +.. raw:: html + +
+
+ +.. displayitem:: + :header: Develop a Command Line Interface + :description: Learn how to develop a CLI for your App. + :col_css: col-md-6 + :button_link: ../../workflows/build_command_line_interface/index_content.html + :height: 150 + +.. raw:: html + +
+
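A sketch, not part of the diff, of how an App adds its own entries to this CLI: the ``cli.rst`` guide introduced later in this patch does it through the ``configure_commands`` hook, and the flow, command name, and handler below are made up to mirror the ``add --name`` example that guide describes.

.. code-block:: python

    import lightning as L


    class Flow(L.LightningFlow):
        def __init__(self):
            super().__init__()
            self.names = []

        def add_name(self, name: str):
            # Runs server side, inside the running App, when a connected
            # client executes `lightning add --name=<value>`.
            print(f"Received name: {name}")
            self.names.append(name)

        def run(self):
            # The root flow's run() is called on every pass of the event loop.
            print(self.names)

        def configure_commands(self):
            # Each dictionary maps a command name to its server-side handler.
            return [{"add": self.add_name}]


    app = L.LightningApp(Flow())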
diff --git a/docs/source-app/glossary/restful_api/restful_api.rst b/docs/source-app/glossary/restful_api/restful_api.rst new file mode 100644 index 0000000000000..a1128f2234558 --- /dev/null +++ b/docs/source-app/glossary/restful_api/restful_api.rst @@ -0,0 +1,53 @@ +:orphan: + +########### +RESTful API +########### + +**Audience:** Users looking to create an API in their App to allow users to activate functionalities from external sources. + +---- + +********************** +What is a RESTful API? +********************** + +A RESTful API is a set of external URL routes exposed by a server that enables clients to trigger some functionalities, such as getting or putting some data, uploading files, etc.. + +This provides great flexibility for users as they can easily discover functionalities made available by the App Builders. + +The Lightning App framework supports the four primary HTTP methods: `GET`, `POST`, `PUT`, `DELETE`. + +These methods are guidelines to organize your RESTful Services and help users understand your functionalities. + +* **`GET`:** Reads data from the server. +* **`POST`:** Creates new resources. +* **`PUT`:** Updates/replaces existing resources. +* **`DELETE`:** Deletes resources. + +Learn more about `HTTP Methods for RESTful Services here `_. + +The Lightning App framework uses the popular `FastAPI `_ and `Pydantic `_ frameworks under the hood. This means you can use all their features while building your App. + +---- + +********** +Learn more +********** + +.. raw:: html + +
+
+ +.. displayitem:: + :header: Develop a RESTful API + :description: Learn how to develop an API for your App. + :col_css: col-md-6 + :button_link: ../../workflows/build_rest_api/index_content.html + :height: 150 + +.. raw:: html + +
+
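As a rough sketch of the pattern the new ``build_rest_api`` docs describe (their ``post_example.py`` and ``add_api.rst`` diffs are not part of this excerpt), a flow exposes routes through a ``configure_api`` hook, one route object per endpoint. The import path, route, handler, and return value below are assumptions made for illustration, not the exact contents of the added files.

.. code-block:: python

    import lightning as L
    from lightning.app.api import Post  # assumed import path for the HTTP route helpers


    class Flow(L.LightningFlow):
        def __init__(self):
            super().__init__()
            self.names = []

        def run(self):
            print(self.names)

        def handle_post(self, name: str):
            # Executed when a client sends POST /name to the running App.
            self.names.append(name)
            return f"The name {name} was registered"

        def configure_api(self):
            # Post/Get/Put/Delete mirror the four HTTP methods listed above.
            return [Post(route="/name", method=self.handle_post)]


    app = L.LightningApp(Flow())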
diff --git a/docs/source-app/index.rst b/docs/source-app/index.rst index 6c701ffd574d9..af0e7eb350827 100644 --- a/docs/source-app/index.rst +++ b/docs/source-app/index.rst @@ -220,8 +220,10 @@ Keep Learning Add a web user interface (UI) Add a web link Arrange app tabs + Develop a Command Line Interface (CLI) Develop a Lightning App Develop a Lightning Component + Develop a REST API Cache Work run calls Customize your cloud compute Extend an existing app @@ -264,10 +266,12 @@ Keep Learning App Components Tree Build Configuration + Command Line Interface (CLI) DAG Event Loop Environment Variables Frontend + REST API Sharing Components Scheduling Storage diff --git a/docs/source-app/workflows/build_command_line_interface/app.py b/docs/source-app/workflows/build_command_line_interface/app.py new file mode 100644 index 0000000000000..f6a398096b96c --- /dev/null +++ b/docs/source-app/workflows/build_command_line_interface/app.py @@ -0,0 +1,36 @@ +from commands.notebook.run import RunNotebook, RunNotebookConfig +from lit_jupyter import JupyterLab + +import lightning as L +from lightning.app.structures import Dict + + +class Flow(L.LightningFlow): + + def __init__(self): + super().__init__() + self.notebooks = Dict() + + # 1. Annotates the handler input with the Notebook config. + def run_notebook(self, config: RunNotebookConfig): + if config.name in self.notebooks: + return f"The Notebook {config.name} already exists." + else: + # 2. Dynamically creates the Notebook if it doesn't exist and runs it. + self.notebooks[config.name] = JupyterLab( + cloud_compute=L.CloudCompute(config.cloud_compute) + ) + self.notebooks[config.name].run() + return f"The Notebook {config.name} was created." + + def configure_commands(self): + # 3. Returns a list of dictionaries with the format: + # {"command_name": CustomClientCommand(method=self.custom_server_handler)} + return [{"run notebook": RunNotebook(method=self.run_notebook)}] + + def configure_layout(self): + # 4. Dynamically displays the Notebooks in the Lightning App View. + return [{"name": n, "content": w} for n, w in self.notebooks.items()] + + +app = L.LightningApp(Flow()) diff --git a/docs/source-app/workflows/build_command_line_interface/cli.rst b/docs/source-app/workflows/build_command_line_interface/cli.rst new file mode 100644 index 0000000000000..4608e5675ba92 --- /dev/null +++ b/docs/source-app/workflows/build_command_line_interface/cli.rst @@ -0,0 +1,144 @@ +:orphan: + +########################################### +1. Develop a CLI with server side code only +########################################### + +We are going to learn how to create a simple command-line interface. + +Lightning provides a flexible way to create complex CLI without much effort. + +---- + +************************* +1. Implement a simple CLI +************************* + +To create your first CLI, you need to override the :class:`~lightning_app.core.flow.LightningFlow.configure_commands` hook and return a list of dictionaries where the keys are the commands and the values are the server side handlers. + +First, create a file ``app.py`` and copy-paste the following code in to the file: + +.. literalinclude:: example_command.py + +---- + +************** +2. Run the App +************** + +Execute the following command in a terminal: + +.. code-block:: + + lightning run app app.py + +The following appears the terminal: + +.. code-block:: + + Your Lightning App is starting. This won't take long. + INFO: Your app has started. 
View it in your browser: http://127.0.0.1:7501/view + [] + +---- + +*************************** +3. Connect to a running App +*************************** + +In another terminal, connect to the running App. +When you connect to an App, the Lightning CLI is replaced by the App CLI. To exit the App CLI, you need to run ``lightning disconnect``. + +.. code-block:: + + lightning connect localhost + +To see a list of available commands: + +.. code-block:: + + lightning --help + You are connected to the cloud Lightning App: localhost. + Usage: lightning [OPTIONS] COMMAND [ARGS]... + + --help Show this message and exit. + + Lightning App Commands + add Description + +To find the arguments of the commands: + +.. code-block:: + + lightning add --help + You are connected to the cloud Lightning App: localhost. + Usage: lightning add [ARGS]... + + Options + name: Add description + +---- + +******************** +4. Execute a command +******************** + +Trigger the command line exposed by your App: + +.. code-block:: + + lightning add --name=my_name + WARNING: Lightning Command Line Interface is an experimental feature and unannounced changes are likely. + +In your first terminal, **Received name: my_name** and **["my_name"]** are printed. + +.. code-block:: + + Your Lightning App is starting. This won't take long. + INFO: Your app has started. View it in your browser: http://127.0.0.1:7501/view + [] + Received name: my_name + ["my_name] + +---- + +************************** +5. Disconnect from the App +************************** + +To exit the App CLI, you need to run ``lightning disconnect``. + +.. code-block:: + + lightning disconnect + You are disconnected from the local Lightning App. + +---- + +********** +Learn more +********** + +.. raw:: html + +
+
+ +.. displayitem:: + :header: 2. Implement a CLI with client side code execution + :description: Learn how to develop a complex API for your application + :col_css: col-md-6 + :button_link: cli_client.html + :height: 150 + +.. displayitem:: + :header: Develop a RESTful API + :description: Learn how to develop an API for your application. + :col_css: col-md-6 + :button_link: ../build_rest_api/index.html + :height: 150 + +.. raw:: html + +
+
diff --git a/docs/source-app/workflows/build_command_line_interface/cli_client.rst b/docs/source-app/workflows/build_command_line_interface/cli_client.rst new file mode 100644 index 0000000000000..96a2b41195492 --- /dev/null +++ b/docs/source-app/workflows/build_command_line_interface/cli_client.rst @@ -0,0 +1,176 @@ +:orphan: + +###################################################### +2. Develop a CLI with server and client code execution +###################################################### + +We've learned how to create a simple command-line interface. But in real-world use-cases, an App Builder wants to provide more complex functionalities where trusted code is executed on the client side. + +Lightning provides a flexible way to create complex CLI without much effort. + +In this example, we’ll create a CLI to dynamically run Notebooks: + + +---- + +************************** +1. Implement a complex CLI +************************** + +First of all, lets' create the following file structure: + +.. code-block:: python + + app_folder/ + commands/ + notebook/ + run.py + app.py + +We'll use the `Jupyter-Component `_. Follow the installation steps on the repo to install the Component. + +Add the following code to ``commands/notebook/run.py``: + +.. literalinclude:: commands/notebook/run.py + +Add the following code to ``app.py``: + +.. literalinclude:: app.py + +---- + +********************************************** +2. Run the App and check the API documentation +********************************************** + +In a terminal, run the following command and open ``http://127.0.0.1:7501/docs`` in a browser. + +.. code-block:: python + + lightning run app app.py + Your Lightning App is starting. This won't take long. + INFO: Your app has started. View it in your browser: http://127.0.0.1:7501/view + +---- + +*************************** +3. Connect to a running App +*************************** + +In another terminal, connect to the running App. +When you connect to an App, the Lightning CLI is replaced by the App CLI. To exit the App CLI, you need to run ``lightning disconnect``. + +.. code-block:: + + lightning connect localhost + + Storing `run_notebook` under /Users/thomas/.lightning/lightning_connection/commands/run_notebook.py + You can review all the downloaded commands under /Users/thomas/.lightning/lightning_connection/commands folder. + You are connected to the local Lightning App. + +To see a list of available commands: + +.. code-block:: + + lightning --help + + You are connected to the cloud Lightning App: localhost. + Usage: lightning [OPTIONS] COMMAND [ARGS]... + + --help Show this message and exit. + + Lightning App Commands + run notebook Description + + +To find the arguments of the commands: + +.. code-block:: + + lightning run notebook --help + + You are connected to the cloud Lightning App: localhost. + usage: notebook [-h] [--name NAME] [--cloud_compute CLOUD_COMPUTE] + + Run Notebook Parser + + optional arguments: + -h, --help show this help message and exit + --name NAME + --cloud_compute CLOUD_COMPUTE + +---- + +******************** +4. Execute a command +******************** + +And then you can trigger the command-line exposed by your App. + +Run the first Notebook with the following command: + +.. code-block:: python + + lightning run notebook --name="my_notebook" + WARNING: Lightning Command Line Interface is an experimental feature and unannounced changes are likely. + The notebook my_notebook was created. + +And run a second notebook. + +.. 
code-block:: python + + lightning run notebook --name="my_notebook_2" + WARNING: Lightning Command Line Interface is an experimental feature and unannounced changes are likely. + The notebook my_notebook_2 was created. + +Here is a recording of the Lightning App: + +.. raw:: html + +
+ +
+
+ +************************** +5. Disconnect from the App +************************** + +To exit the App CLI, you need to run **lightning disconnect**. + +.. code-block:: + + lightning disconnect + You are disconnected from the local Lightning App. + +---- + +********** +Learn more +********** + +.. raw:: html + +
+
+ +.. displayitem:: + :header: 1. Develop a CLI with server side code only + :description: Learn how to develop a simple CLI for your App. + :col_css: col-md-6 + :button_link: cli.html + :height: 150 + +.. displayitem:: + :header: Develop a RESTful API + :description: Learn how to develop an API for your App. + :col_css: col-md-6 + :button_link: ../build_rest_api/index.html + :height: 150 + +.. raw:: html + +
+
diff --git a/docs/source-app/workflows/build_command_line_interface/commands/__init__.py b/docs/source-app/workflows/build_command_line_interface/commands/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/docs/source-app/workflows/build_command_line_interface/commands/notebook/__init__.py b/docs/source-app/workflows/build_command_line_interface/commands/notebook/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/docs/source-app/workflows/build_command_line_interface/commands/notebook/run.py b/docs/source-app/workflows/build_command_line_interface/commands/notebook/run.py new file mode 100644 index 0000000000000..4e3bc67d9e538 --- /dev/null +++ b/docs/source-app/workflows/build_command_line_interface/commands/notebook/run.py @@ -0,0 +1,32 @@ +from argparse import ArgumentParser +from uuid import uuid4 + +from pydantic import BaseModel + +from lightning.app.utilities.commands import ClientCommand + + +class RunNotebookConfig(BaseModel): + name: str + cloud_compute: str + + +class RunNotebook(ClientCommand): + + def run(self): + # 1. Define your own argument parser. You can use argparse, click, etc... + parser = ArgumentParser(description='Run Notebook Parser') + parser.add_argument("--name", type=str, default=None) + parser.add_argument("--cloud_compute", type=str, default="cpu") + hparams = parser.parse_args() + + # 2. Invoke the server side handler by sending a payload. + response = self.invoke_handler( + config=RunNotebookConfig( + name=hparams.name or str(uuid4()), + cloud_compute=hparams.cloud_compute, + ), + ) + + # 3. Print the server response. + print(response) diff --git a/docs/source-app/workflows/build_command_line_interface/example_command.py b/docs/source-app/workflows/build_command_line_interface/example_command.py new file mode 100644 index 0000000000000..3c013548af0a3 --- /dev/null +++ b/docs/source-app/workflows/build_command_line_interface/example_command.py @@ -0,0 +1,24 @@ +from lightning import LightningApp, LightningFlow + + +class Flow(LightningFlow): + def __init__(self): + super().__init__() + self.names = [] + + def run(self): + print(self.names) + + def add_name(self, name: str): + print(f"Received name: {name}") + self.names.append(name) + + def configure_commands(self): + # This can be invoked with `lightning add --name=my_name` + commands = [ + {"add": self.add_name}, + ] + return commands + + +app = LightningApp(Flow()) diff --git a/docs/source-app/workflows/build_command_line_interface/index.rst b/docs/source-app/workflows/build_command_line_interface/index.rst new file mode 100644 index 0000000000000..1f1b1b16163fb --- /dev/null +++ b/docs/source-app/workflows/build_command_line_interface/index.rst @@ -0,0 +1,55 @@ +############################ +Command-line Interface (CLI) +############################ + +**Audience:** Users looking to create a command line interface (CLI) for their application. + +---- + +************** +What is a CLI? +************** + +A Command-line Interface (CLI) is an user interface (UI) in a terminal to interact with a specific program. + +.. note:: + + The Lightning guideline to build CLI is `lightning ...` or ` ...`. + +As an example, Lightning provides a CLI to interact with your Lightning Apps and the `lightning.ai `_ platform as follows: + +.. code-block:: bash + + main + ├── create - Creates Lightning AI self-managed resources (clusters, etc…) + │ └── cluster - Creates a Lightning AI BYOC compute cluster with your cloud provider credentials. 
+ ├── delete - Deletes Lightning AI self-managed resources (clusters, etc…) + │ └── cluster - Deletes a Lightning AI BYOC compute cluster and all associated cloud provider resources. + ├── fork - Forks an App. + ├── init - Initializes a Lightning App and/or Component. + │ ├── app + │ ├── component + │ ├── pl-app - Creates an App from your PyTorch Lightning source files. + │ └── react-ui - Creates a React UI to give a Lightning Component a React.js web UI + ├── install - Installs a Lightning App and/or Component. + │ ├── app + │ └── component + ├── list - Lists Lightning AI self-managed resources (clusters, etc…) + │ ├── apps - Lists your Lightning AI Apps. + │ └── clusters - Lists your Lightning AI BYOC compute clusters. + ├── login - Logs in to your lightning.ai account. + ├── logout - Logs out of your lightning.ai account. + ├── run - Runs a Lightning App locally or on the cloud. + │ └── app - Runs an App from a file. + ├── show - Shows given resource. + │ ├── cluster - Groups cluster commands inside show. + │ │ └── logs - Shows cluster logs. + │ └── logs - Shows cloud application logs. By default prints logs for all currently available Components. + ├── stop - Stops your App. + └── tree - Shows the command tree of your CLI. + +Learn more about `Command-line interfaces here `_. + +---- + +.. include:: index_content.rst diff --git a/docs/source-app/workflows/build_command_line_interface/index_content.rst b/docs/source-app/workflows/build_command_line_interface/index_content.rst new file mode 100644 index 0000000000000..ced369dbfd815 --- /dev/null +++ b/docs/source-app/workflows/build_command_line_interface/index_content.rst @@ -0,0 +1,51 @@ +************************************** +Develop a command line interface (CLI) +************************************** + +.. raw:: html + +
+
+ +.. displayitem:: + :header: 1. Develop a CLI with server side code only + :description: Learn how to develop a simple CLI for your application + :col_css: col-md-6 + :button_link: cli.html + :height: 150 + +.. displayitem:: + :header: 2. Develop a CLI with server and client code execution + :description: Learn how to develop a complex CLI for your application + :col_css: col-md-6 + :button_link: cli_client.html + :height: 150 + +.. raw:: html + +
+
+ + +---- + +********** +Learn more +********** + +.. raw:: html + +
+
+ +.. displayitem:: + :header: Develop a RESTful API + :description: Learn how to develop an API for your application. + :col_css: col-md-6 + :button_link: ../build_rest_api/index.html + :height: 150 + +.. raw:: html + +
+
diff --git a/docs/source-app/workflows/build_command_line_interface/post_example.py b/docs/source-app/workflows/build_command_line_interface/post_example.py new file mode 100644 index 0000000000000..c7f87f1cffdf7 --- /dev/null +++ b/docs/source-app/workflows/build_command_line_interface/post_example.py @@ -0,0 +1,27 @@ +import lightning as L +from lightning.app.api import Post + + +class Flow(L.LightningFlow): + + # 1. Define the state + def __init__(self): + super().__init__() + self.names = [] + + # 2. Optional, but used to validate names + def run(self): + print(self.names) + + # 3. Method executed when a request is received. + def handle_post(self, name: str): + self.names.append(name) + return f'The name {name} was registered' + + # 4. Defines this Component's Restful API. You can have several routes. + def configure_api(self): + # Your own defined route and handler + return [Post(route="/name", method=self.handle_post)] + + +app = L.LightningApp(Flow()) diff --git a/docs/source-app/workflows/build_rest_api/add_api.rst b/docs/source-app/workflows/build_rest_api/add_api.rst new file mode 100644 index 0000000000000..9538c7792f8d0 --- /dev/null +++ b/docs/source-app/workflows/build_rest_api/add_api.rst @@ -0,0 +1,102 @@ +:orphan: + +############################ +Add an API Route to your App +############################ + +In order to add a new route, you need to override the :class:`~lightning_app.core.flow.LightningFlow.configure_api` hook and return a list of :class:`~lightning_app.api.:class:`~lightning_app.api.http_methods.HttpMethod` such as :class:`~lightning_app.api.:class:`~lightning_app.api.http_methods.Get`, :class:`~lightning_app.api.:class:`~lightning_app.api.http_methods.Post`, :class:`~lightning_app.api.:class:`~lightning_app.api.http_methods.Put`, :class:`~lightning_app.api.:class:`~lightning_app.api.http_methods.Delete`. + +---- + +********************** +1. Create a simple App +********************** + +We're going to create a single route ``/name`` that takes a string input ``name`` and stores the value within the ``names`` attribute of the flow state. + +Create a file called ``app.py`` and copy-paste the following code in to the file: + +.. literalinclude:: post_example.py + +---- + +************** +2. Run the App +************** + +Execute the following command in a terminal: + +.. code-block:: + +lightning run app app.py + +The following appears: + +.. code-block:: + + Your Lightning App is starting. This won't take long. + INFO: Your app has started. View it in your browser: http://127.0.0.1:7501/view + +---- + +**************** +3. Check the API +**************** + +The Lightning App framework automatically generates API documentation from your App using `Swagger UI `_. + +You can access it by accessing the following URL: ``http://127.0.0.1:7501/docs`` in your browser and validate your API with the route ``/name`` directly from the documentation page as shown below. + +.. raw:: html + + + +Alternatively, you can invoke the route directly from a second terminal using `curl `_. + +.. code-block:: + + curl -X 'POST' \ + 'http://127.0.0.1:7501/name?name=my_name' \ + -H 'accept: application/json' \ + -d '' + + "The name my_name was registered" + +And you can see the following in your first terminal running your App. + +.. code-block:: + + Your Lightning App is starting. This won't take long. + INFO: Your app has started. 
View it in your browser: http://127.0.0.1:7501/view
    []
    ["my_name"]

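
If you prefer to stay in Python rather than use curl, the same route can be exercised with a small script. The snippet below is only an illustrative sketch (it is not part of the example App) and assumes the third-party ``requests`` package is installed and the App is still running on the port shown above:

.. code-block:: python

    import requests

    # POST to the /name route returned by `configure_api`; `name` travels as a query
    # parameter, exactly like the curl call above.
    response = requests.post("http://127.0.0.1:7501/name", params={"name": "my_name"})
    response.raise_for_status()
    print(response.json())  # "The name my_name was registered"

----

**********
Learn more
**********

.. raw:: html
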
+
+ +.. displayitem:: + :header: Add Requests Validation + :description: Learn how to use pydantic with your API. + :col_css: col-md-6 + :button_link: request_validation.html + :height: 150 + +.. displayitem:: + :header: Develop a Command Line Interface (CLI) + :description: Learn how to develop an CLI for your App. + :col_css: col-md-6 + :button_link: ../build_command_line_interface/index.html + :height: 150 + +.. raw:: html + +
+
diff --git a/docs/source-app/workflows/build_rest_api/index.rst b/docs/source-app/workflows/build_rest_api/index.rst new file mode 100644 index 0000000000000..269b810ec6442 --- /dev/null +++ b/docs/source-app/workflows/build_rest_api/index.rst @@ -0,0 +1,32 @@ +########### +RESTful API +########### + +**Audience:** Users looking to create an API in their App to allow users to activate functionalities from external sources. + +---- + +********************** +What is a RESTful API? +********************** + +A RESTful API is a set of external URL routes exposed by a server that enables clients to trigger some functionalities, such as getting or putting some data, uploading files, etc.. + +This provides great flexibility for users as they can easily discover functionalities made available by the App Builders. + +The Lightning App framework supports the four primary HTTP methods: `GET`, `POST`, `PUT`, `DELETE`. + +These methods are guidelines to organize your RESTful Services and help users understand your functionalities. + +* **`GET`:** Reads data from the server. +* **`POST`:** Creates new resources. +* **`PUT`:** Updates/replaces existing resources. +* **`DELETE`:** Deletes resources. + +Learn more about `HTTP Methods for RESTful Services here `_. + +The Lightning App framework uses the popular `FastAPI `_ and `Pydantic `_ frameworks under the hood. This means you can use all their features while building your App. + +---- + +.. include:: index_content.rst diff --git a/docs/source-app/workflows/build_rest_api/index_content.rst b/docs/source-app/workflows/build_rest_api/index_content.rst new file mode 100644 index 0000000000000..9f77225f24f59 --- /dev/null +++ b/docs/source-app/workflows/build_rest_api/index_content.rst @@ -0,0 +1,50 @@ +************** +Develop an API +************** + +.. raw:: html + +
+
+ +.. displayitem:: + :header: Add an API Route to your App + :description: Learn how to develop a simple API for your App. + :col_css: col-md-6 + :button_link: add_api.html + :height: 150 + +.. displayitem:: + :header: Add Requests Validation + :description: Learn how to use pydantic with your API. + :col_css: col-md-6 + :button_link: cli_client.html + :height: 150 + +.. raw:: html + +
+
+ +---- + +********** +Learn more +********** + +.. raw:: html + +
+
+ +.. displayitem:: + :header: Develop a Command-line Interface + :description: Learn how to develop an CLI for your App. + :col_css: col-md-6 + :button_link: ../build_command_line_interface/index.html + :height: 150 + +.. raw:: html + +
+
diff --git a/docs/source-app/workflows/build_rest_api/models.py b/docs/source-app/workflows/build_rest_api/models.py new file mode 100644 index 0000000000000..7ebb3ac8c8c17 --- /dev/null +++ b/docs/source-app/workflows/build_rest_api/models.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +# 1. Subclass the BaseModel and defines your payload format. +class NamePostConfig(BaseModel): + name: str diff --git a/docs/source-app/workflows/build_rest_api/post_example.py b/docs/source-app/workflows/build_rest_api/post_example.py new file mode 100644 index 0000000000000..4a306f176e4b0 --- /dev/null +++ b/docs/source-app/workflows/build_rest_api/post_example.py @@ -0,0 +1,26 @@ +import lightning as L +from lightning.app.api import Post + + +class Flow(L.LightningFlow): + + # 1. Define the state + def __init__(self): + super().__init__() + self.names = [] + + # 2. Optional, but used to validate names + def run(self): + print(self.names) + + # 3. Method executed when a request is received. + def handle_post(self, name: str): + self.names.append(name) + return f'The name {name} was registered' + + # 4. Defines this Component's Restful API. You can have several routes. + def configure_api(self): + return [Post(route="/name", method=self.handle_post)] + + +app = L.LightningApp(Flow()) diff --git a/docs/source-app/workflows/build_rest_api/post_example_pydantic.py b/docs/source-app/workflows/build_rest_api/post_example_pydantic.py new file mode 100644 index 0000000000000..e3c16ca35de48 --- /dev/null +++ b/docs/source-app/workflows/build_rest_api/post_example_pydantic.py @@ -0,0 +1,33 @@ +from models import NamePostConfig # 2. Import your custom model. + +import lightning as L +from lightning.app.api import Post + + +class Flow(L.LightningFlow): + + # 1. Define the state + def __init__(self): + super().__init__() + self.names = [] + + # 2. Optional, but used to validate names + def run(self): + print(self.names) + + # 3. Annotate your input with your custom pydantic model. + def handle_post(self, config: NamePostConfig): + self.names.append(config.name) + return f'The name {config} was registered' + + # 4. Defines this Component's Restful API. You can have several routes. + def configure_api(self): + return [ + Post( + route="/name", + method=self.handle_post, + ) + ] + + +app = L.LightningApp(Flow()) diff --git a/docs/source-app/workflows/build_rest_api/request_validation.rst b/docs/source-app/workflows/build_rest_api/request_validation.rst new file mode 100644 index 0000000000000..a34b2dd04910d --- /dev/null +++ b/docs/source-app/workflows/build_rest_api/request_validation.rst @@ -0,0 +1,69 @@ +:orphan: + +*********************** +Add Requests Validation +*********************** + +The Lightning App framework uses the popular `FastAPI `_ and `Pydantic `_ frameworks under the hood. This means you can use all their features while building your App. + +pydantic enables fast data validation and settings management using Python type annotations and FastAPI is a modern, fast (high-performance), web framework for building APIs. + +You can easily use pydantic by defining your own payload format. + +.. literalinclude:: models.py + +Then, type your handler input with your custom model. + +.. literalinclude:: post_example_pydantic.py + +After running the updated App, the App documentation ``/name`` has changed and takes JSON with ``{"name": ...}`` as input. + +.. 
figure:: https://pl-flash-data.s3.amazonaws.com/assets_lightning/rest_post_pydantic.png
    :alt: Rest API with pydantic
    :width: 100 %

You can invoke the RESTful API route ``/name`` with the following command:

.. code-block:: bash

    curl -X 'POST' \
      'http://127.0.0.1:7501/name' \
      -H 'accept: application/json' \
      -H 'Content-Type: application/json' \
      -d '{
        "name": "my_name"
      }'

.. note::

    Using curl, you can pass a JSON payload using the ``-d`` argument.

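
The same request can be sent from Python. The snippet below is only a sketch (it is not part of the App) using the third-party ``requests`` package; it also shows that a payload which does not match the ``NamePostConfig`` schema is rejected by FastAPI with a ``422`` response:

.. code-block:: python

    import requests

    # A valid payload: parsed into NamePostConfig before the handler runs.
    ok = requests.post("http://127.0.0.1:7501/name", json={"name": "my_name"})
    print(ok.status_code, ok.json())

    # An invalid payload: the required "name" field is missing, so validation fails.
    bad = requests.post("http://127.0.0.1:7501/name", json={"full_name": "my_name"})
    print(bad.status_code)  # 422 Unprocessable Entity

----

**********
Learn more
**********

.. raw:: html
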
+
+ +.. displayitem:: + :header: Add an API Route to your App + :description: Learn how to develop a simple API for your App. + :col_css: col-md-6 + :button_link: add_api.html + :height: 150 + +.. displayitem:: + :header: Develop a Command Line Interface (CLI) + :description: Learn how to develop an CLI for your App. + :col_css: col-md-6 + :button_link: ../build_command_line_interface/index.html + :height: 150 + +.. raw:: html + +
+
diff --git a/src/lightning_app/cli/commands/__init__.py b/src/lightning_app/cli/commands/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/src/lightning_app/cli/commands/app_commands.py b/src/lightning_app/cli/commands/app_commands.py new file mode 100644 index 0000000000000..0b08538e76ba6 --- /dev/null +++ b/src/lightning_app/cli/commands/app_commands.py @@ -0,0 +1,93 @@ +import os +import sys +from typing import Dict, Optional + +import requests + +from lightning_app.cli.commands.connection import _resolve_command_path +from lightning_app.utilities.cli_helpers import _retrieve_application_url_and_available_commands +from lightning_app.utilities.commands.base import _download_command +from lightning_app.utilities.enum import OpenAPITags + + +def _run_app_command(app_name: str, app_id: Optional[str]): + """Execute a function in a running App from its name.""" + # 1: Collect the url and comments from the running application + url, api_commands, _ = _retrieve_application_url_and_available_commands(app_id) + if url is None or api_commands is None: + raise Exception("We couldn't find any matching running App.") + + if not api_commands: + raise Exception("This application doesn't expose any commands yet.") + + full_command = "_".join(sys.argv) + + has_found = False + for command in list(api_commands): + if command in full_command: + has_found = True + break + + if not has_found: + raise Exception(f"The provided command isn't available in {list(api_commands)}") + + # 2: Send the command from the user + metadata = api_commands[command] + + # 3: Execute the command + if metadata["tag"] == OpenAPITags.APP_COMMAND: + _handle_command_without_client(command, metadata, url) + else: + _handle_command_with_client(command, metadata, app_name, app_id, url) + + if sys.argv[-1] != "--help": + print("Your command execution was successful.") + + +def _handle_command_without_client(command: str, metadata: Dict, url: str) -> None: + supported_params = list(metadata["parameters"]) + if "--help" == sys.argv[-1]: + print(f"Usage: lightning {command} [ARGS]...") + print(" ") + print("Options") + for param in supported_params: + print(f" {param}: Add description") + return + + provided_params = [param.replace("--", "") for param in sys.argv[1 + len(command.split("_")) :]] + + # TODO: Add support for more argument types. + if any("=" not in param for param in provided_params): + raise Exception("Please, use --x=y syntax when providing the command arguments.") + + if any(param.split("=")[0] not in supported_params for param in provided_params): + raise Exception(f"Some arguments need to be provided. The keys are {supported_params}.") + + # TODO: Encode the parameters and validate their type. 
+ query_parameters = "&".join(provided_params) + resp = requests.post(url + f"/command/{command}?{query_parameters}") + assert resp.status_code == 200, resp.json() + + +def _handle_command_with_client(command: str, metadata: Dict, app_name: str, app_id: Optional[str], url: str): + debug_mode = bool(int(os.getenv("DEBUG", "0"))) + + if app_name == "localhost": + target_file = metadata["cls_path"] + else: + target_file = _resolve_command_path(command) if debug_mode else _resolve_command_path(command) + + if debug_mode: + print(target_file) + + client_command = _download_command( + command, + metadata["cls_path"], + metadata["cls_name"], + app_id, + debug_mode=debug_mode, + target_file=target_file if debug_mode else _resolve_command_path(command), + ) + client_command._setup(command_name=command, app_url=url) + sys.argv = sys.argv[len(command.split("_")) :] + client_command.run() diff --git a/src/lightning_app/cli/commands/connection.py b/src/lightning_app/cli/commands/connection.py new file mode 100644 index 0000000000000..e4288219d3095 --- /dev/null +++ b/src/lightning_app/cli/commands/connection.py @@ -0,0 +1,197 @@ +import os +import shutil +from typing import List, Optional, Tuple + +import click + +from lightning_app.utilities.cli_helpers import _retrieve_application_url_and_available_commands +from lightning_app.utilities.cloud import _get_project +from lightning_app.utilities.network import LightningClient + + +@click.argument("app_name_or_id", required=True) +@click.option("-y", "--yes", required=False, is_flag=True, help="Whether to download the commands automatically.") +def connect(app_name_or_id: str, yes: bool = False): + """Connect to a Lightning App.""" + from lightning_app.utilities.commands.base import _download_command + + home = os.path.expanduser("~") + lightning_folder = os.path.join(home, ".lightning", "lightning_connection") + + if not os.path.exists(lightning_folder): + os.makedirs(lightning_folder) + + connected_file = os.path.join(lightning_folder, "connect.txt") + + if os.path.exists(connected_file): + with open(connected_file) as f: + result = f.readlines()[0].replace("\n", "") + + if result == app_name_or_id: + if app_name_or_id == "localhost": + click.echo("You are connected to the local Lightning App.") + else: + click.echo(f"You are already connected to the cloud Lightning App: {app_name_or_id}.") + else: + click.echo("You are already connected to a Lightning App. Please, use `lightning disconnect`.") + + elif app_name_or_id.startswith("localhost"): + + if app_name_or_id != "localhost": + raise Exception("You need to pass localhost to connect to the local Lightning App.") + + _, api_commands, __cached__ = _retrieve_application_url_and_available_commands(None) + + if api_commands is None: + raise Exception(f"The commands weren't found. 
Is your app {app_name_or_id} running ?") + + commands_folder = os.path.join(lightning_folder, "commands") + if not os.path.exists(commands_folder): + os.makedirs(commands_folder) + + for command_name, metadata in api_commands.items(): + if "cls_path" in metadata: + target_file = os.path.join(commands_folder, f"{command_name.replace(' ','_')}.py") + _download_command( + command_name, + metadata["cls_path"], + metadata["cls_name"], + None, + target_file=target_file, + ) + click.echo(f"Storing `{command_name}` under {target_file}") + click.echo(f"You can review all the downloaded commands under {commands_folder} folder.") + else: + with open(os.path.join(commands_folder, f"{command_name}.txt"), "w") as f: + f.write(command_name) + + with open(connected_file, "w") as f: + f.write(app_name_or_id + "\n") + + click.echo("You are connected to the local Lightning App.") + else: + _, api_commands, lightningapp_id = _retrieve_application_url_and_available_commands(app_name_or_id) + + if not api_commands: + client = LightningClient() + project = _get_project(client) + lightningapps = client.lightningapp_instance_service_list_lightningapp_instances(project.project_id) + click.echo( + "We didn't find a matching App. Here are the available Apps that could be " + f"connected to {[app.name for app in lightningapps.lightningapps]}." + ) + return + + assert lightningapp_id + + if not yes: + yes = click.confirm( + f"The Lightning App `{app_name_or_id}` provides a command-line (CLI). " + "Do you want to proceed and install its CLI ?" + ) + click.echo(" ") + + if yes: + commands_folder = os.path.join(lightning_folder, "commands") + if not os.path.exists(commands_folder): + os.makedirs(commands_folder) + + for command_name, metadata in api_commands.items(): + if "cls_path" in metadata: + target_file = os.path.join(commands_folder, f"{command_name}.py") + _download_command( + command_name, + metadata["cls_path"], + metadata["cls_name"], + lightningapp_id, + target_file=target_file, + ) + click.echo(f"Storing `{command_name}` under {target_file}") + click.echo(f"You can review all the downloaded commands under {commands_folder} folder.") + else: + with open(os.path.join(commands_folder, f"{command_name}.txt"), "w") as f: + f.write(command_name) + + click.echo(" ") + click.echo("The client interface has been successfully installed. ") + click.echo("You can now run the following commands:") + for command in api_commands: + click.echo(f" lightning {command}") + + with open(connected_file, "w") as f: + f.write(app_name_or_id + "\n") + f.write(lightningapp_id + "\n") + click.echo(" ") + click.echo(f"You are connected to the cloud Lightning App: {app_name_or_id}.") + + +def disconnect(logout: bool = False): + """Disconnect from an App.""" + home = os.path.expanduser("~") + lightning_folder = os.path.join(home, ".lightning", "lightning_connection") + connected_file = os.path.join(lightning_folder, "connect.txt") + if os.path.exists(connected_file): + with open(connected_file) as f: + result = f.readlines()[0].replace("\n", "") + + os.remove(connected_file) + commands_folder = os.path.join(lightning_folder, "commands") + if os.path.exists(commands_folder): + shutil.rmtree(commands_folder) + + if result == "localhost": + click.echo("You are disconnected from the local Lightning App.") + else: + click.echo(f"You are disconnected from the cloud Lightning App: {result}.") + else: + if not logout: + click.echo( + "You aren't connected to any Lightning App. " + "Please use `lightning connect app_name_or_id` to connect to one." 
+ ) + + +def _retrieve_connection_to_an_app() -> Tuple[Optional[str], Optional[str]]: + home = os.path.expanduser("~") + lightning_folder = os.path.join(home, ".lightning", "lightning_connection") + connected_file = os.path.join(lightning_folder, "connect.txt") + + if os.path.exists(connected_file): + with open(connected_file) as f: + lines = [line.replace("\n", "") for line in f.readlines()] + if len(lines) == 2: + return lines[0], lines[1] + return lines[0], None + return None, None + + +def _get_commands_folder() -> str: + home = os.path.expanduser("~") + lightning_folder = os.path.join(home, ".lightning", "lightning_connection") + return os.path.join(lightning_folder, "commands") + + +def _resolve_command_path(command: str) -> str: + return os.path.join(_get_commands_folder(), f"{command}.py") + + +def _list_app_commands() -> List[str]: + command_names = sorted( + n.replace(".py", "").replace(".txt", "").replace("_", " ") + for n in os.listdir(_get_commands_folder()) + if n != "__pycache__" + ) + if not command_names: + click.echo("The current Lightning App doesn't have commands.") + return [] + + click.echo("Usage: lightning [OPTIONS] COMMAND [ARGS]...") + click.echo("") + click.echo(" --help Show this message and exit.") + click.echo("") + click.echo("Lightning App Commands") + max_length = max(len(n) for n in command_names) + for command_name in command_names: + padding = (max_length + 1 - len(command_name)) * " " + click.echo(f" {command_name}{padding}Description") + return command_names diff --git a/src/lightning_app/cli/lightning_cli.py b/src/lightning_app/cli/lightning_cli.py index d590cbc667f8a..83a8efcb5334a 100644 --- a/src/lightning_app/cli/lightning_cli.py +++ b/src/lightning_app/cli/lightning_cli.py @@ -1,13 +1,11 @@ import logging import os import sys -from argparse import ArgumentParser from pathlib import Path from typing import List, Tuple, Union import arrow import click -import requests import rich from requests.exceptions import ConnectionError from rich.color import ANSI_COLOR_NAMES @@ -15,6 +13,13 @@ from lightning_app import __version__ as ver from lightning_app.cli import cmd_init, cmd_install, cmd_pl_init, cmd_react_ui_init from lightning_app.cli.cmd_clusters import AWSClusterManager +from lightning_app.cli.commands.app_commands import _run_app_command +from lightning_app.cli.commands.connection import ( + _list_app_commands, + _retrieve_connection_to_an_app, + connect, + disconnect, +) from lightning_app.cli.lightning_cli_create import create from lightning_app.cli.lightning_cli_delete import delete from lightning_app.cli.lightning_cli_list import get_list @@ -22,14 +27,9 @@ from lightning_app.runners.runtime import dispatch from lightning_app.runners.runtime_type import RuntimeType from lightning_app.utilities.app_logs import _app_logs_reader -from lightning_app.utilities.cli_helpers import ( - _arrow_time_callback, - _format_input_env_variables, - _retrieve_application_url_and_available_commands, -) +from lightning_app.utilities.cli_helpers import _arrow_time_callback, _format_input_env_variables from lightning_app.utilities.cloud import _get_project from lightning_app.utilities.cluster_logs import _cluster_logs_reader -from lightning_app.utilities.enum import OpenAPITags from lightning_app.utilities.install_components import register_all_external_components from lightning_app.utilities.login import Auth from lightning_app.utilities.network import LightningClient @@ -46,12 +46,29 @@ def get_app_url(runtime_type: RuntimeType, *args) -> str: def main(): - if 
len(sys.argv) == 1: - _main() - elif sys.argv[1] in _main.commands.keys() or sys.argv[1] == "--help": + # 1: Handle connection to a Lightning App. + if sys.argv[1] in ("connect", "disconnect"): _main() else: - app_command() + # 2: Collect the connection a Lightning App. + app_name, app_id = _retrieve_connection_to_an_app() + if app_name: + # 3: Handle development use case. + is_local_app = app_name == "localhost" + if is_local_app and sys.argv[1:3] == ["run", "app"]: + _main() + else: + if is_local_app: + click.echo("You are connected to the local Lightning App.") + else: + click.echo(f"You are connected to the cloud Lightning App: {app_name}.") + + if "help" in sys.argv[1]: + _list_app_commands() + else: + _run_app_command(app_name, app_id) + else: + _main() @click.group() @@ -66,6 +83,10 @@ def show(): pass +_main.command(connect) +_main.command(disconnect) + + @show.command() @click.argument("app_name", required=False) @click.argument("components", nargs=-1, required=False) @@ -250,6 +271,7 @@ def login(): def logout(): """Log out of your lightning.ai account.""" Auth().clear() + disconnect(logout=True) def _run_app( @@ -341,59 +363,6 @@ def run_app( _run_app(file, cloud, cluster_id, without_server, no_cache, name, blocking, open_ui, env) -def app_command(): - """Execute a function in a running application from its name.""" - from lightning_app.utilities.commands.base import _download_command - - logger.warn("Lightning Commands are a beta feature and APIs aren't stable yet.") - - debug_mode = bool(int(os.getenv("DEBUG", "0"))) - - parser = ArgumentParser() - parser.add_argument("--app_id", default=None, type=str, help="Optional argument to identify an application.") - hparams, argv = parser.parse_known_args() - - # 1: Collect the url and comments from the running application - url, api_commands = _retrieve_application_url_and_available_commands(hparams.app_id) - if url is None or api_commands is None: - raise Exception("We couldn't find any matching running app.") - - if not api_commands: - raise Exception("This application doesn't expose any commands yet.") - - command = argv[0] - - if command not in api_commands: - raise Exception(f"The provided command {command} isn't available in {list(api_commands)}") - - # 2: Send the command from the user - metadata = api_commands[command] - - # 3: Execute the command - if metadata["tag"] == OpenAPITags.APP_COMMAND: - # TODO: Improve what is current supported - kwargs = [v.replace("--", "") for v in argv[1:]] - - for p in kwargs: - if p.split("=")[0] not in metadata["parameters"]: - raise Exception(f"Some arguments need to be provided. The keys are {list(metadata['parameters'])}.") - # TODO: Encode the parameters and validate their type. 
- query_parameters = "&".join(kwargs) - resp = requests.post(url + f"/command/{command}?{query_parameters}") - assert resp.status_code == 200, resp.json() - else: - client_command = _download_command( - command, - metadata["cls_path"], - metadata["cls_name"], - hparams.app_id, - debug_mode=debug_mode, - ) - client_command._setup(command_name=command, app_url=url) - sys.argv = argv - client_command.run() - - @_main.group(hidden=True) def fork(): """Fork an application.""" diff --git a/src/lightning_app/testing/testing.py b/src/lightning_app/testing/testing.py index 1ccc8ba1ff63c..387592a4c178e 100644 --- a/src/lightning_app/testing/testing.py +++ b/src/lightning_app/testing/testing.py @@ -184,7 +184,10 @@ def run_app_in_cloud(app_folder: str, app_name: str = "app.py", extra_args: [str else: name = f"test-{TEST_APP_NAME}-" + str(int(time.time())) - # 3. Launch the application in the cloud from the Lightning CLI. + # 3. Disconnect from the App if any. + Popen("lightning disconnect", shell=True).wait() + + # 4. Launch the application in the cloud from the Lightning CLI. with tempfile.TemporaryDirectory() as tmpdir: env_copy = os.environ.copy() env_copy["PACKAGE_LIGHTNING"] = "1" @@ -214,10 +217,10 @@ def run_app_in_cloud(app_folder: str, app_name: str = "app.py", extra_args: [str ) process.wait() - # 4. Print your application name + # 5. Print your application name print(f"The Lightning App Name is: [bold magenta]{name}[/bold magenta]") - # 5. Create chromium browser, auth to lightning_app.ai and yield the admin and view pages. + # 6. Create chromium browser, auth to lightning_app.ai and yield the admin and view pages. with sync_playwright() as p: browser = p.chromium.launch(headless=bool(int(os.getenv("HEADLESS", "0")))) payload = {"apiKey": Config.api_key, "username": Config.username, "duration": "120000"} @@ -343,7 +346,7 @@ def on_error_callback(ws_app, *_): print(f"[{color}]{log_event.component_name}{padding}[/{color}] {date} {message}") yield message - # 5. Print your application ID + # 7. Print your application ID print( f"The Lightning Id Name : [bold magenta]{str(view_page.url).split('.')[0].split('//')[-1]}[/bold magenta]" ) @@ -377,6 +380,8 @@ def on_error_callback(ws_app, *_): except ApiException as e: print(f"Failed to delete {lightningapp.name}. Exception {e}") + Popen("lightning disconnect", shell=True).wait() + def wait_for(page, callback: Callable, *args, **kwargs) -> Any: import playwright diff --git a/src/lightning_app/utilities/cli_helpers.py b/src/lightning_app/utilities/cli_helpers.py index 068024b783bd5..5c885360ce11f 100644 --- a/src/lightning_app/utilities/cli_helpers.py +++ b/src/lightning_app/utilities/cli_helpers.py @@ -86,23 +86,21 @@ def _retrieve_application_url_and_available_commands(app_id_or_name_or_url: Opti resp = requests.get(url + "/openapi.json") if resp.status_code != 200: raise Exception(f"The server didn't process the request properly. Found {resp.json()}") - return url, _extract_command_from_openapi(resp.json()) + return url, _extract_command_from_openapi(resp.json()), None # 2: If no identifier has been provided, evaluate the local application - failed_locally = False - if app_id_or_name_or_url is None: try: url = f"http://localhost:{APP_SERVER_PORT}" resp = requests.get(f"{url}/openapi.json") if resp.status_code != 200: raise Exception(f"The server didn't process the request properly. 
Found {resp.json()}") - return url, _extract_command_from_openapi(resp.json()) + return url, _extract_command_from_openapi(resp.json()), None except requests.exceptions.ConnectionError: - failed_locally = True + pass # 3: If an identified was provided or the local evaluation has failed, evaluate the cloud. - if app_id_or_name_or_url or failed_locally: + else: client = LightningClient() project = _get_project(client) list_lightningapps = client.lightningapp_instance_service_list_lightningapp_instances(project.project_id) @@ -119,8 +117,8 @@ def _retrieve_application_url_and_available_commands(app_id_or_name_or_url: Opti resp = requests.get(lightningapp.status.url + "/openapi.json") if resp.status_code != 200: raise Exception(f"The server didn't process the request properly. Found {resp.json()}") - return lightningapp.status.url, _extract_command_from_openapi(resp.json()) - return None, None + return lightningapp.status.url, _extract_command_from_openapi(resp.json()), lightningapp.id + return None, None, None def _arrow_time_callback( diff --git a/src/lightning_app/utilities/commands/base.py b/src/lightning_app/utilities/commands/base.py index c74926f542744..512858f8d0ab9 100644 --- a/src/lightning_app/utilities/commands/base.py +++ b/src/lightning_app/utilities/commands/base.py @@ -30,6 +30,9 @@ def makedirs(path: str): class ClientCommand: + + DESCRIPTION = "" + def __init__(self, method: Callable, requirements: Optional[List[str]] = None) -> None: self.method = method flow = getattr(method, "__self__", None) @@ -58,7 +61,8 @@ def run(self, **cli_kwargs) -> None: """Overrides with the logic to execute on the client side.""" def invoke_handler(self, config: BaseModel) -> Dict[str, Any]: - resp = requests.post(self.app_url + f"/command/{self.command_name}", data=config.json()) + command = self.command_name.replace(" ", "_") + resp = requests.post(self.app_url + f"/command/{command}", data=config.json()) assert resp.status_code == 200, resp.json() return resp.json() @@ -75,31 +79,39 @@ def _download_command( cls_name: str, app_id: Optional[str] = None, debug_mode: bool = False, + target_file: Optional[str] = None, ) -> ClientCommand: # TODO: This is a skateboard implementation and the final version will rely on versioned # immutable commands for security concerns - tmpdir = osp.join(gettempdir(), f"{getuser()}_commands") - makedirs(tmpdir) - target_file = osp.join(tmpdir, f"{command_name}.py") - if app_id: - client = LightningClient() - project_id = _get_project(client).project_id - response = client.lightningapp_instance_service_list_lightningapp_instance_artifacts(project_id, app_id) - for artifact in response.artifacts: - if f"commands/{command_name}.py" == artifact.filename: - r = requests.get(artifact.url, allow_redirects=True) - with open(target_file, "wb") as f: - f.write(r.content) - else: - if not debug_mode: + command_name = command_name.replace(" ", "_") + tmpdir = None + if not target_file: + tmpdir = osp.join(gettempdir(), f"{getuser()}_commands") + makedirs(tmpdir) + target_file = osp.join(tmpdir, f"{command_name}.py") + + if not debug_mode: + if app_id: + if not os.path.exists(target_file): + client = LightningClient() + project_id = _get_project(client).project_id + response = client.lightningapp_instance_service_list_lightningapp_instance_artifacts(project_id, app_id) + for artifact in response.artifacts: + if f"commands/{command_name}.py" == artifact.filename: + resp = requests.get(artifact.url, allow_redirects=True) + + with open(target_file, "wb") as f: + 
f.write(resp.content) + else: shutil.copy(cls_path, target_file) - spec = spec_from_file_location(cls_name, cls_path if debug_mode else target_file) + spec = spec_from_file_location(cls_name, target_file) mod = module_from_spec(spec) sys.modules[cls_name] = mod spec.loader.exec_module(mod) command = getattr(mod, cls_name)(method=None, requirements=[]) - shutil.rmtree(tmpdir) + if tmpdir and os.path.exists(tmpdir): + shutil.rmtree(tmpdir) return command @@ -184,6 +196,7 @@ def _process_api_request(app, request: APIRequest) -> None: def _process_command_requests(app, request: CommandRequest) -> None: for command in app.commands: for command_name, method in command.items(): + command_name = command_name.replace(" ", "_") if request.method_name == command_name: # 2.1: Evaluate the method associated to a specific command. # Validation is done on the CLI side. @@ -213,6 +226,7 @@ def _commands_to_api(commands: List[Dict[str, Union[Callable, ClientCommand]]]) api = [] for command in commands: for k, v in command.items(): + k = k.replace(" ", "_") api.append( Post( f"/command/{k}", diff --git a/tests/tests_app/cli/jsons/connect_1.json b/tests/tests_app/cli/jsons/connect_1.json new file mode 100644 index 0000000000000..dc605a6354c8c --- /dev/null +++ b/tests/tests_app/cli/jsons/connect_1.json @@ -0,0 +1 @@ +{"openapi":"3.0.2","info":{"title":"FastAPI","version":"0.1.0"},"paths":{"/api/v1/state":{"get":{"summary":"Get State","operationId":"get_state_api_v1_state_get","parameters":[{"required":false,"schema":{"title":"X-Lightning-Type","type":"string"},"name":"x-lightning-type","in":"header"},{"required":false,"schema":{"title":"X-Lightning-Session-Uuid","type":"string"},"name":"x-lightning-session-uuid","in":"header"},{"required":false,"schema":{"title":"X-Lightning-Session-Id","type":"string"},"name":"x-lightning-session-id","in":"header"}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"post":{"summary":"Post State","operationId":"post_state_api_v1_state_post","parameters":[{"required":false,"schema":{"title":"X-Lightning-Type","type":"string"},"name":"x-lightning-type","in":"header"},{"required":false,"schema":{"title":"X-Lightning-Session-Uuid","type":"string"},"name":"x-lightning-session-uuid","in":"header"},{"required":false,"schema":{"title":"X-Lightning-Session-Id","type":"string"},"name":"x-lightning-session-id","in":"header"}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v1/spec":{"get":{"summary":"Get Spec","operationId":"get_spec_api_v1_spec_get","parameters":[{"required":false,"schema":{"title":"X-Lightning-Session-Uuid","type":"string"},"name":"x-lightning-session-uuid","in":"header"},{"required":false,"schema":{"title":"X-Lightning-Session-Id","type":"string"},"name":"x-lightning-session-id","in":"header"}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v1/delta":{"post":{"summary":"Post Delta","description":"This endpoint is used to make an update to the app state using delta diff, mainly used by 
streamlit to\nupdate the state.","operationId":"post_delta_api_v1_delta_post","parameters":[{"required":false,"schema":{"title":"X-Lightning-Type","type":"string"},"name":"x-lightning-type","in":"header"},{"required":false,"schema":{"title":"X-Lightning-Session-Uuid","type":"string"},"name":"x-lightning-session-uuid","in":"header"},{"required":false,"schema":{"title":"X-Lightning-Session-Id","type":"string"},"name":"x-lightning-session-id","in":"header"}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/healthz":{"get":{"summary":"Healthz","description":"Health check endpoint used in the cloud FastAPI servers to check the status periodically. This requires\nRedis to be installed for it to work.\n\n# TODO - Once the state store abstraction is in, check that too","operationId":"healthz_healthz_get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}}}}},"/user/command_without_client":{"post":{"tags":["app_api"],"summary":"Command Without Client","operationId":"command_without_client_user_command_without_client_post","parameters":[{"required":true,"schema":{"title":"Name","type":"string"},"name":"name","in":"query"}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/command/command_without_client":{"post":{"tags":["app_command"],"summary":"Command Without Client","operationId":"command_without_client_command_command_without_client_post","parameters":[{"required":true,"schema":{"title":"Name","type":"string"},"name":"name","in":"query"}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/command/command_with_client":{"post":{"tags":["app_client_command"],"summary":"Command With Client","operationId":"command_with_client_command_command_with_client_post","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/CustomConfig"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}},"cls_name":"CustomCommand","cls_path":"examples/app_commands_and_api/command.py"}},"/command/nested_command":{"post":{"tags":["app_command"],"summary":"Nested Command","operationId":"nested_command_command_nested_command_post","parameters":[{"required":true,"schema":{"title":"Name","type":"string"},"name":"name","in":"query"}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api{full_path}":{"get":{"summary":"Api Catch All","operationId":"api_catch_all_api_full_path__get","parameters":[{"required":true,"schema":{"title":"Full Path","type":"string"},"name":"full_path","in":"path"}],"responses":{"200":{"description":"Successful 
Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/{full_path}":{"get":{"summary":"Frontend Route","operationId":"frontend_route__full_path__get","parameters":[{"required":true,"schema":{"title":"Full Path","type":"string"},"name":"full_path","in":"path"}],"responses":{"200":{"description":"Successful Response","content":{"text/html":{"schema":{"type":"string"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}}},"components":{"schemas":{"CustomConfig":{"title":"CustomConfig","required":["name"],"type":"object","properties":{"name":{"title":"Name","type":"string"}}},"HTTPValidationError":{"title":"HTTPValidationError","type":"object","properties":{"detail":{"title":"Detail","type":"array","items":{"$ref":"#/components/schemas/ValidationError"}}}},"ValidationError":{"title":"ValidationError","required":["loc","msg","type"],"type":"object","properties":{"loc":{"title":"Location","type":"array","items":{"anyOf":[{"type":"string"},{"type":"integer"}]}},"msg":{"title":"Message","type":"string"},"type":{"title":"Error Type","type":"string"}}}}}} diff --git a/tests/tests_app/cli/test_connect.py b/tests/tests_app/cli/test_connect.py new file mode 100644 index 0000000000000..cfe95740a2020 --- /dev/null +++ b/tests/tests_app/cli/test_connect.py @@ -0,0 +1,190 @@ +import json +import os +import sys +from unittest.mock import MagicMock + +import click +import pytest +import requests + +from lightning_app import _PACKAGE_ROOT +from lightning_app.cli.commands.connection import ( + _list_app_commands, + _resolve_command_path, + _retrieve_connection_to_an_app, + connect, + disconnect, +) +from lightning_app.utilities import cli_helpers +from lightning_app.utilities.commands import base + + +def test_connect_disconnect_local(monkeypatch): + + disconnect() + + with pytest.raises(Exception, match="The commands weren't found. 
Is your app localhost running ?"): + connect("localhost", True) + + with open(os.path.join(os.path.dirname(__file__), "jsons/connect_1.json")) as f: + data = json.load(f) + + data["paths"]["/command/command_with_client"]["post"]["cls_path"] = os.path.join( + os.path.dirname(os.path.dirname(_PACKAGE_ROOT)), + data["paths"]["/command/command_with_client"]["post"]["cls_path"], + ) + + messages = [] + + def fn(msg): + messages.append(msg) + + monkeypatch.setattr(click, "echo", fn) + + response = MagicMock() + response.status_code = 200 + response.json.return_value = data + monkeypatch.setattr(requests, "get", MagicMock(return_value=response)) + connect("localhost", True) + assert _retrieve_connection_to_an_app() == ("localhost", None) + commands = _list_app_commands() + assert commands == ["command with client", "command without client", "nested command"] + command_path = _resolve_command_path("nested_command") + assert not os.path.exists(command_path) + command_path = _resolve_command_path("command_with_client") + assert os.path.exists(command_path) + home = os.path.expanduser("~") + s = "/" if sys.platform != "win32" else "\\" + command_folder_path = f"{home}{s}.lightning{s}lightning_connection{s}commands" + expected = [ + f"Storing `command_with_client` under {command_folder_path}{s}command_with_client.py", + f"You can review all the downloaded commands under {command_folder_path} folder.", + "You are connected to the local Lightning App.", + "Usage: lightning [OPTIONS] COMMAND [ARGS]...", + "", + " --help Show this message and exit.", + "", + "Lightning App Commands", + " command with client Description", + " command without client Description", + " nested command Description", + ] + assert messages == expected + + messages = [] + connect("localhost", True) + assert messages == ["You are connected to the local Lightning App."] + + messages = [] + disconnect() + assert messages == ["You are disconnected from the local Lightning App."] + messages = [] + disconnect() + assert messages == [ + "You aren't connected to any Lightning App. Please use `lightning connect app_name_or_id` to connect to one." 
+ ] + + assert _retrieve_connection_to_an_app() == (None, None) + + +def test_connect_disconnect_cloud(monkeypatch): + + disconnect() + + target_file = _resolve_command_path("command_with_client") + + if os.path.exists(target_file): + os.remove(target_file) + + with open(os.path.join(os.path.dirname(__file__), "jsons/connect_1.json")) as f: + data = json.load(f) + + data["paths"]["/command/command_with_client"]["post"]["cls_path"] = os.path.join( + os.path.dirname(os.path.dirname(_PACKAGE_ROOT)), + data["paths"]["/command/command_with_client"]["post"]["cls_path"], + ) + + messages = [] + + def fn(msg): + messages.append(msg) + + monkeypatch.setattr(click, "echo", fn) + + response = MagicMock() + response.status_code = 200 + response.json.return_value = data + monkeypatch.setattr(requests, "get", MagicMock(return_value=response)) + project = MagicMock() + project.project_id = "custom_project_name" + monkeypatch.setattr(cli_helpers, "_get_project", MagicMock(return_value=project)) + client = MagicMock() + lightningapps = MagicMock() + + app = MagicMock() + app.name = "example" + app.id = "1234" + + lightningapps.lightningapps = [app] + client.lightningapp_instance_service_list_lightningapp_instances.return_value = lightningapps + monkeypatch.setattr(cli_helpers, "LightningClient", MagicMock(return_value=client)) + + monkeypatch.setattr(base, "_get_project", MagicMock(return_value=project)) + + artifact = MagicMock() + artifact.filename = "commands/command_with_client.py" + artifacts = MagicMock() + artifacts.artifacts = [artifact] + client.lightningapp_instance_service_list_lightningapp_instance_artifacts.return_value = artifacts + monkeypatch.setattr(base, "LightningClient", MagicMock(return_value=client)) + + with open(data["paths"]["/command/command_with_client"]["post"]["cls_path"], "rb") as f: + response.content = f.read() + + connect("example", True) + assert _retrieve_connection_to_an_app() == ("example", "1234") + commands = _list_app_commands() + assert commands == ["command with client", "command without client", "nested command"] + command_path = _resolve_command_path("nested_command") + assert not os.path.exists(command_path) + command_path = _resolve_command_path("command_with_client") + assert os.path.exists(command_path) + home = os.path.expanduser("~") + s = "/" if sys.platform != "win32" else "\\" + command_folder_path = f"{home}{s}.lightning{s}lightning_connection{s}commands" + expected = [ + f"Storing `command_with_client` under {command_folder_path}{s}command_with_client.py", + f"You can review all the downloaded commands under {command_folder_path} folder.", + " ", + "The client interface has been successfully installed. ", + "You can now run the following commands:", + " lightning command_without_client", + " lightning command_with_client", + " lightning nested_command", + " ", + "You are connected to the cloud Lightning App: example.", + "Usage: lightning [OPTIONS] COMMAND [ARGS]...", + "", + " --help Show this message and exit.", + "", + "Lightning App Commands", + " command with client Description", + " command without client Description", + " nested command Description", + ] + assert messages == expected + + messages = [] + connect("example", True) + assert messages == ["You are already connected to the cloud Lightning App: example."] + + messages = [] + disconnect() + assert messages == ["You are disconnected from the cloud Lightning App: example."] + messages = [] + disconnect() + assert messages == [ + "You aren't connected to any Lightning App. 
Please use `lightning connect app_name_or_id` to connect to one." + ] + + assert _retrieve_connection_to_an_app() == (None, None) diff --git a/tests/tests_app/utilities/test_commands.py b/tests/tests_app/utilities/test_commands.py index 1be35a3a2e290..2e14f580c1122 100644 --- a/tests/tests_app/utilities/test_commands.py +++ b/tests/tests_app/utilities/test_commands.py @@ -10,7 +10,8 @@ from lightning import LightningFlow from lightning_app import LightningApp -from lightning_app.cli.lightning_cli import app_command +from lightning_app.cli.commands.connection import connect, disconnect +from lightning_app.cli.lightning_cli import _run_app_command from lightning_app.core.constants import APP_SERVER_PORT from lightning_app.runners import MultiProcessRuntime from lightning_app.testing.helpers import RunIf @@ -54,7 +55,7 @@ def sweep(self, config: SweepConfig): return True def configure_commands(self): - return [{"user_command": self.trigger_method}, {"sweep": SweepCommand(self.sweep)}] + return [{"user command": self.trigger_method}, {"sweep": SweepCommand(self.sweep)}] class DummyConfig(BaseModel): @@ -130,6 +131,7 @@ def target(): def test_configure_commands(monkeypatch): + """This test validates command can be used locally with connect and disconnect.""" process = Process(target=target) process.start() time_left = 15 @@ -142,14 +144,15 @@ def test_configure_commands(monkeypatch): time_left -= 0.1 sleep(0.5) - monkeypatch.setattr(sys, "argv", ["lightning", "user_command", "--name=something"]) - app_command() + monkeypatch.setattr(sys, "argv", ["lightning", "user", "command", "--name=something"]) + connect("localhost") + _run_app_command("localhost", None) sleep(0.5) state = AppState() state._request_state() assert state.names == ["something"] monkeypatch.setattr(sys, "argv", ["lightning", "sweep", "--sweep_name=my_name", "--num_trials=1"]) - app_command() + _run_app_command("localhost", None) time_left = 15 while time_left > 0: if process.exitcode == 0: @@ -157,3 +160,4 @@ def test_configure_commands(monkeypatch): sleep(0.1) time_left -= 0.1 assert process.exitcode == 0 + disconnect() diff --git a/tests/tests_app_examples/idle_timeout/app.py b/tests/tests_app_examples/idle_timeout/app.py index ff45f5332bcaa..218c9e0174d08 100644 --- a/tests/tests_app_examples/idle_timeout/app.py +++ b/tests/tests_app_examples/idle_timeout/app.py @@ -49,7 +49,7 @@ def run(self): assert stopped_status_sigterm.stage == WorkStageStatus.STOPPED assert stopped_status_sigterm.reason == WorkStopReasons.SIGTERM_SIGNAL_HANDLER # Note: Account for the controlplane, k8s, SIGTERM handler delays. 
- assert (stopped_status_pending.timestamp - succeeded_status.timestamp) < 10 + assert (stopped_status_pending.timestamp - succeeded_status.timestamp) < 20 assert (stopped_status_sigterm.timestamp - stopped_status_pending.timestamp) < 120 fs = filesystem() destination_path = artifacts_path(self.work) / pathlib.Path(*self.work.path.resolve().parts[1:]) diff --git a/tests/tests_app_examples/test_commands_and_api.py b/tests/tests_app_examples/test_commands_and_api.py index 8d84cf4847ebd..8fe3d024c8343 100644 --- a/tests/tests_app_examples/test_commands_and_api.py +++ b/tests/tests_app_examples/test_commands_and_api.py @@ -20,15 +20,18 @@ def test_commands_and_api_example_cloud() -> None: # 1: Collect the app_id app_id = admin_page.url.split("/")[-1] - # 2: Send the first command with the client - cmd = f"lightning command_with_client --name=this --app_id {app_id}" + # 2: Connect to the App + Popen(f"lightning connect {app_id} -y", shell=True).wait() + + # 3: Send the first command with the client + cmd = "lightning command with client --name=this" Popen(cmd, shell=True).wait() - # 3: Send the second command without a client - cmd = f"lightning command_without_client --name=is --app_id {app_id}" + # 4: Send the second command without a client + cmd = "lightning command without client --name=is" Popen(cmd, shell=True).wait() - # 4: Send a request to the Rest API directly. + # 5: Send a request to the Rest API directly. base_url = view_page.url.replace("/view", "").replace("/child_flow", "") resp = requests.post(base_url + "/user/command_without_client?name=awesome") assert resp.status_code == 200, resp.json() @@ -40,3 +43,6 @@ def test_commands_and_api_example_cloud() -> None: if "['this', 'is', 'awesome']" in log: has_logs = True sleep(1) + + # 5: Disconnect from the App + Popen("lightning disconnect", shell=True).wait() From e90ac769d6451b3f363f38d4934277613eb3d539 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Thu, 1 Sep 2022 02:30:18 +0530 Subject: [PATCH 032/193] Reset dataloaders on failure in tuner (#14372) --- src/pytorch_lightning/CHANGELOG.md | 6 +++ .../tuner/batch_size_scaling.py | 38 +++++++++++++------ .../tuner/test_scale_batch_size.py | 23 +++++++++++ 3 files changed, 55 insertions(+), 12 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 384e193f43b1f..f205cf4ac9052 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -141,6 +141,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed incorrect values after transferring data to a MPS device ([#13285](https://github.com/Lightning-AI/lightning/issues/13285)) +- Reset the dataloaders on OOM failure in batch size finder to use the last successful batch size ([#14372](https://github.com/Lightning-AI/lightning/pull/14372)) + + +- Fixed an issue to keep downscaling the batch size in case there hasn't been even a single successful optimal batch size with `mode="power"` ([#14372](https://github.com/Lightning-AI/lightning/pull/14372)) + + - Fixed an issue to avoid the impact of sanity check on `reload_dataloaders_every_n_epochs` for validation ([#13964](https://github.com/Lightning-AI/lightning/pull/13964)) diff --git a/src/pytorch_lightning/tuner/batch_size_scaling.py b/src/pytorch_lightning/tuner/batch_size_scaling.py index a1f8a2de4b9d8..e857d321d7669 100644 --- a/src/pytorch_lightning/tuner/batch_size_scaling.py +++ b/src/pytorch_lightning/tuner/batch_size_scaling.py @@ -126,6 +126,9 @@ def _run_power_scaling( trainer: "pl.Trainer", model: "pl.LightningModule", new_size: int, batch_arg_name: str, max_trials: int ) -> int: """Batch scaling mode where the size is doubled at each iteration until an OOM error is encountered.""" + # this flag is used to determine whether the previously scaled batch size, right before OOM, was a success or not + # if it was we exit, else we continue downscaling in case we haven't encountered a single optimal batch size + any_success = False for _ in range(max_trials): garbage_collection_cuda() @@ -137,22 +140,28 @@ def _run_power_scaling( trainer.tuner._run(model) # Double in size new_size, changed = _adjust_batch_size(trainer, batch_arg_name, factor=2.0, desc="succeeded") + + if not changed: + break + + # Force the train dataloader to reset as the batch size has changed + trainer.reset_train_dataloader(model) + trainer.reset_val_dataloader(model) + any_success = True except RuntimeError as exception: # Only these errors should trigger an adjustment if is_oom_error(exception): # If we fail in power mode, half the size and return garbage_collection_cuda() new_size, _ = _adjust_batch_size(trainer, batch_arg_name, factor=0.5, desc="failed") - break + # Force the train dataloader to reset as the batch size has changed + trainer.reset_train_dataloader(model) + trainer.reset_val_dataloader(model) + if any_success: + break else: raise # some other error not memory related - if changed: - # Force the train dataloader to reset as the batch size has changed - trainer.reset_train_dataloader(model) - trainer.reset_val_dataloader(model) - else: - break return new_size @@ -189,13 +198,13 @@ def _run_binsearch_scaling( else: new_size, changed = _adjust_batch_size(trainer, batch_arg_name, factor=2.0, desc="succeeded") - if changed: - # Force the train dataloader to reset as the batch size has changed - trainer.reset_train_dataloader(model) - trainer.reset_val_dataloader(model) - else: + if not changed: break + # Force the train dataloader to reset as the batch size has changed + trainer.reset_train_dataloader(model) + trainer.reset_val_dataloader(model) + except RuntimeError as exception: # Only these errors should trigger an adjustment if is_oom_error(exception): @@ -204,6 +213,11 @@ def _run_binsearch_scaling( high = new_size midval = (high + low) // 2 new_size, _ = _adjust_batch_size(trainer, batch_arg_name, value=midval, desc="failed") + + # Force the train dataloader to reset as the batch size has changed + trainer.reset_train_dataloader(model) + trainer.reset_val_dataloader(model) + if high - low <= 
1: break else: diff --git a/tests/tests_pytorch/tuner/test_scale_batch_size.py b/tests/tests_pytorch/tuner/test_scale_batch_size.py index e703b37491d26..a346eaa124dbb 100644 --- a/tests/tests_pytorch/tuner/test_scale_batch_size.py +++ b/tests/tests_pytorch/tuner/test_scale_batch_size.py @@ -319,3 +319,26 @@ def test_dataloader_reset_with_scale_batch_size(tmpdir, scale_method): assert trainer.train_dataloader.loaders.batch_size == new_batch_size assert trainer.val_dataloaders[0].batch_size == new_batch_size + + +@pytest.mark.parametrize("scale_method, expected_batch_size", [("power", 62), ("binsearch", 100)]) +@patch("pytorch_lightning.tuner.batch_size_scaling.is_oom_error", return_value=True) +def test_dataloader_batch_size_updated_on_failure(_, tmpdir, scale_method, expected_batch_size): + class CustomBatchSizeModel(BatchSizeModel): + def training_step(self, *_, **__): + if self.batch_size > 100: + raise RuntimeError + + def train_dataloader(self): + return DataLoader(RandomDataset(32, 1000), batch_size=self.batch_size) + + model = CustomBatchSizeModel(batch_size=16) + model.validation_step = None + model.training_epoch_end = None + scale_batch_size_kwargs = {"max_trials": 10, "steps_per_trial": 1, "init_val": 500, "mode": scale_method} + + trainer = Trainer(default_root_dir=tmpdir, max_epochs=2, auto_scale_batch_size=True) + new_batch_size = trainer.tune(model, scale_batch_size_kwargs=scale_batch_size_kwargs)["scale_batch_size"] + assert new_batch_size == model.batch_size + assert new_batch_size == expected_batch_size + assert trainer.train_dataloader.loaders.batch_size == expected_batch_size From c1a7254b7a628694d93a0a07902f418192e8366f Mon Sep 17 00:00:00 2001 From: Sherin Thomas Date: Thu, 1 Sep 2022 02:54:28 +0530 Subject: [PATCH 033/193] Dependency pinning (#14463) * deps pinned * Apply suggestions from code review Co-authored-by: Jirka Borovec --- requirements/app/base.txt | 2 +- src/lightning_app/CHANGELOG.md | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/requirements/app/base.txt b/requirements/app/base.txt index d40fe4ef8a85e..8e5d829cfa071 100644 --- a/requirements/app/base.txt +++ b/requirements/app/base.txt @@ -4,6 +4,6 @@ deepdiff>=5.7.0, <=5.8.1 starsessions>=1.2.1, <2.0 # strict fsspec>=2022.01.0, <=2022.7.1 s3fs>=2022.1.0, <=2022.7.1 -croniter # for now until we found something more robust. +croniter>=1.3.0, <1.4.0 # strict; TODO: for now until we find something more robust. traitlets<5.2.0 # Traitlets 5.2.X fails: https://github.com/ipython/traitlets/issues/741 arrow>=1.2.0, <=1.2.2 diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 18a3e4ac8223d..4a9bc13a093ba 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -31,9 +31,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Changed - Default values and parameter names for Lightning AI BYOC cluster management ([#14132](https://github.com/Lightning-AI/lightning/pull/14132)) - -### Changed - - Run the flow only if the state has changed from the previous execution ([#14076](https://github.com/Lightning-AI/lightning/pull/14076)) ### Fixed From 1bcb5c301df67c7db60ddd33b040c06d58bedfd5 Mon Sep 17 00:00:00 2001 From: Mansy Date: Thu, 1 Sep 2022 09:58:09 +0200 Subject: [PATCH 034/193] [App][CLI] Fix lightning cli --version (#14433) * [App][CLI] Fix lightning cli --version --- src/lightning_app/CHANGELOG.md | 2 +- src/lightning_app/cli/lightning_cli.py | 3 +- .../utilities/install_components.py | 261 ------------------ tests/tests_app/cli/test_cli.py | 5 +- .../test_install_external_component.py | 50 ---- 5 files changed, 6 insertions(+), 315 deletions(-) delete mode 100644 src/lightning_app/utilities/install_components.py delete mode 100644 tests/tests_app/components/test_install_external_component.py diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 4a9bc13a093ba..ac8c3b2c1d9c9 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -36,7 +36,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed - Unification of app template: moved `app.py` to root dir for `lightning init app ` template ([#13853](https://github.com/Lightning-AI/lightning/pull/13853)) - +- Fixing an issue with `lightning --version` command ([#14433](https://github.com/Lightning-AI/lightning/pull/14433)) ## [0.5.7] - 2022-08-22 diff --git a/src/lightning_app/cli/lightning_cli.py b/src/lightning_app/cli/lightning_cli.py index 83a8efcb5334a..0ed23f6577097 100644 --- a/src/lightning_app/cli/lightning_cli.py +++ b/src/lightning_app/cli/lightning_cli.py @@ -30,7 +30,6 @@ from lightning_app.utilities.cli_helpers import _arrow_time_callback, _format_input_env_variables from lightning_app.utilities.cloud import _get_project from lightning_app.utilities.cluster_logs import _cluster_logs_reader -from lightning_app.utilities.install_components import register_all_external_components from lightning_app.utilities.login import Auth from lightning_app.utilities.network import LightningClient @@ -74,7 +73,7 @@ def main(): @click.group() @click.version_option(ver) def _main(): - register_all_external_components() + pass @_main.group() diff --git a/src/lightning_app/utilities/install_components.py b/src/lightning_app/utilities/install_components.py deleted file mode 100644 index a948f9e23f7ed..0000000000000 --- a/src/lightning_app/utilities/install_components.py +++ /dev/null @@ -1,261 +0,0 @@ -import logging -import os -import platform -from typing import List, Union - -from packaging.version import Version - -import lightning_app - -_PACKAGE_REGISTRY_COMMANDS = { - "quick-start": [ - "curl https://gist.githubusercontent.com/tchaton/b81c8d8ba0f4dd39a47bfa607d81d6d5/raw/a5f84a40c03e349f659e219cc328ffec1b22b2c9/train_script.py > train_script.py", # noqa E501 - "curl https://gist.githubusercontent.com/tchaton/2df61d77a0adbd0f105b1c2dc01ae83a/raw/f5a86d9e0d05d391dec58545c0c31b43271a3541/requirements.txt > requirements.txt", # noqa E501 - ] -} - -logger = logging.getLogger(__name__) - -_PYTHON_GREATER_EQUAL_3_8_0 = Version(platform.python_version()) >= Version("3.8.0") -_LIGHTNING_ENTRYPOINT = "lightning_app.external_components" - - -def _ensure_package_exists(package_path): - package_init_file = os.path.join(package_path, "__init__.py") - if not 
os.path.exists(package_path): - os.mkdir(package_path) - if not os.path.isfile(package_init_file): - open(package_init_file, mode="a").close() - - -def _import_external_component_classes( - external_package_name: str, - external_classes: List[Union[lightning_app.LightningFlow, lightning_app.LightningWork]], - validate_external_classes: bool = True, -): - """Imports a list of external components either a LightningFlow or LightningWork. - - - How it works ? - - For each component that's not already installed, write an import line in the __init__.py of the package - """ - from lightning_app import _PROJECT_ROOT, LightningFlow, LightningWork - - external_package_path_parts = [_PROJECT_ROOT, "lightning", "components"] - - for sub_module in external_package_name.split("."): - external_package_path_parts.append(sub_module) - _ensure_package_exists(os.path.join(*external_package_path_parts)) - - external_package_path_parts.append("__init__.py") - external_components_import_file = os.path.join(*external_package_path_parts) - - new_imports = [] - - with open(external_components_import_file) as f: - existing_imports = set(f.readlines()) - - for external_cls in external_classes: - if issubclass(external_cls, (LightningWork, LightningFlow)): - import_line_str = f"from {external_cls.__module__} import {external_cls.__name__} # noqa E511 \n" - if import_line_str not in existing_imports: - new_imports.append(import_line_str) - elif validate_external_classes: - raise Exception( - f"Cannot import external component {external_cls.__name__} from {external_cls.__module__}. " - f"The provided external class isn't a LightningWork or LightningFlow." - ) - - with open(external_components_import_file, "a") as fw: - fw.writelines(new_imports) - - # TODO: import the classes to make sure it works - - -def register_all_external_components(): - - if _PYTHON_GREATER_EQUAL_3_8_0: - from importlib.metadata import entry_points - - lightning_entry_points = entry_points().get(_LIGHTNING_ENTRYPOINT, ()) - else: - from pkg_resources import iter_entry_points - - lightning_entry_points = iter_entry_points(_LIGHTNING_ENTRYPOINT) - - for entrypoint in lightning_entry_points: - try: - external_classes = entrypoint.load()() - _import_external_component_classes(entrypoint.name, external_classes, validate_external_classes=False) - except Exception as e: - logger.debug(f"Cannot register entrypoint: {entrypoint.name}, group: {entrypoint.group} error: {str(e)}") - - -def _pip_uninstall_component_package(component_package: str): - import subprocess - import sys - - p = subprocess.Popen( - [sys.executable, "-m", "pip", "uninstall", component_package, "-y"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - try: - _, stderr = p.communicate(timeout=60) # 30 seconds timeout - except subprocess.TimeoutExpired: - p.kill() - logger.debug(f"Timeout, did not uninstall {component_package}") - if p.returncode != 0: - logger.debug(f"Did not not uninstall {component_package}, got this error: {stderr}") - - -def _pip_install_component_package(component_package: str, force_reinstall=False): - """pip install a `component_package` and extract the package info from the installation logs.""" - import subprocess - import sys - - args = [ - sys.executable, - "-m", - "pip", - "install", - component_package, - "-v", - ] - - if force_reinstall: - args.append("--force-reinstall") - - p = subprocess.Popen( - args, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - try: - stdout, stderr = p.communicate(timeout=60 * 5) # 5 minutes timeout - except 
subprocess.TimeoutExpired: - p.kill() - raise Exception(f"Could not install {component_package}, installation timeout.") - - return - - # stdout = stdout.decode("utf-8") - # stderr = stderr.decode("utf-8") - - # if p.returncode != 0: - # raise Exception(f"Could not pip install {component_package}, got this error: {stderr}.") - # if not stdout: - # raise Exception("Could extract package name from installation logs. No logs found.") - - # component_package_info_str = "" - # for line in stdout.splitlines() + stderr.splitlines(): - # if "Installed lightning component package:" in line: - # component_package_info_str = line.replace("Installed lightning component package:", "").strip() - # break - - # if not component_package_info_str: - # # fixme (manskx): need to find a way to get package name from the installation - # logger.info("Could not extract lightning component info from installation logs. ") - # return None - - # try: - # component_package_info_dict = json.loads(component_package_info_str) - # assert component_package_info_dict["package"] - # assert component_package_info_dict["version"] - # assert component_package_info_dict["entry_point"] - # except Exception: - # _pip_uninstall_component_package(component_package) - # raise Exception( - # "Could not extract lightning component info from installation logs. Cannot parse component package info." - # ) - - # last_logline = stdout.splitlines()[-1] - # if "Requirement already satisfied" in last_logline: - # warnings.warn( - # f"The package {component_package} seems to be already installed but we extracted this " - # f"information {component_package_info_str}. " - # f"If this is not correct, please uninstall the package and try again. " - # ) - # elif "Successfully installed" not in last_logline: - # # If the installation is successful, the last log line is something like: - # # Successfully installed - - # warnings.warn( - # f"Lightning is not sure that the package {component_package} is correctly installed, " - # f"but we extracted this information {component_package_info_str}. " - # f"If this is not correct, please uninstall the package and try again. " - # ) - - # return component_package_info_dict - - -def _extract_public_package_name_from_entrypoint(entrypoint): - """The syntax for entry points is specified as follows: - - "entrypoint.name = entrypoint.value" - " = [.[.]][:.]" - This fun return the part - """ - return entrypoint.value.split(":")[0].strip().split(".")[0] - - -def install_external_component(component_package: str): - """Installs an external lightning component and make it avaiable for usage. `component` param can be a name of - python package to be installed from pypi, a zip file of a package or a githib url of the package. - - How it works? 
- - Run "pip install " - - Get entry points for `lightning_app.external_components` - - Register the components from the installed package - """ - - _pip_install_component_package(component_package) - - if component_package not in _PACKAGE_REGISTRY_COMMANDS: - return - - for command in _PACKAGE_REGISTRY_COMMANDS[component_package]: - os.system(command) - - # if not installed_component_package_info_dict: - # # fixme (manskx): install command should not register all entrypoints - # register_all_external_components() - # return - - # if _PYTHON_GREATER_EQUAL_3_8_0: - # from importlib.metadata import entry_points - - # lightning_entry_points = entry_points().get(_LIGHTNING_ENTRYPOINT, ()) - # else: - # from pkg_resources import iter_entry_points - - # lightning_entry_points = iter_entry_points(_LIGHTNING_ENTRYPOINT) - - # component_entry_points = [ - # e - # for e in lightning_entry_points - # if _extract_public_package_name_from_entrypoint(e) == installed_component_package_info_dict["package"] - # and e.name == installed_component_package_info_dict["entry_point"] - # ] - - # if not component_entry_points: - # _pip_uninstall_component_package(installed_component_package_info_dict["package"]) - # other_entrypoints = "- ".join([f"name: {e.name}, value: {e.value}" for e in lightning_entry_points]) - - # raise Exception( - # f"Could not find and entry point for package {installed_component_package_info_dict['package']}, " - # f"Make sure that the package is registered to 'lightning_app.external_components' " - # f"with the same package name. We found another ({len(lightning_entry_points)}) entrypoints " - # f"{other_entrypoints}" - # ) - - # for entrypoint in component_entry_points: - - # try: - # external_classes = entrypoint.load()() - # except Exception as e: - # _pip_uninstall_component_package(installed_component_package_info_dict["package"]) - # raise Exception( - # f"Cannot register entrypoint: {entrypoint.name}, value: {entrypoint.value} " - # f"group: {entrypoint.group} error: {str(e)}" - # ) - - # _import_external_component_classes(entrypoint.name, external_classes) diff --git a/tests/tests_app/cli/test_cli.py b/tests/tests_app/cli/test_cli.py index 6daa7be5b8e07..4e42fb2ab5b96 100644 --- a/tests/tests_app/cli/test_cli.py +++ b/tests/tests_app/cli/test_cli.py @@ -6,6 +6,7 @@ from click.testing import CliRunner from lightning_cloud.openapi import Externalv1LightningappInstance +from lightning_app import __version__ from lightning_app.cli.lightning_cli import _main, get_app_url, login, logout, run from lightning_app.cli.lightning_cli_create import create, create_cluster from lightning_app.cli.lightning_cli_delete import delete, delete_cluster @@ -171,4 +172,6 @@ def test_cli_logout(exists: mock.MagicMock, unlink: mock.MagicMock, creds: bool) unlink.assert_not_called() -# TODO: test for the other commands +def test_lightning_cli_version(): + res = os.popen("python -m lightning --version").read() + assert __version__ in res diff --git a/tests/tests_app/components/test_install_external_component.py b/tests/tests_app/components/test_install_external_component.py deleted file mode 100644 index 300ced3c30918..0000000000000 --- a/tests/tests_app/components/test_install_external_component.py +++ /dev/null @@ -1,50 +0,0 @@ -import os -import shutil -import subprocess - -import pytest -from tests_app import _PROJECT_ROOT - -from lightning_app.utilities.install_components import _pip_uninstall_component_package, install_external_component - -_PACKAGE_PATH = os.path.join(_PROJECT_ROOT, "tests", 
"tests_app", "components", "sample_package_repo") -_EXTERNAL_COMPONENT_PACKAGE = "external_lightning_component_package" -_COMPONENT_PACKAGE_TAR_PATH = os.path.join(_PACKAGE_PATH, "dist", f"{_EXTERNAL_COMPONENT_PACKAGE}-0.0.1.tar.gz") - - -@pytest.fixture(scope="function", autouse=True) -def cleanup_installation(): - _pip_uninstall_component_package(_EXTERNAL_COMPONENT_PACKAGE.replace("_", "-")) - shutil.rmtree(os.path.join(_PROJECT_ROOT, "lightning", "components", "myorg"), ignore_errors=True) - yield - _pip_uninstall_component_package(_EXTERNAL_COMPONENT_PACKAGE.replace("_", "-")) - shutil.rmtree(os.path.join(_PACKAGE_PATH, "dist"), ignore_errors=True) - shutil.rmtree(os.path.join(_PACKAGE_PATH, f"{_EXTERNAL_COMPONENT_PACKAGE}.egg-info"), ignore_errors=True) - shutil.rmtree(os.path.join(_PROJECT_ROOT, "lightning", "components", "myorg"), ignore_errors=True) - - -@pytest.mark.usefixtures("cleanup_installation") -def test_install_external_component(): - with subprocess.Popen( - ["python", "setup.py", "sdist"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - cwd=_PACKAGE_PATH, - ) as proc: - proc.wait() - - assert os.path.exists(_COMPONENT_PACKAGE_TAR_PATH) - - install_external_component(_COMPONENT_PACKAGE_TAR_PATH) - - # TODO (tchaton) Enable once stable. - # from lightning_app.components.myorg.lightning_modules import MyCustomLightningFlow, MyCustomLightningWork - - # assert ( - # MyCustomLightningWork.special_method() - # == "Hi, I'm an external lightning work component and can be added to any lightning project." - # ) - # assert ( - # MyCustomLightningFlow.special_method() - # == "Hi, I'm an external lightning flow component and can be added to any lightning project." - # ) From 4a81b9b99e57d63ca8611287abed3fcf8b5ab10a Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Thu, 1 Sep 2022 16:38:50 +0530 Subject: [PATCH 035/193] Update changelog after v1.7.4 release (#14479) --- src/pytorch_lightning/CHANGELOG.md | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index f205cf4ac9052..9d4323548cb7e 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -21,9 +21,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added support for saving sharded optimizer state dict outside of `DDPShardedStrategy` ([#14208](https://github.com/PyTorchLightning/pytorch-lightning/pull/14208)) -- Added an environment variable `PL_DISABLE_FORK` that can be used to disable all forking in the Trainer ([#14319](https://github.com/Lightning-AI/lightning/issues/14319)) - - ### Changed @@ -133,30 +130,25 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Fixed -- Fixed `LightningDataModule` hparams parsing ([#12806](https://github.com/PyTorchLightning/pytorch-lightning/pull/12806)) - - -- Reset epoch progress with batch size scaler ([#13846](https://github.com/Lightning-AI/lightning/pull/13846)) - -- Fixed incorrect values after transferring data to a MPS device ([#13285](https://github.com/Lightning-AI/lightning/issues/13285)) - - - Reset the dataloaders on OOM failure in batch size finder to use the last successful batch size ([#14372](https://github.com/Lightning-AI/lightning/pull/14372)) - Fixed an issue to keep downscaling the batch size in case there hasn't been even a single successful optimal batch size with `mode="power"` ([#14372](https://github.com/Lightning-AI/lightning/pull/14372)) -- Fixed an issue to avoid the impact of sanity check on `reload_dataloaders_every_n_epochs` for validation ([#13964](https://github.com/Lightning-AI/lightning/pull/13964)) +## [1.7.4] - 2022-08-31 -- Fixed restoring the trainer after using `lr_find()` so that the correct LR schedule is used for the actual training ([#14113](https://github.com/Lightning-AI/lightning/pull/14113)) - +### Added -- Reset epoch progress with batch size scaler ([#13846](https://github.com/Lightning-AI/lightning/pull/13846) +- Added an environment variable `PL_DISABLE_FORK` that can be used to disable all forking in the Trainer ([#14319](https://github.com/Lightning-AI/lightning/issues/14319)) +### Fixed - Fixed `LightningDataModule` hparams parsing ([#12806](https://github.com/PyTorchLightning/pytorch-lightning/pull/12806)) +- Reset epoch progress with batch size scaler ([#13846](https://github.com/Lightning-AI/lightning/pull/13846)) +- Fixed restoring the trainer after using `lr_find()` so that the correct LR schedule is used for the actual training ([#14113](https://github.com/Lightning-AI/lightning/pull/14113)) +- Fixed incorrect values after transferring data to an MPS device ([#14368](https://github.com/Lightning-AI/lightning/pull/14368)) ## [1.7.3] - 2022-08-25 From 764b3482492891efed53b1b08b35ac3ce3da06af Mon Sep 17 00:00:00 2001 From: Adam Bobowski <100693297+adam-lightning@users.noreply.github.com> Date: Thu, 1 Sep 2022 13:12:04 +0200 Subject: [PATCH 036/193] [App] Handling s3 rate limiting in framework (#14411) bump of fsspec and s3fs to version supporting retry on "SlowDown" response --- requirements/app/base.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/app/base.txt b/requirements/app/base.txt index 8e5d829cfa071..50a6d6c1d6e24 100644 --- a/requirements/app/base.txt +++ b/requirements/app/base.txt @@ -2,8 +2,8 @@ lightning-cloud==0.5.3 packaging deepdiff>=5.7.0, <=5.8.1 starsessions>=1.2.1, <2.0 # strict -fsspec>=2022.01.0, <=2022.7.1 -s3fs>=2022.1.0, <=2022.7.1 +fsspec>=2022.5.0, <=2022.7.1 +s3fs>=2022.5.0, <=2022.7.1 croniter>=1.3.0, <1.4.0 # strict; TODO: for now until we find something more robust. 
traitlets<5.2.0 # Traitlets 5.2.X fails: https://github.com/ipython/traitlets/issues/741 arrow>=1.2.0, <=1.2.2 From 28e18881a9ad2298169c78ad9ae109191e201c2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 1 Sep 2022 15:47:40 +0200 Subject: [PATCH 037/193] Mark stage argument in hooks as required (#14064) Co-authored-by: rohitgr7 --- .../advanced/training_tricks.rst | 2 +- docs/source-pytorch/data/datamodule.rst | 21 +++++------ examples/pl_domain_templates/imagenet.py | 4 +- examples/pl_loops/kfold.py | 2 +- .../cli/pl-app-template/core/callbacks.py | 8 ++-- src/pytorch_lightning/callbacks/callback.py | 4 +- .../callbacks/device_stats_monitor.py | 2 +- .../callbacks/early_stopping.py | 2 +- src/pytorch_lightning/callbacks/finetuning.py | 2 +- .../callbacks/model_checkpoint.py | 2 +- .../callbacks/progress/base.py | 2 +- .../callbacks/progress/rich_progress.py | 2 +- src/pytorch_lightning/callbacks/pruning.py | 2 +- .../callbacks/stochastic_weight_avg.py | 2 +- src/pytorch_lightning/cli.py | 2 +- src/pytorch_lightning/core/hooks.py | 4 +- src/pytorch_lightning/demos/boring_classes.py | 10 ++--- .../demos/mnist_datamodule.py | 2 +- src/pytorch_lightning/profilers/advanced.py | 2 +- src/pytorch_lightning/profilers/profiler.py | 6 +-- src/pytorch_lightning/profilers/pytorch.py | 2 +- .../trainer/configuration_validator.py | 7 ---- tests/tests_pytorch/accelerators/test_ipu.py | 3 +- .../callbacks/progress/test_base_progress.py | 2 +- .../checkpointing/test_model_checkpoint.py | 2 +- tests/tests_pytorch/core/test_datamodules.py | 4 +- tests/tests_pytorch/helpers/datamodules.py | 7 ++-- tests/tests_pytorch/models/test_hparams.py | 2 +- .../plugins/precision/hpu/test_hpu.py | 3 +- tests/tests_pytorch/strategies/test_ddp.py | 3 +- .../strategies/test_deepspeed_strategy.py | 6 +-- .../trainer/test_config_validator.py | 37 +------------------ 32 files changed, 56 insertions(+), 105 deletions(-) diff --git a/docs/source-pytorch/advanced/training_tricks.rst b/docs/source-pytorch/advanced/training_tricks.rst index 76d2f43176433..71a778fa09557 100644 --- a/docs/source-pytorch/advanced/training_tricks.rst +++ b/docs/source-pytorch/advanced/training_tricks.rst @@ -326,7 +326,7 @@ The :class:`~pytorch_lightning.core.datamodule.LightningDataModule` class provid def prepare_data(self): MNIST(self.data_dir, download=True) - def setup(self, stage: Optional[str] = None): + def setup(self, stage: str): self.mnist = MNIST(self.data_dir) def train_loader(self): diff --git a/docs/source-pytorch/data/datamodule.rst b/docs/source-pytorch/data/datamodule.rst index 62a0f9d0d54b6..fbee2e80e4ea2 100644 --- a/docs/source-pytorch/data/datamodule.rst +++ b/docs/source-pytorch/data/datamodule.rst @@ -84,7 +84,7 @@ The equivalent DataModule just organizes the same exact code, but makes it reusa self.data_dir = data_dir self.batch_size = batch_size - def setup(self, stage: Optional[str] = None): + def setup(self, stage: str): self.mnist_test = MNIST(self.data_dir, train=False) self.mnist_predict = MNIST(self.data_dir, train=False) mnist_full = MNIST(self.data_dir, train=True) @@ -102,7 +102,7 @@ The equivalent DataModule just organizes the same exact code, but makes it reusa def predict_dataloader(self): return DataLoader(self.mnist_predict, batch_size=self.batch_size) - def teardown(self, stage: Optional[str] = None): + def teardown(self, stage: str): # Used to clean-up when the run is finished ... 
@@ -141,18 +141,18 @@ Here's a more realistic, complex DataModule that shows how much more reusable th MNIST(self.data_dir, train=True, download=True) MNIST(self.data_dir, train=False, download=True) - def setup(self, stage: Optional[str] = None): + def setup(self, stage: str): # Assign train/val datasets for use in dataloaders - if stage == "fit" or stage is None: + if stage == "fit": mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) # Assign test dataset for use in dataloader(s) - if stage == "test" or stage is None: + if stage == "test": self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform) - if stage == "predict" or stage is None: + if stage == "predict": self.mnist_predict = MNIST(self.data_dir, train=False, transform=self.transform) def train_dataloader(self): @@ -226,15 +226,15 @@ There are also data operations you might want to perform on every GPU. Use :meth class MNISTDataModule(pl.LightningDataModule): - def setup(self, stage: Optional[str] = None): + def setup(self, stage: str): # Assign Train/val split(s) for use in Dataloaders - if stage in (None, "fit"): + if stage == "fit": mnist_full = MNIST(self.data_dir, train=True, download=True, transform=self.transform) self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) # Assign Test split(s) for use in Dataloaders - if stage in (None, "test"): + if stage == "test": self.mnist_test = MNIST(self.data_dir, train=False, download=True, transform=self.transform) @@ -256,8 +256,7 @@ For eg., if you are working with NLP task where you need to tokenize the text an This method expects a ``stage`` argument. -It is used to separate setup logic for ``trainer.{fit,validate,test,predict}``. If ``setup`` is called with ``stage=None``, -we assume all stages have been set-up. +It is used to separate setup logic for ``trainer.{fit,validate,test,predict}``. .. note:: :ref:`setup` is called from every process across all the nodes. Setting state here is recommended. .. note:: :ref:`teardown` can be used to clean up the state. It is also called from every process across all the nodes. diff --git a/examples/pl_domain_templates/imagenet.py b/examples/pl_domain_templates/imagenet.py index 93284963db4b4..efb9c40eea061 100644 --- a/examples/pl_domain_templates/imagenet.py +++ b/examples/pl_domain_templates/imagenet.py @@ -125,7 +125,7 @@ def configure_optimizers(self): scheduler = lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.1 ** (epoch // 30)) return [optimizer], [scheduler] - def setup(self, stage: Optional[str] = None): + def setup(self, stage: str): if isinstance(self.trainer.strategy, ParallelStrategy): # When using a single GPU per process and per `DistributedDataParallel`, we need to divide the batch size # ourselves based on the total number of GPUs we have @@ -133,7 +133,7 @@ def setup(self, stage: Optional[str] = None): self.batch_size = int(self.batch_size / num_processes) self.workers = int(self.workers / num_processes) - if stage in (None, "fit"): + if stage == "fit": normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) train_dir = os.path.join(self.data_path, "train") self.train_dataset = datasets.ImageFolder( diff --git a/examples/pl_loops/kfold.py b/examples/pl_loops/kfold.py index 028e0be698972..529f0c6e1b162 100644 --- a/examples/pl_loops/kfold.py +++ b/examples/pl_loops/kfold.py @@ -83,7 +83,7 @@ def prepare_data(self) -> None: # download the data. 
MNIST(DATASETS_PATH, transform=T.Compose([T.ToTensor(), T.Normalize(mean=(0.5,), std=(0.5,))])) - def setup(self, stage: Optional[str] = None) -> None: + def setup(self, stage: str) -> None: # load the data dataset = MNIST(DATASETS_PATH, transform=T.Compose([T.ToTensor(), T.Normalize(mean=(0.5,), std=(0.5,))])) self.train_dataset, self.test_dataset = random_split(dataset, [50000, 10000]) diff --git a/src/lightning_app/cli/pl-app-template/core/callbacks.py b/src/lightning_app/cli/pl-app-template/core/callbacks.py index de1bb4003f71f..f324d10f1faa4 100644 --- a/src/lightning_app/cli/pl-app-template/core/callbacks.py +++ b/src/lightning_app/cli/pl-app-template/core/callbacks.py @@ -1,6 +1,6 @@ import inspect import logging -from typing import Any, Dict, Optional, TYPE_CHECKING, Union +from typing import Any, Dict, TYPE_CHECKING, Union from core.state import ProgressBarState, TrainerState @@ -31,7 +31,7 @@ def setup( self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", - stage: Optional[str] = None, + stage: str, ) -> None: self.is_enabled = trainer.is_global_zero @@ -261,7 +261,7 @@ def setup( self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", - stage: Optional[str] = None, + stage: str, ) -> None: self.work.model_hparams = self._sanitize_model_init_args(dict(**pl_module.hparams)) @@ -284,7 +284,7 @@ def setup( self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", - stage: Optional[str] = None, + stage: str, ) -> None: log_dir = self._get_logdir(trainer) self.work.log_dir = Path(log_dir) if log_dir is not None else None diff --git a/src/pytorch_lightning/callbacks/callback.py b/src/pytorch_lightning/callbacks/callback.py index 892bd0fdfbf8b..cf57c5c2f7847 100644 --- a/src/pytorch_lightning/callbacks/callback.py +++ b/src/pytorch_lightning/callbacks/callback.py @@ -72,10 +72,10 @@ def on_before_accelerator_backend_setup(self, trainer: "pl.Trainer", pl_module: Called before accelerator is being setup. 
""" - def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: Optional[str] = None) -> None: + def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: str) -> None: """Called when fit, validate, test, predict, or tune begins.""" - def teardown(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: Optional[str] = None) -> None: + def teardown(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: str) -> None: """Called when fit, validate, test, predict, or tune ends.""" def on_init_start(self, trainer: "pl.Trainer") -> None: diff --git a/src/pytorch_lightning/callbacks/device_stats_monitor.py b/src/pytorch_lightning/callbacks/device_stats_monitor.py index ed6750496a735..c062fea8db144 100644 --- a/src/pytorch_lightning/callbacks/device_stats_monitor.py +++ b/src/pytorch_lightning/callbacks/device_stats_monitor.py @@ -58,7 +58,7 @@ def setup( self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", - stage: Optional[str] = None, + stage: str, ) -> None: if stage != "fit": return diff --git a/src/pytorch_lightning/callbacks/early_stopping.py b/src/pytorch_lightning/callbacks/early_stopping.py index 87585bb8120d0..79ba68e194586 100644 --- a/src/pytorch_lightning/callbacks/early_stopping.py +++ b/src/pytorch_lightning/callbacks/early_stopping.py @@ -129,7 +129,7 @@ def __init__( def state_key(self) -> str: return self._generate_state_key(monitor=self.monitor, mode=self.mode) - def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: Optional[str] = None) -> None: + def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: str) -> None: if self._check_on_train_epoch_end is None: # if the user runs validation multiple times per training epoch or multiple training epochs without # validation, then we run after validation instead of on train epoch end diff --git a/src/pytorch_lightning/callbacks/finetuning.py b/src/pytorch_lightning/callbacks/finetuning.py index d2afdd20bd9e9..11cd81f7a2609 100644 --- a/src/pytorch_lightning/callbacks/finetuning.py +++ b/src/pytorch_lightning/callbacks/finetuning.py @@ -244,7 +244,7 @@ def unfreeze_and_add_param_group( if params: optimizer.add_param_group({"params": params, "lr": params_lr / denom_lr}) - def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: Optional[str] = None) -> None: + def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: str) -> None: self.freeze_before_training(pl_module) @staticmethod diff --git a/src/pytorch_lightning/callbacks/model_checkpoint.py b/src/pytorch_lightning/callbacks/model_checkpoint.py index 1ad86a0917dac..3362d07902184 100644 --- a/src/pytorch_lightning/callbacks/model_checkpoint.py +++ b/src/pytorch_lightning/callbacks/model_checkpoint.py @@ -254,7 +254,7 @@ def state_key(self) -> str: save_on_train_epoch_end=self._save_on_train_epoch_end, ) - def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: Optional[str] = None) -> None: + def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: str) -> None: self.__resolve_ckpt_dir(trainer) assert self.dirpath is not None if trainer.is_global_zero and stage == "fit": diff --git a/src/pytorch_lightning/callbacks/progress/base.py b/src/pytorch_lightning/callbacks/progress/base.py index 003cc7bc6fa39..4fd4597c996f5 100644 --- a/src/pytorch_lightning/callbacks/progress/base.py +++ b/src/pytorch_lightning/callbacks/progress/base.py @@ -217,7 +217,7 @@ def print(self, 
*args: Any, **kwargs: Any) -> None: """You should provide a way to print without breaking the progress bar.""" print(*args, **kwargs) - def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: Optional[str] = None) -> None: + def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: str) -> None: self._trainer = trainer if not trainer.is_global_zero: self.disable() diff --git a/src/pytorch_lightning/callbacks/progress/rich_progress.py b/src/pytorch_lightning/callbacks/progress/rich_progress.py index 8ca2cb6671cab..e0d8fca2e753e 100644 --- a/src/pytorch_lightning/callbacks/progress/rich_progress.py +++ b/src/pytorch_lightning/callbacks/progress/rich_progress.py @@ -476,7 +476,7 @@ def _update_metrics(self, trainer, pl_module) -> None: if self._metric_component: self._metric_component.update(metrics) - def teardown(self, trainer, pl_module, stage: Optional[str] = None) -> None: + def teardown(self, trainer, pl_module, stage: str) -> None: self._stop_progress() def on_exception(self, trainer, pl_module, exception: BaseException) -> None: diff --git a/src/pytorch_lightning/callbacks/pruning.py b/src/pytorch_lightning/callbacks/pruning.py index 63516028b129f..878fe674b85a1 100644 --- a/src/pytorch_lightning/callbacks/pruning.py +++ b/src/pytorch_lightning/callbacks/pruning.py @@ -361,7 +361,7 @@ def _log_sparsity_stats( f" {curr_mask_zeros} ({curr_mask_zeros / curr_mask_size:.2%})" ) - def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: Optional[str] = None) -> None: + def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: str) -> None: parameters_to_prune = self.sanitize_parameters_to_prune( pl_module, self._parameters_to_prune, parameter_names=self._parameter_names ) diff --git a/src/pytorch_lightning/callbacks/stochastic_weight_avg.py b/src/pytorch_lightning/callbacks/stochastic_weight_avg.py index 6650bb3f0c479..90e2c62a7962d 100644 --- a/src/pytorch_lightning/callbacks/stochastic_weight_avg.py +++ b/src/pytorch_lightning/callbacks/stochastic_weight_avg.py @@ -143,7 +143,7 @@ def swa_end(self) -> int: def pl_module_contains_batch_norm(pl_module: "pl.LightningModule") -> bool: return any(isinstance(module, nn.modules.batchnorm._BatchNorm) for module in pl_module.modules()) - def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: Optional[str] = None) -> None: + def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: str) -> None: # copy the model before moving it to accelerator device. 
with pl_module._prevent_trainer_and_dataloaders_deepcopy(): self._average_model = deepcopy(pl_module) diff --git a/src/pytorch_lightning/cli.py b/src/pytorch_lightning/cli.py index d3990d79c5c88..700307b6ef1bc 100644 --- a/src/pytorch_lightning/cli.py +++ b/src/pytorch_lightning/cli.py @@ -205,7 +205,7 @@ def __init__( self.overwrite = overwrite self.multifile = multifile - def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[str] = None) -> None: + def setup(self, trainer: Trainer, pl_module: LightningModule, stage: str) -> None: log_dir = trainer.log_dir # this broadcasts the directory assert log_dir is not None config_path = os.path.join(log_dir, self.config_filename) diff --git a/src/pytorch_lightning/core/hooks.py b/src/pytorch_lightning/core/hooks.py index 4da53903eccca..86b3d3f92e9c8 100644 --- a/src/pytorch_lightning/core/hooks.py +++ b/src/pytorch_lightning/core/hooks.py @@ -380,7 +380,7 @@ def __init__(self): model.predict_dataloader() """ - def setup(self, stage: Optional[str] = None) -> None: + def setup(self, stage: str) -> None: """Called at the beginning of fit (train + validate), validate, test, or predict. This is a good hook when you need to build models dynamically or adjust something about them. This hook is called on every process when using DDP. @@ -406,7 +406,7 @@ def setup(self, stage): self.l1 = nn.Linear(28, data.num_classes) """ - def teardown(self, stage: Optional[str] = None) -> None: + def teardown(self, stage: str) -> None: """Called at the end of fit (train + validate), validate, test, or predict. Args: diff --git a/src/pytorch_lightning/demos/boring_classes.py b/src/pytorch_lightning/demos/boring_classes.py index f7be5390466d8..7d79f8916353a 100644 --- a/src/pytorch_lightning/demos/boring_classes.py +++ b/src/pytorch_lightning/demos/boring_classes.py @@ -163,17 +163,17 @@ def __init__(self, data_dir: str = "./"): self.checkpoint_state: Optional[str] = None self.random_full = RandomDataset(32, 64 * 4) - def setup(self, stage: Optional[str] = None) -> None: - if stage == "fit" or stage is None: + def setup(self, stage: str) -> None: + if stage == "fit": self.random_train = Subset(self.random_full, indices=range(64)) - if stage in ("fit", "validate") or stage is None: + if stage in ("fit", "validate"): self.random_val = Subset(self.random_full, indices=range(64, 64 * 2)) - if stage == "test" or stage is None: + if stage == "test": self.random_test = Subset(self.random_full, indices=range(64 * 2, 64 * 3)) - if stage == "predict" or stage is None: + if stage == "predict": self.random_predict = Subset(self.random_full, indices=range(64 * 3, 64 * 4)) def train_dataloader(self) -> DataLoader: diff --git a/src/pytorch_lightning/demos/mnist_datamodule.py b/src/pytorch_lightning/demos/mnist_datamodule.py index 6466b78250b3b..e1818e83f44db 100644 --- a/src/pytorch_lightning/demos/mnist_datamodule.py +++ b/src/pytorch_lightning/demos/mnist_datamodule.py @@ -195,7 +195,7 @@ def prepare_data(self) -> None: MNIST(self.data_dir, train=True, download=True) MNIST(self.data_dir, train=False, download=True) - def setup(self, stage: Optional[str] = None) -> None: + def setup(self, stage: str) -> None: """Split the train and valid dataset.""" extra = dict(transform=self.default_transforms) if self.default_transforms else {} dataset: Dataset = MNIST(self.data_dir, train=True, download=False, **extra) diff --git a/src/pytorch_lightning/profilers/advanced.py b/src/pytorch_lightning/profilers/advanced.py index 90fddc8074168..73be0de3f8817 100644 --- 
a/src/pytorch_lightning/profilers/advanced.py +++ b/src/pytorch_lightning/profilers/advanced.py @@ -78,7 +78,7 @@ def summary(self) -> str: recorded_stats[action_name] = s.getvalue() return self._stats_to_str(recorded_stats) - def teardown(self, stage: Optional[str] = None) -> None: + def teardown(self, stage: Optional[str]) -> None: super().teardown(stage=stage) self.profiled_actions = {} diff --git a/src/pytorch_lightning/profilers/profiler.py b/src/pytorch_lightning/profilers/profiler.py index 1b36159837523..755007ba743f1 100644 --- a/src/pytorch_lightning/profilers/profiler.py +++ b/src/pytorch_lightning/profilers/profiler.py @@ -148,15 +148,13 @@ def _stats_to_str(self, stats: Dict[str, str]) -> str: output.append(value) return os.linesep.join(output) - def setup( - self, stage: Optional[str] = None, local_rank: Optional[int] = None, log_dir: Optional[str] = None - ) -> None: + def setup(self, stage: str, local_rank: Optional[int] = None, log_dir: Optional[str] = None) -> None: """Execute arbitrary pre-profiling set-up steps.""" self._stage = stage self._local_rank = local_rank self.dirpath = self.dirpath or log_dir - def teardown(self, stage: Optional[str] = None) -> None: + def teardown(self, stage: Optional[str]) -> None: """Execute arbitrary post-profiling tear-down steps. Closes the currently open file and stream. diff --git a/src/pytorch_lightning/profilers/pytorch.py b/src/pytorch_lightning/profilers/pytorch.py index 079aafe37ec8b..9b843dccbf2a8 100644 --- a/src/pytorch_lightning/profilers/pytorch.py +++ b/src/pytorch_lightning/profilers/pytorch.py @@ -505,7 +505,7 @@ def _delete_profilers(self) -> None: self._register.__exit__(None, None, None) self._register = None - def teardown(self, stage: Optional[str] = None) -> None: + def teardown(self, stage: str) -> None: self._delete_profilers() for k in list(self._recording_map): diff --git a/src/pytorch_lightning/trainer/configuration_validator.py b/src/pytorch_lightning/trainer/configuration_validator.py index 8bf68e1bedd62..6ec2b15a11c6d 100644 --- a/src/pytorch_lightning/trainer/configuration_validator.py +++ b/src/pytorch_lightning/trainer/configuration_validator.py @@ -56,7 +56,6 @@ def verify_loop_configurations(trainer: "pl.Trainer") -> None: _check_on_pretrain_routine(model) # TODO: Delete CheckpointHooks off LightningDataModule in v1.8 _check_datamodule_checkpoint_hooks(trainer) - _check_setup_method(trainer) def __verify_train_val_loop_configuration(trainer: "pl.Trainer", model: "pl.LightningModule") -> None: @@ -309,9 +308,3 @@ def _check_datamodule_checkpoint_hooks(trainer: "pl.Trainer") -> None: "`LightningDataModule.on_load_checkpoint` was deprecated in" " v1.6 and will be removed in v1.8. Use `load_state_dict` instead." ) - - -def _check_setup_method(trainer: "pl.Trainer") -> None: - for obj in [trainer.lightning_module, trainer.datamodule] + trainer.callbacks: - if is_overridden("setup", obj) and not is_param_in_hook_signature(obj.setup, "stage"): - raise MisconfigurationException(f"`{obj.__class__.__name__}.setup` does not have a `stage` argument.") diff --git a/tests/tests_pytorch/accelerators/test_ipu.py b/tests/tests_pytorch/accelerators/test_ipu.py index 470cb4a028bed..d5958eae0ec4f 100644 --- a/tests/tests_pytorch/accelerators/test_ipu.py +++ b/tests/tests_pytorch/accelerators/test_ipu.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import os -from typing import Optional from unittest import mock import pytest @@ -187,7 +186,7 @@ def test_optimization(tmpdir): @RunIf(ipu=True) def test_half_precision(tmpdir): class TestCallback(Callback): - def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[str] = None) -> None: + def setup(self, trainer: Trainer, pl_module: LightningModule, stage: str) -> None: assert trainer.strategy.model.precision == 16 raise SystemExit diff --git a/tests/tests_pytorch/callbacks/progress/test_base_progress.py b/tests/tests_pytorch/callbacks/progress/test_base_progress.py index 75f276a6b913d..588ec782d72c5 100644 --- a/tests/tests_pytorch/callbacks/progress/test_base_progress.py +++ b/tests/tests_pytorch/callbacks/progress/test_base_progress.py @@ -22,7 +22,7 @@ def test_main_progress_bar_with_val_check_interval_int(): limit_train_batches=train_batches, limit_val_batches=10, val_check_interval=3, check_val_every_n_epoch=None ) model = BoringModel() - trainer.progress_bar_callback.setup(trainer, model) + trainer.progress_bar_callback.setup(trainer, model, stage="fit") trainer.strategy.connect(model) trainer._data_connector.attach_data(model) trainer.reset_train_dataloader() diff --git a/tests/tests_pytorch/checkpointing/test_model_checkpoint.py b/tests/tests_pytorch/checkpointing/test_model_checkpoint.py index 60ec4ec0f23de..ebe0769d8df07 100644 --- a/tests/tests_pytorch/checkpointing/test_model_checkpoint.py +++ b/tests/tests_pytorch/checkpointing/test_model_checkpoint.py @@ -1245,7 +1245,7 @@ def on_load_checkpoint(self, *args, **kwargs): # Case - 2 # Make sure that everything runs when dirpath is not initialized explicitly cb_restore = CustomModelCheckpoint() - cb_restore.setup(Trainer(), BoringModel()) + cb_restore.setup(Trainer(), BoringModel(), stage="fit") with pytest.warns(UserWarning, match="The dirpath has changed from*"): cb_restore.load_state_dict(written_ckpt) make_assertions(cb_restore, written_ckpt) diff --git a/tests/tests_pytorch/core/test_datamodules.py b/tests/tests_pytorch/core/test_datamodules.py index 23419c102eb2c..19fb5181b2476 100644 --- a/tests/tests_pytorch/core/test_datamodules.py +++ b/tests/tests_pytorch/core/test_datamodules.py @@ -117,7 +117,7 @@ def prepare_data(self, *args, **kwargs): def test_helper_boringdatamodule(): dm = BoringDataModule() dm.prepare_data() - dm.setup() + dm.setup("fit") def test_helper_boringdatamodule_with_verbose_setup(): @@ -140,7 +140,7 @@ def test_dm_init_from_argparse_args(tmpdir): args = parser.parse_args(["--data_dir", str(tmpdir)]) dm = BoringDataModule.from_argparse_args(args) dm.prepare_data() - dm.setup() + dm.setup("fit") assert dm.data_dir == args.data_dir == str(tmpdir) diff --git a/tests/tests_pytorch/helpers/datamodules.py b/tests/tests_pytorch/helpers/datamodules.py index 6ad3151f3aadb..4278422593c00 100644 --- a/tests/tests_pytorch/helpers/datamodules.py +++ b/tests/tests_pytorch/helpers/datamodules.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Optional import pytest import torch @@ -42,10 +41,10 @@ def prepare_data(self): self.dataset_cls(self.data_dir, train=True, download=True) self.dataset_cls(self.data_dir, train=False, download=True) - def setup(self, stage: Optional[str] = None): - if stage == "fit" or stage is None: + def setup(self, stage: str): + if stage == "fit": self.mnist_train = self.dataset_cls(self.data_dir, train=True) - if stage == "test" or stage is None: + if stage == "test": self.mnist_test = self.dataset_cls(self.data_dir, train=False) def train_dataloader(self): diff --git a/tests/tests_pytorch/models/test_hparams.py b/tests/tests_pytorch/models/test_hparams.py index 84311d6f780fb..628eb28403486 100644 --- a/tests/tests_pytorch/models/test_hparams.py +++ b/tests/tests_pytorch/models/test_hparams.py @@ -860,7 +860,7 @@ def test_no_datamodule_for_hparams(tmpdir): model = SaveHparamsModel({"arg1": 5, "arg2": "abc"}) org_model_hparams = copy.deepcopy(model.hparams_initial) data = DataModuleWithoutHparams() - data.setup() + data.setup("fit") mock_logger = _get_mock_logger(tmpdir) trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, logger=mock_logger) diff --git a/tests/tests_pytorch/plugins/precision/hpu/test_hpu.py b/tests/tests_pytorch/plugins/precision/hpu/test_hpu.py index 5ca366f5162e8..e8fc226435ebd 100644 --- a/tests/tests_pytorch/plugins/precision/hpu/test_hpu.py +++ b/tests/tests_pytorch/plugins/precision/hpu/test_hpu.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional import pytest import torch @@ -42,7 +41,7 @@ def test_precision_plugin(hmp_params): @RunIf(hpu=True) def test_mixed_precision(tmpdir, hmp_params: dict): class TestCallback(Callback): - def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[str] = None) -> None: + def setup(self, trainer: Trainer, pl_module: LightningModule, stage: str) -> None: assert trainer.strategy.model.precision == "bf16" raise SystemExit diff --git a/tests/tests_pytorch/strategies/test_ddp.py b/tests/tests_pytorch/strategies/test_ddp.py index 9b196f3e2a97f..dbde198b6eb6e 100644 --- a/tests/tests_pytorch/strategies/test_ddp.py +++ b/tests/tests_pytorch/strategies/test_ddp.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import os -from typing import Optional from unittest import mock from unittest.mock import patch @@ -94,7 +93,7 @@ def test_ddp_torch_dist_is_available_in_setup( """Test to ensure torch distributed is available within the setup hook using ddp.""" class TestModel(BoringModel): - def setup(self, stage: Optional[str] = None) -> None: + def setup(self, stage: str) -> None: assert torch.distributed.is_initialized() raise SystemExit() diff --git a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py index e3c6f95f3ff47..857abaa8dfbb4 100644 --- a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py +++ b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py @@ -15,7 +15,7 @@ import json import logging import os -from typing import Any, Dict, Optional +from typing import Any, Dict from unittest import mock import pytest @@ -263,7 +263,7 @@ def train_dataloader(self): return DataLoader(dataset_cls(32, 64)) class AssertCallback(Callback): - def setup(self, trainer, pl_module, stage: Optional[str] = None) -> None: + def setup(self, trainer, pl_module, stage: str) -> None: assert isinstance(trainer.strategy, DeepSpeedStrategy) config = trainer.strategy.config @@ -1059,7 +1059,7 @@ def __init__(self): super().__init__() self._setup = False - def setup(self, stage: Optional[str] = None) -> None: + def setup(self, stage: str) -> None: self._setup = True def train_dataloader(self): diff --git a/tests/tests_pytorch/trainer/test_config_validator.py b/tests/tests_pytorch/trainer/test_config_validator.py index 7fba63ba7ae24..f6508c181ebbb 100644 --- a/tests/tests_pytorch/trainer/test_config_validator.py +++ b/tests/tests_pytorch/trainer/test_config_validator.py @@ -16,8 +16,7 @@ import pytorch_lightning as pl from pytorch_lightning import LightningDataModule, LightningModule, Trainer -from pytorch_lightning.callbacks.callback import Callback -from pytorch_lightning.demos.boring_classes import BoringDataModule, BoringModel, RandomDataset +from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.warnings import PossibleUserWarning @@ -162,40 +161,6 @@ def test_trainer_manual_optimization_config(tmpdir): trainer.fit(model) -def test_invalid_setup_method(): - """Test error message when `setup` method of `LightningModule` or `LightningDataModule` is not defined - correctly.""" - - class CustomModel(BoringModel): - def setup(self): - pass - - class CustomDataModule(BoringDataModule): - def setup(self): - pass - - class CustomBoringCallback(Callback): - def setup(self, pl_module, trainer): - pass - - fit_kwargs = [ - {"model": CustomModel(), "datamodule": BoringDataModule()}, - {"model": BoringModel(), "datamodule": CustomDataModule()}, - ] - - for kwargs in fit_kwargs: - trainer = Trainer(fast_dev_run=True) - - with pytest.raises(MisconfigurationException, match="does not have a `stage` argument"): - trainer.fit(**kwargs) - - trainer = Trainer(fast_dev_run=True, callbacks=[CustomBoringCallback()]) - model = BoringModel() - - with pytest.raises(MisconfigurationException, match="does not have a `stage` argument"): - trainer.fit(model) - - @pytest.mark.parametrize("trainer_kwargs", [{"accelerator": "ipu"}, {"accelerator": "gpu", "strategy": "dp"}]) @pytest.mark.parametrize("hook", ["transfer_batch_to_device", "on_after_batch_transfer"]) def 
test_raise_exception_with_batch_transfer_hooks(monkeypatch, hook, trainer_kwargs, tmpdir): From cce55b6cd3fb53dcded30c493cff2cc023a0cdbc Mon Sep 17 00:00:00 2001 From: Benjamin Krala <56834877+KralaBenjamin@users.noreply.github.com> Date: Thu, 1 Sep 2022 16:06:28 +0200 Subject: [PATCH 038/193] Precise description of reload_dataloaders_every_n_epochs (#14245) --- docs/source-pytorch/common/trainer.rst | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/source-pytorch/common/trainer.rst b/docs/source-pytorch/common/trainer.rst index 53148c9fab583..049bbf94181af 100644 --- a/docs/source-pytorch/common/trainer.rst +++ b/docs/source-pytorch/common/trainer.rst @@ -1232,12 +1232,15 @@ reload_dataloaders_every_n_epochs | -Set to a positive integer to reload dataloaders every n epochs. +Set to a positive integer to reload dataloaders every n epochs from the currently used data source. +The data source can be a ``LightningModule`` or a ``LightningDataModule``. + .. code-block:: python # if 0 (default) train_loader = model.train_dataloader() + # or if using data module: datamodule.train_dataloader() for epoch in epochs: for batch in train_loader: ... @@ -1246,9 +1249,12 @@ Set to a positive integer to reload dataloaders every n epochs. for epoch in epochs: if not epoch % reload_dataloaders_every_n_epochs: train_loader = model.train_dataloader() + # or if using data module: datamodule.train_dataloader() for batch in train_loader: ... +The pseudocode also applies to the ``val_dataloader``. + .. _replace-sampler-ddp: replace_sampler_ddp From e0c2c3e677d141594cdd799050942b10908c9a97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 1 Sep 2022 18:08:40 +0200 Subject: [PATCH 039/193] Clean up fairscale imports (#14476) --- .../plugins/precision/sharded_native_amp.py | 2 +- src/pytorch_lightning/strategies/ddp.py | 8 +--- .../strategies/fully_sharded.py | 4 +- src/pytorch_lightning/strategies/sharded.py | 13 +++-- .../strategies/sharded_spawn.py | 2 +- src/pytorch_lightning/utilities/__init__.py | 3 -- src/pytorch_lightning/utilities/imports.py | 4 -- .../benchmarks/test_sharded_parity.py | 2 +- .../callbacks/test_stochastic_weight_avg.py | 2 +- tests/tests_pytorch/helpers/runif.py | 13 ++--- .../precision/test_sharded_precision.py | 2 +- ..._ddp_fully_sharded_with_full_state_dict.py | 12 ++--- .../strategies/test_ddp_strategy.py | 5 +- .../strategies/test_sharded_strategy.py | 48 ++++++------------- tests/tests_pytorch/utilities/test_imports.py | 3 +- 15 files changed, 45 insertions(+), 78 deletions(-) diff --git a/src/pytorch_lightning/plugins/precision/sharded_native_amp.py b/src/pytorch_lightning/plugins/precision/sharded_native_amp.py index 15c23e18ed6bc..d76db26a76358 100644 --- a/src/pytorch_lightning/plugins/precision/sharded_native_amp.py +++ b/src/pytorch_lightning/plugins/precision/sharded_native_amp.py @@ -13,9 +13,9 @@ # limitations under the License.
from typing import Optional, Union +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _FAIRSCALE_AVAILABLE if _FAIRSCALE_AVAILABLE: from fairscale.optim import OSS diff --git a/src/pytorch_lightning/strategies/ddp.py b/src/pytorch_lightning/strategies/ddp.py index f4f5397a78bca..57ab3a151b011 100644 --- a/src/pytorch_lightning/strategies/ddp.py +++ b/src/pytorch_lightning/strategies/ddp.py @@ -34,6 +34,7 @@ from pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.overrides.base import _LightningPrecisionModuleWrapperBase from pytorch_lightning.overrides.distributed import prepare_for_backward +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin @@ -54,12 +55,7 @@ sync_ddp_if_available, ) from pytorch_lightning.utilities.exceptions import DeadlockDetectedException -from pytorch_lightning.utilities.imports import ( - _FAIRSCALE_AVAILABLE, - _IS_WINDOWS, - _TORCH_GREATER_EQUAL_1_10, - _TORCH_GREATER_EQUAL_1_11, -) +from pytorch_lightning.utilities.imports import _IS_WINDOWS, _TORCH_GREATER_EQUAL_1_10, _TORCH_GREATER_EQUAL_1_11 from pytorch_lightning.utilities.optimizer import optimizers_to_device from pytorch_lightning.utilities.rank_zero import rank_zero_info, rank_zero_only, rank_zero_warn from pytorch_lightning.utilities.seed import reset_seed diff --git a/src/pytorch_lightning/strategies/fully_sharded.py b/src/pytorch_lightning/strategies/fully_sharded.py index 239e4844b146e..6f7ca3b34b03d 100644 --- a/src/pytorch_lightning/strategies/fully_sharded.py +++ b/src/pytorch_lightning/strategies/fully_sharded.py @@ -18,18 +18,18 @@ import torch import pytorch_lightning as pl +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities import _FAIRSCALE_FULLY_SHARDED_AVAILABLE from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.optimizer import optimizers_to_device from pytorch_lightning.utilities.types import PredictStep, STEP_OUTPUT, TestStep, TrainingStep, ValidationStep -if _FAIRSCALE_FULLY_SHARDED_AVAILABLE: +if _FAIRSCALE_AVAILABLE: from fairscale.nn import default_auto_wrap_policy, enable_wrap from fairscale.nn.data_parallel import FullyShardedDataParallel diff --git a/src/pytorch_lightning/strategies/sharded.py b/src/pytorch_lightning/strategies/sharded.py index 6bf8e47022c45..22a1c22e96398 100644 --- a/src/pytorch_lightning/strategies/sharded.py +++ b/src/pytorch_lightning/strategies/sharded.py @@ -21,11 +21,11 @@ import pytorch_lightning as pl from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, 
_LightningPrecisionModuleWrapperBase +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _FAIRSCALE_AVAILABLE, _FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE from pytorch_lightning.utilities.optimizer import optimizers_to_device if _FAIRSCALE_AVAILABLE: @@ -114,12 +114,11 @@ def _reinit_optimizers_with_oss(self, optimizers: List[Optimizer]) -> List["OSS" if not isinstance(optimizer, OSS): optim_class = type(optimizer) zero_optimizer = OSS(params=optimizer.param_groups, optim=optim_class, **optimizer.defaults) - if _FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE: - is_fp16 = self.precision_plugin.precision in (PrecisionType.MIXED, PrecisionType.HALF) - # For multi-node training, compressing the model shards in fp16 before broadcasting - # improves performance. When using PyTorch AMP, it will not degrade - # the model performance. - zero_optimizer.broadcast_fp16 = is_fp16 and self.num_nodes > 1 + is_fp16 = self.precision_plugin.precision in (PrecisionType.MIXED, PrecisionType.HALF) + # For multi-node training, compressing the model shards in fp16 before broadcasting + # improves performance. When using PyTorch AMP, it will not degrade + # the model performance. + zero_optimizer.broadcast_fp16 = is_fp16 and self.num_nodes > 1 optimizers[x] = zero_optimizer del optimizer return optimizers diff --git a/src/pytorch_lightning/strategies/sharded_spawn.py b/src/pytorch_lightning/strategies/sharded_spawn.py index 01ccb75677544..b5cd9497a3640 100644 --- a/src/pytorch_lightning/strategies/sharded_spawn.py +++ b/src/pytorch_lightning/strategies/sharded_spawn.py @@ -20,10 +20,10 @@ import pytorch_lightning as pl from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.strategies.ddp_spawn import DDPSpawnStrategy from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _FAIRSCALE_AVAILABLE from pytorch_lightning.utilities.optimizer import optimizers_to_device if _FAIRSCALE_AVAILABLE: diff --git a/src/pytorch_lightning/utilities/__init__.py b/src/pytorch_lightning/utilities/__init__.py index b8d5801734a25..0b4b074a43768 100644 --- a/src/pytorch_lightning/utilities/__init__.py +++ b/src/pytorch_lightning/utilities/__init__.py @@ -27,9 +27,6 @@ from pytorch_lightning.utilities.grads import grad_norm # noqa: F401 from pytorch_lightning.utilities.imports import ( # noqa: F401 _APEX_AVAILABLE, - _FAIRSCALE_FULLY_SHARDED_AVAILABLE, - _FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE, - _GROUP_AVAILABLE, _HIVEMIND_AVAILABLE, _HOROVOD_AVAILABLE, _HPU_AVAILABLE, diff --git a/src/pytorch_lightning/utilities/imports.py b/src/pytorch_lightning/utilities/imports.py index dfc88104ac25c..b04aec50bc1ee 100644 --- a/src/pytorch_lightning/utilities/imports.py +++ b/src/pytorch_lightning/utilities/imports.py @@ -134,10 +134,6 @@ def __repr__(self) -> str: _APEX_AVAILABLE = _module_available("apex.amp") _DALI_AVAILABLE = _module_available("nvidia.dali") -_FAIRSCALE_AVAILABLE = not _IS_WINDOWS and _module_available("fairscale.nn") -_FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE 
= _FAIRSCALE_AVAILABLE and _compare_version("fairscale", operator.ge, "0.3.3") -_FAIRSCALE_FULLY_SHARDED_AVAILABLE = _FAIRSCALE_AVAILABLE and _compare_version("fairscale", operator.ge, "0.3.4") -_GROUP_AVAILABLE = not _IS_WINDOWS and _module_available("torch.distributed.group") _HABANA_FRAMEWORK_AVAILABLE = _package_available("habana_frameworks") _HIVEMIND_AVAILABLE = _package_available("hivemind") _HOROVOD_AVAILABLE = _module_available("horovod.torch") diff --git a/tests/tests_pytorch/benchmarks/test_sharded_parity.py b/tests/tests_pytorch/benchmarks/test_sharded_parity.py index b02428758f169..782df5ce924d5 100644 --- a/tests/tests_pytorch/benchmarks/test_sharded_parity.py +++ b/tests/tests_pytorch/benchmarks/test_sharded_parity.py @@ -187,7 +187,7 @@ def plugin_parity_test( ) -@RunIf(skip_windows=True, fairscale=True) +@RunIf(fairscale=True) @pytest.mark.parametrize( "kwargs", [ diff --git a/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py b/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py index 7f1692e30a3f2..a39a7a2145225 100644 --- a/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py +++ b/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py @@ -346,7 +346,7 @@ def test_swa_resume_training_from_checkpoint_ddp(tmpdir): @pytest.mark.parametrize( "strategy", [ - pytest.param("fsdp", marks=RunIf(fairscale_fully_sharded=True, min_cuda_gpus=1)), + pytest.param("fsdp", marks=RunIf(fairscale=True, min_cuda_gpus=1)), pytest.param("deepspeed", marks=RunIf(deepspeed=True, min_cuda_gpus=1)), ], ) diff --git a/tests/tests_pytorch/helpers/runif.py b/tests/tests_pytorch/helpers/runif.py index 4074eaf725e1f..afd61976550eb 100644 --- a/tests/tests_pytorch/helpers/runif.py +++ b/tests/tests_pytorch/helpers/runif.py @@ -22,12 +22,11 @@ from pytorch_lightning.accelerators.mps import _MPS_AVAILABLE from pytorch_lightning.callbacks.progress.rich_progress import _RICH_AVAILABLE +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.strategies.bagua import _BAGUA_AVAILABLE from pytorch_lightning.strategies.deepspeed import _DEEPSPEED_AVAILABLE from pytorch_lightning.utilities.imports import ( _APEX_AVAILABLE, - _FAIRSCALE_AVAILABLE, - _FAIRSCALE_FULLY_SHARDED_AVAILABLE, _HIVEMIND_AVAILABLE, _HOROVOD_AVAILABLE, _HPU_AVAILABLE, @@ -81,7 +80,6 @@ def __new__( skip_windows: bool = False, standalone: bool = False, fairscale: bool = False, - fairscale_fully_sharded: bool = False, deepspeed: bool = False, rich: bool = False, omegaconf: bool = False, @@ -112,7 +110,6 @@ def __new__( standalone: Mark the test as standalone, our CI will run it in a separate process. This requires that the ``PL_RUN_STANDALONE_TESTS=1`` environment variable is set. fairscale: Require that facebookresearch/fairscale is installed. - fairscale_fully_sharded: Require that `fairscale` fully sharded support is available. deepspeed: Require that microsoft/DeepSpeed is installed. rich: Require that willmcgugan/rich is installed. omegaconf: Require that omry/omegaconf is installed. @@ -214,13 +211,13 @@ def __new__( kwargs["standalone"] = True if fairscale: + if skip_windows: + raise ValueError( + "`skip_windows` is not necessary when `fairscale` is set as it does not support Windows." 
+ ) conditions.append(not _FAIRSCALE_AVAILABLE) reasons.append("Fairscale") - if fairscale_fully_sharded: - conditions.append(not _FAIRSCALE_FULLY_SHARDED_AVAILABLE) - reasons.append("Fairscale Fully Sharded") - if deepspeed: conditions.append(not _DEEPSPEED_AVAILABLE) reasons.append("Deepspeed") diff --git a/tests/tests_pytorch/plugins/precision/test_sharded_precision.py b/tests/tests_pytorch/plugins/precision/test_sharded_precision.py index ab7a4a432a2c6..0c08c8e9540eb 100644 --- a/tests/tests_pytorch/plugins/precision/test_sharded_precision.py +++ b/tests/tests_pytorch/plugins/precision/test_sharded_precision.py @@ -15,8 +15,8 @@ import pytest import torch +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.plugins import ShardedNativeMixedPrecisionPlugin -from pytorch_lightning.utilities.imports import _FAIRSCALE_AVAILABLE from tests_pytorch.helpers.runif import RunIf ShardedGradScaler = None diff --git a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py index 2790f014c7212..fe587877e84fb 100644 --- a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py +++ b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py @@ -8,13 +8,13 @@ from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.demos.boring_classes import BoringModel +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.plugins import FullyShardedNativeMixedPrecisionPlugin from pytorch_lightning.strategies import DDPFullyShardedStrategy -from pytorch_lightning.utilities import _FAIRSCALE_FULLY_SHARDED_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests_pytorch.helpers.runif import RunIf -if _FAIRSCALE_FULLY_SHARDED_AVAILABLE: +if _FAIRSCALE_AVAILABLE: from fairscale.nn import FullyShardedDataParallel, wrap @@ -31,7 +31,7 @@ def test_invalid_on_cpu(tmpdir): @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"}) @mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=1) @mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) -@RunIf(fairscale_fully_sharded=True) +@RunIf(fairscale=True) def test_fsdp_with_sharded_amp(device_count_mock, mock_cuda_available, tmpdir): """Test to ensure that plugin native amp plugin is correctly chosen when using sharded.""" trainer = Trainer( @@ -96,7 +96,7 @@ def _assert_layer_fsdp_instance(self) -> None: assert self.layer.module[2].mixed_precision -@RunIf(min_cuda_gpus=1, skip_windows=True, standalone=True, fairscale_fully_sharded=True) +@RunIf(min_cuda_gpus=1, standalone=True, fairscale=True) def test_fully_sharded_strategy_checkpoint(tmpdir): """Test to ensure that checkpoint is saved correctly when using a single GPU, and all stages can be run.""" @@ -114,7 +114,7 @@ def test_fully_sharded_strategy_checkpoint(tmpdir): _run_multiple_stages(trainer, model, os.path.join(tmpdir, "last.ckpt")) -@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True, fairscale_fully_sharded=True) +@RunIf(min_cuda_gpus=2, standalone=True, fairscale=True) def test_fully_sharded_strategy_checkpoint_multi_gpus(tmpdir): """Test to ensure that checkpoint is saved correctly when using multiple GPUs, and all stages can be run.""" @@ -162,7 +162,7 @@ def _run_multiple_stages(trainer, model, model_path: 
Optional[str] = None): trainer.test(ckpt_path=model_path) -@RunIf(min_cuda_gpus=1, skip_windows=True, standalone=True, fairscale_fully_sharded=True) +@RunIf(min_cuda_gpus=1, standalone=True, fairscale=True) def test_fsdp_gradient_clipping_raises(tmpdir): """Test to ensure that an exception is raised when clipping gradients by value with FSDP.""" model = BoringModel() diff --git a/tests/tests_pytorch/strategies/test_ddp_strategy.py b/tests/tests_pytorch/strategies/test_ddp_strategy.py index 318505a984216..d867339ef63ea 100644 --- a/tests/tests_pytorch/strategies/test_ddp_strategy.py +++ b/tests/tests_pytorch/strategies/test_ddp_strategy.py @@ -21,10 +21,11 @@ from pytorch_lightning import LightningModule, Trainer from pytorch_lightning.demos.boring_classes import BoringModel +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.plugins.environments import ClusterEnvironment, LightningEnvironment from pytorch_lightning.strategies import DDPStrategy from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities.imports import _FAIRSCALE_AVAILABLE, _TORCH_GREATER_EQUAL_1_10 +from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_10 from tests_pytorch.helpers.runif import RunIf if _FAIRSCALE_AVAILABLE: @@ -266,7 +267,7 @@ def configure_optimizers(self): return OSS(params=base_optimizer.param_groups, optim=type(base_optimizer), **base_optimizer.defaults) -@RunIf(min_cuda_gpus=2, skip_windows=True, fairscale=True) +@RunIf(min_cuda_gpus=2, fairscale=True) @pytest.mark.parametrize("strategy", (pytest.param("ddp", marks=RunIf(standalone=True)), "ddp_spawn")) def test_ddp_strategy_checkpoint_multi_gpu_fairscale_optimizer(tmpdir, strategy): """Test to ensure that checkpoint is saved correctly when using faircale optimizer.""" diff --git a/tests/tests_pytorch/strategies/test_sharded_strategy.py b/tests/tests_pytorch/strategies/test_sharded_strategy.py index acefecbf4d2a7..2c0a5579c9933 100644 --- a/tests/tests_pytorch/strategies/test_sharded_strategy.py +++ b/tests/tests_pytorch/strategies/test_sharded_strategy.py @@ -7,9 +7,9 @@ from pytorch_lightning import LightningModule, Trainer from pytorch_lightning.demos.boring_classes import BoringModel +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.strategies import DDPShardedStrategy, DDPSpawnShardedStrategy from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities.imports import _FAIRSCALE_AVAILABLE from tests_pytorch.helpers.runif import RunIf if _FAIRSCALE_AVAILABLE: @@ -18,7 +18,7 @@ @pytest.mark.parametrize("clip_val", [0, 10]) -@RunIf(min_cuda_gpus=1, skip_windows=True, fairscale=True) +@RunIf(min_cuda_gpus=1, fairscale=True) @mock.patch("fairscale.optim.oss.OSS.clip_grad_norm") def test_ddp_sharded_precision_16_clip_gradients(mock_oss_clip_grad_norm, clip_val, tmpdir): """Ensure that clip gradients is only called if the value is greater than 0.""" @@ -58,7 +58,7 @@ def test_ddp_choice_sharded_amp(strategy, expected): assert isinstance(trainer.strategy, expected) -@RunIf(skip_windows=True, fairscale=True) +@RunIf(fairscale=True) def test_ddp_sharded_strategy_checkpoint_cpu(tmpdir): """Test to ensure that checkpoint is saved correctly.""" model = BoringModel() @@ -75,7 +75,7 @@ def test_ddp_sharded_strategy_checkpoint_cpu(tmpdir): assert torch.equal(trained_param.to("cpu"), loaded_param) -@RunIf(min_cuda_gpus=2, skip_windows=True, fairscale=True) +@RunIf(min_cuda_gpus=2, fairscale=True) def 
test_ddp_sharded_strategy_checkpoint_multi_gpu(tmpdir): """Test to ensure that checkpoint is saved correctly when using multiple GPUs.""" model = BoringModel() @@ -92,7 +92,7 @@ def test_ddp_sharded_strategy_checkpoint_multi_gpu(tmpdir): assert torch.equal(trained_param.to("cpu"), loaded_param) -@RunIf(min_cuda_gpus=2, skip_windows=True, fairscale=True) +@RunIf(min_cuda_gpus=2, fairscale=True) def test_ddp_sharded_strategy_finetune(tmpdir): """Test to ensure that we can save and restart training (simulate fine-tuning)""" model = BoringModel() @@ -107,7 +107,7 @@ def test_ddp_sharded_strategy_finetune(tmpdir): trainer.fit(saved_model) -@RunIf(skip_windows=True, fairscale=True) +@RunIf(fairscale=True) def test_ddp_sharded_strategy_fit_ckpt_path(tmpdir): """Test to ensure that resuming from checkpoint works.""" model = BoringModel() @@ -125,27 +125,7 @@ def test_ddp_sharded_strategy_fit_ckpt_path(tmpdir): trainer.fit(model, ckpt_path=checkpoint_path) -@pytest.mark.skip(reason="Not a critical test, skip till drone CI performance improves.") # todo -@pytest.mark.skip(reason="Currently unsupported restarting training on different number of devices.") -@RunIf(min_cuda_gpus=2, skip_windows=True, fairscale=True) -def test_ddp_sharded_strategy_fit_ckpt_path_downsize_gpus(tmpdir): - """Test to ensure that resuming from checkpoint works when downsizing number of GPUS.""" - model = BoringModel() - trainer = Trainer(strategy="ddp_sharded_spawn", fast_dev_run=True, gpus=2) - - trainer.fit(model) - - checkpoint_path = os.path.join(tmpdir, "model.pt") - trainer.save_checkpoint(checkpoint_path) - - model = BoringModel() - - trainer = Trainer(strategy="ddp_sharded_spawn", fast_dev_run=True, gpus=1) - - trainer.fit(model, ckpt_path=checkpoint_path) - - -@RunIf(min_cuda_gpus=1, skip_windows=True, fairscale=True) +@RunIf(min_cuda_gpus=1, fairscale=True) def test_ddp_sharded_strategy_fit_ckpt_path_gpu_to_cpu(tmpdir): """Test to ensure that resuming from checkpoint works when going from GPUs- > CPU.""" model = BoringModel() @@ -163,7 +143,7 @@ def test_ddp_sharded_strategy_fit_ckpt_path_gpu_to_cpu(tmpdir): trainer.fit(model, ckpt_path=checkpoint_path) -@RunIf(skip_windows=True, standalone=True, fairscale=True) +@RunIf(standalone=True, fairscale=True) @pytest.mark.parametrize( "trainer_kwargs", ( @@ -201,7 +181,7 @@ def training_step(self, batch, batch_idx): return {"loss": loss} -@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True, fairscale=True) +@RunIf(min_cuda_gpus=2, standalone=True, fairscale=True) @pytest.mark.parametrize("strategy", ("ddp_sharded", "ddp_sharded_spawn")) def test_ddp_sharded_strategy_manual_optimization(tmpdir, strategy): model = ManualBoringModel() @@ -238,7 +218,7 @@ def on_predict_start(self) -> None: assert isinstance(self.trainer.model, LightningModule) -@RunIf(skip_windows=True, fairscale=True) +@RunIf(fairscale=True) def test_configure_ddp(tmpdir): """Tests with ddp sharded strategy.""" trainer = Trainer(default_root_dir=tmpdir, strategy="ddp_sharded", fast_dev_run=True) @@ -251,7 +231,7 @@ def test_configure_ddp(tmpdir): trainer.predict(model, dataloaders=model.predict_dataloader()) -@RunIf(skip_windows=True, fairscale=True) +@RunIf(fairscale=True) @mock.patch("pytorch_lightning.strategies.DDPShardedStrategy._wrap_optimizers", autospec=True) @pytest.mark.parametrize("cls", [DDPShardedStrategy, DDPSpawnShardedStrategy]) def test_custom_kwargs_sharded(_, cls): @@ -269,7 +249,7 @@ def test_custom_kwargs_sharded(_, cls): assert kwargs["reduce_fp16"] -@RunIf(skip_windows=True, 
fairscale=True) +@RunIf(fairscale=True) @mock.patch("pytorch_lightning.strategies.DDPShardedStrategy._wrap_optimizers", autospec=True) @pytest.mark.parametrize(["params", "expected_buffer_size"], [(dict(), 0), (dict(reduce_buffer_size=128), 128)]) @pytest.mark.parametrize("num_nodes", [1, 2]) @@ -293,7 +273,7 @@ def test_custom_kwargs_sharded_reduce_buffer_size(_, params, expected_buffer_siz assert kwargs["reduce_buffer_size"] == expected_buffer_size -@RunIf(skip_windows=True, fairscale=True) +@RunIf(fairscale=True) def test_block_backward_sync(): strategy = DDPShardedStrategy() model = mock.MagicMock(spec=ShardedDataParallel) @@ -323,7 +303,7 @@ def configure_optimizers(self): return OSS(params=base_optimizer.param_groups, optim=type(base_optimizer), **base_optimizer.defaults) -@RunIf(min_cuda_gpus=2, skip_windows=True, fairscale=True) +@RunIf(min_cuda_gpus=2, fairscale=True) @pytest.mark.parametrize("strategy", (pytest.param("ddp_sharded", marks=RunIf(standalone=True)), "ddp_sharded_spawn")) def test_ddp_sharded_strategy_checkpoint_multi_gpu_fairscale_optimizer(tmpdir, strategy): """Test to ensure that checkpoint is saved correctly when using fairscale optimizers.""" diff --git a/tests/tests_pytorch/utilities/test_imports.py b/tests/tests_pytorch/utilities/test_imports.py index c673716c457f2..25995bb029f3a 100644 --- a/tests/tests_pytorch/utilities/test_imports.py +++ b/tests/tests_pytorch/utilities/test_imports.py @@ -13,6 +13,7 @@ # limitations under the License. import operator +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.strategies.bagua import _BAGUA_AVAILABLE from pytorch_lightning.strategies.deepspeed import _DEEPSPEED_AVAILABLE from pytorch_lightning.utilities import ( @@ -22,7 +23,7 @@ _OMEGACONF_AVAILABLE, _POPTORCH_AVAILABLE, ) -from pytorch_lightning.utilities.imports import _compare_version, _FAIRSCALE_AVAILABLE, _RequirementAvailable, torch +from pytorch_lightning.utilities.imports import _compare_version, _RequirementAvailable, torch def test_module_exists(): From 291dc1b61553813bc8057995d91c963fc77bc1f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 2 Sep 2022 00:13:12 +0200 Subject: [PATCH 040/193] Standalone Lite CI setup (#14451) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jirka Co-authored-by: Carlos Mocholí Co-authored-by: Jirka Borovec --- .azure/gpu-tests-lite.yml | 101 ++++++++++++++++ .azure/gpu-tests.yml | 1 + .github/checkgroup.yml | 16 +++ .github/workflows/ci-lite-test-full.yml | 120 ++++++++++++++++++++ .github/workflows/ci-pkg-install.yml | 3 +- .pre-commit-config.yaml | 6 +- dockers/tpu-tests/tpu_test_cases.jsonnet | 1 + pyproject.toml | 1 + requirements/lite/base.txt | 7 ++ requirements/lite/devel.txt | 2 + requirements/lite/strategies.txt | 5 + setup.py | 2 +- src/lightning_lite/CHANGELOG.md | 28 +++++ src/lightning_lite/README.md | 1 + src/lightning_lite/__about__.py | 23 ++++ src/lightning_lite/__init__.py | 4 + src/lightning_lite/__setup__.py | 98 ++++++++++++++++ src/lightning_lite/__version__.py | 1 + src/lightning_lite/lite.py | 3 + tests/tests_lite/__init__.py | 0 tests/tests_lite/helpers/__init__.py | 0 tests/tests_lite/helpers/runif.py | 75 ++++++++++++ tests/tests_lite/run_standalone_tests.sh | 92 +++++++++++++++ tests/tests_lite/test_lite.py | 12 ++ tests/tests_pytorch/run_standalone_tests.sh | 5 +- 25 files changed, 601 insertions(+), 6 deletions(-) create mode 100644 .azure/gpu-tests-lite.yml create 
mode 100644 .github/workflows/ci-lite-test-full.yml create mode 100644 requirements/lite/base.txt create mode 100644 requirements/lite/devel.txt create mode 100644 requirements/lite/strategies.txt create mode 100644 src/lightning_lite/CHANGELOG.md create mode 100644 src/lightning_lite/README.md create mode 100644 src/lightning_lite/__about__.py create mode 100644 src/lightning_lite/__init__.py create mode 100644 src/lightning_lite/__setup__.py create mode 100644 src/lightning_lite/__version__.py create mode 100644 src/lightning_lite/lite.py create mode 100644 tests/tests_lite/__init__.py create mode 100644 tests/tests_lite/helpers/__init__.py create mode 100644 tests/tests_lite/helpers/runif.py create mode 100644 tests/tests_lite/run_standalone_tests.sh create mode 100644 tests/tests_lite/test_lite.py diff --git a/.azure/gpu-tests-lite.yml b/.azure/gpu-tests-lite.yml new file mode 100644 index 0000000000000..3260fdd702811 --- /dev/null +++ b/.azure/gpu-tests-lite.yml @@ -0,0 +1,101 @@ +# Python package +# Create and test a Python package on multiple Python versions. +# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more: +# https://docs.microsoft.com/azure/devops/pipelines/languages/python + +trigger: + tags: + include: + - '*' + branches: + include: + - "master" + - "release/*" + - "refs/tags/*" + paths: + include: + - ".azure/gpu-tests-lite.yml" + - "requirements/lite/**" + - "src/lightning_lite/**" + - "tests/tests_lite/**" + +pr: + - "master" + - "release/*" + +jobs: + - job: testing + # how long to run the job before automatically cancelling + timeoutInMinutes: "20" + # how much time to give 'run always even if cancelled tasks' before stopping them + cancelTimeoutInMinutes: "2" + pool: azure-jirka-spot + container: + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1" + # default shm size is 64m. 
Increase it to avoid: + # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8' + options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m" + workspace: + clean: all + + steps: + - bash: | + lspci | egrep 'VGA|3D' + whereis nvidia + nvidia-smi + which python && which pip + python --version + pip --version + pip list + displayName: 'Image info & NVIDIA' + + - bash: | + set -e + TORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])") + CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))") + python ./requirements/pytorch/adjust-versions.py requirements/lite/base.txt ${PYTORCH_VERSION} + pip install -e .[strategies] --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html + pip install --requirement requirements/pytorch/devel.txt --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html + pip list + env: + PACKAGE_NAME: pytorch + FREEZE_REQUIREMENTS: 1 + displayName: 'Install dependencies' + + - bash: | + set -e + python requirements/collect_env_details.py + python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'" + displayName: 'Env details' + + - bash: python -m coverage run --source lightning_lite -m pytest --ignore benchmarks -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50 + env: + PL_RUN_CUDA_TESTS: "1" + workingDirectory: tests/tests_lite + displayName: 'Testing: Lite standard' + timeoutInMinutes: "10" + + - bash: bash run_standalone_tests.sh + workingDirectory: tests/tests_lite + env: + PL_USE_MOCKED_MNIST: "1" + PL_RUN_CUDA_TESTS: "1" + PL_STANDALONE_TESTS_SOURCE: "lightning_lite" + displayName: 'Testing: Lite standalone tests' + timeoutInMinutes: "10" + + - bash: | + python -m coverage report + python -m coverage xml + python -m coverage html + python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) --flags=gpu,pytest --name="GPU-coverage" --env=linux,azure + ls -l + workingDirectory: tests/tests_lite + displayName: 'Statistics' + + - task: PublishTestResults@2 + displayName: 'Publish test results' + inputs: + testResultsFiles: '$(Build.StagingDirectory)/test-results.xml' + testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)' + condition: succeededOrFailed() diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index 3ed4601d1b8e5..e53d8f07567ff 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -119,6 +119,7 @@ jobs: env: PL_USE_MOCKED_MNIST: "1" PL_RUN_CUDA_TESTS: "1" + PL_STANDALONE_TESTS_SOURCE: "pytorch_lightning" displayName: 'Testing: PyTorch standalone tests' timeoutInMinutes: "35" condition: eq(variables['continue'], '1') diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 531df1ebeaea8..e8892926f6e55 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -41,6 +41,15 @@ subprojects: - "pl-cpu (windows-2022, 3.10, latest, stable)" - "pl-cpu (windows-2022, 3.7, latest, stable)" - "pl-cpu (windows-2022, 3.7, oldest, stable)" + - "lite-cpu (macOS-11, 3.10, latest, stable)" + - "lite-cpu (macOS-11, 3.7, latest, stable)" + - "lite-cpu (macOS-11, 3.7, oldest, stable)" + - "lite-cpu (ubuntu-20.04, 3.10, latest, stable)" + - "lite-cpu (ubuntu-20.04, 3.7, latest, stable)" + - "lite-cpu (ubuntu-20.04, 3.7, oldest, stable)" + - "lite-cpu (windows-2022, 3.10, latest, stable)" + - "lite-cpu (windows-2022, 3.7, latest, stable)" + - "lite-cpu (windows-2022, 3.7, 
oldest, stable)" - "make-doctest (pytorch)" - "make-html (pytorch)" - "mypy" @@ -60,6 +69,13 @@ subprojects: checks: - "pytorch-lightning (GPUs)" + - id: "lightning_lite: Azure GPU" + paths: + - ".azure/gpu-tests-lite.yml" + - "tests/tests_lite/run_standalone_*.sh" + checks: + - "lightning-lite (GPUs)" + - id: "pytorch_lightning: Azure HPU" paths: - ".azure/hpu-tests.yml" diff --git a/.github/workflows/ci-lite-test-full.yml b/.github/workflows/ci-lite-test-full.yml new file mode 100644 index 0000000000000..69c8a12069f26 --- /dev/null +++ b/.github/workflows/ci-lite-test-full.yml @@ -0,0 +1,120 @@ +name: Test Lite full + +# see: https://help.github.com/en/actions/reference/events-that-trigger-workflows +on: # Trigger the workflow on push or pull request, but only for the master branch + push: + branches: [master, "release/*"] + pull_request: + branches: [master, "release/*"] + types: [opened, reopened, ready_for_review, synchronize] + paths: + - "requirements/lite/**" + - "src/lightning_lite/**" + - "tests/tests_lite/**" + - "setup.cfg" # includes pytest config + - ".github/workflows/ci-lite-test-full.yml" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} + cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} + +jobs: + + lite-cpu: + runs-on: ${{ matrix.os }} + if: github.event.pull_request.draft == false + strategy: + fail-fast: false + matrix: + os: [ubuntu-20.04, windows-2022, macOS-11] + python-version: ["3.7", "3.10"] # minimum, maximum + requires: ["oldest", "latest"] + release: ["stable"] + exclude: + # There's no distribution of the oldest PyTorch 1.9 for Python 3.10. + # TODO: Remove the exclusion when dropping PyTorch 1.9 support. + - {python-version: "3.10", requires: "oldest"} + + timeout-minutes: 40 + + steps: + - uses: actions/checkout@v2 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Reset caching + run: python -c "import time; days = time.time() / 60 / 60 / 24; print(f'TIME_PERIOD=d{int(days / 2) * 2}')" >> $GITHUB_ENV + + - name: basic setup + run: | + pip --version + pip install -q fire + + - name: Setup Windows + if: runner.os == 'windows' + run: | + python .actions/assistant.py requirements_prune_pkgs horovod + + - name: Set min. dependencies + if: matrix.requires == 'oldest' + run: | + python .actions/assistant.py replace_oldest_ver + + # Note: This uses an internal pip API and may not always work + # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow + - name: Get pip cache dir + id: pip-cache + run: echo "::set-output name=dir::$(pip cache dir)" + + - name: pip cache + uses: actions/cache@v3 + with: + path: ${{ steps.pip-cache.outputs.dir }} + key: ${{ runner.os }}-pip-td${{ env.TIME_PERIOD }}-py${{ matrix.python-version }}-${{ matrix.release }}-${{ matrix.requires }}-${{ hashFiles('requirements/lite/*.txt') }} + restore-keys: | + ${{ runner.os }}-pip-td${{ env.TIME_PERIOD }}-py${{ matrix.python-version }}-${{ matrix.release }}-${{ matrix.requires }}- + + - name: Install dependencies + env: + PACKAGE_NAME: pytorch # TODO(lite) does this need to say lite? 
+ FREEZE_REQUIREMENTS: 1 + run: | + flag=$(python -c "print('--pre' if '${{matrix.release}}' == 'pre' else '')" 2>&1) + url=$(python -c "print('test/cpu/torch_test.html' if '${{matrix.release}}' == 'pre' else 'cpu/torch_stable.html')" 2>&1) + pip install -e .[test] --upgrade $flag --find-links "https://download.pytorch.org/whl/${url}" + pip list + shell: bash + + - name: Testing Lite + working-directory: tests/tests_lite + # NOTE: do not include coverage report here, see: https://github.com/nedbat/coveragepy/issues/1003 + run: coverage run --source lightning_lite -m pytest -v --durations=50 --junitxml=results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml + + - name: Upload pytest results + if: failure() + uses: actions/upload-artifact@v3 + with: + name: unittest-results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }} + path: tests/tests_lite/results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml + + - name: Statistics + if: success() + working-directory: tests/tests_lite + run: | + coverage report + coverage xml + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v3 + if: always() + # see: https://github.com/actions/toolkit/issues/399 + continue-on-error: true + with: + token: ${{ secrets.CODECOV_TOKEN }} + file: tests/tests_lite/coverage.xml + flags: cpu,pytest,python${{ matrix.python-version }} + name: CPU-coverage + fail_ci_if_error: false diff --git a/.github/workflows/ci-pkg-install.yml b/.github/workflows/ci-pkg-install.yml index a9fdd36693a67..6eedad253941a 100644 --- a/.github/workflows/ci-pkg-install.yml +++ b/.github/workflows/ci-pkg-install.yml @@ -34,7 +34,7 @@ jobs: max-parallel: 1 matrix: os: [ubuntu-20.04, macOS-11, windows-2022] - pkg: ["app", "pytorch"] + pkg: ["app", "lite", "pytorch"] python-version: [3.8] # , 3.9 steps: @@ -110,6 +110,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Dowload package + # todo: download also lite after it is fist published run: | pip install -q fire requests for pkg in 'app' 'pytorch' ; do diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4b8ec5239615c..1930f99cd1622 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -43,7 +43,8 @@ repos: docs/source-pytorch/_static/images/general/pl_overview_flat.jpg| docs/source-pytorch/_static/images/general/pl_overview.gif| src/lightning_app/cli/pl-app-template/ui/yarn.lock| - src/pytorch_lightning/CHANGELOG.md + src/pytorch_lightning/CHANGELOG.md| + src/lightning_lite/CHANGELOG.md )$ - id: detect-private-key @@ -98,7 +99,8 @@ repos: exclude: | (?x)^( src/pytorch_lightning/CHANGELOG.md| - src/lightning_app/CHANGELOG.md + src/lightning_app/CHANGELOG.md| + src/lightning_lite/CHANGELOG.md )$ - repo: https://github.com/PyCQA/flake8 diff --git a/dockers/tpu-tests/tpu_test_cases.jsonnet b/dockers/tpu-tests/tpu_test_cases.jsonnet index f2c106f220b60..1f6bf4c41b324 100644 --- a/dockers/tpu-tests/tpu_test_cases.jsonnet +++ b/dockers/tpu-tests/tpu_test_cases.jsonnet @@ -39,6 +39,7 @@ local tputests = base.BaseTest { cd tests/tests_pytorch coverage run --source=pytorch_lightning -m pytest -vv --durations=0 ./ echo "\n||| Running standalone tests |||\n" + export PL_STANDALONE_TESTS_SOURCE=pytorch_lightning export PL_STANDALONE_TESTS_BATCH_SIZE=1 bash run_standalone_tests.sh echo "\n||| END PYTEST LOGS |||\n" diff --git a/pyproject.toml b/pyproject.toml index 19702524bc62e..df89de3d092fc 100644 --- 
a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,7 @@ requires = [ known_first_party = [ "pl_examples", "pytorch_lightning", + "lightning_lite", "tests_pytorch", ] profile = "black" diff --git a/requirements/lite/base.txt b/requirements/lite/base.txt new file mode 100644 index 0000000000000..89061745d428d --- /dev/null +++ b/requirements/lite/base.txt @@ -0,0 +1,7 @@ +# NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package +# in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment + +torch>=1.9.*, <1.13.0 +fsspec[http]>=2021.05.0, !=2021.06.0, <2022.6.0 +packaging>=17.0, <=21.3 +typing-extensions>=4.0.0, <4.3.1 diff --git a/requirements/lite/devel.txt b/requirements/lite/devel.txt new file mode 100644 index 0000000000000..a7d1aa9843b11 --- /dev/null +++ b/requirements/lite/devel.txt @@ -0,0 +1,2 @@ +# install all mandatory dependencies +-r ./base.txt diff --git a/requirements/lite/strategies.txt b/requirements/lite/strategies.txt new file mode 100644 index 0000000000000..c06fac969bb65 --- /dev/null +++ b/requirements/lite/strategies.txt @@ -0,0 +1,5 @@ +# NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package +# in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment + +fairscale>=0.4.5, <=0.4.6 +deepspeed>=0.6.0, <=0.7.0 diff --git a/setup.py b/setup.py index 82a4a969ec80b..3048f8a1aed4e 100755 --- a/setup.py +++ b/setup.py @@ -54,7 +54,7 @@ from setuptools import setup _PACKAGE_NAME = os.environ.get("PACKAGE_NAME", "") -_PACKAGE_MAPPING = {"pytorch": "pytorch_lightning", "app": "lightning_app"} +_PACKAGE_MAPPING = {"pytorch": "pytorch_lightning", "app": "lightning_app", "lite": "lightning_lite"} _REAL_PKG_NAME = _PACKAGE_MAPPING.get(_PACKAGE_NAME, _PACKAGE_NAME) # https://packaging.python.org/guides/single-sourcing-package-version/ # http://blog.ionelmc.ro/2014/05/25/python-packaging/ diff --git a/src/lightning_lite/CHANGELOG.md b/src/lightning_lite/CHANGELOG.md new file mode 100644 index 0000000000000..280a7e2ac00f1 --- /dev/null +++ b/src/lightning_lite/CHANGELOG.md @@ -0,0 +1,28 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). + +## [0.0.x] - 2022-MM-DD + + +### Added + +- + +### Changed + +- + +### Deprecated + +- + +### Removed + +- + +### Fixed + +- diff --git a/src/lightning_lite/README.md b/src/lightning_lite/README.md new file mode 100644 index 0000000000000..464090415c471 --- /dev/null +++ b/src/lightning_lite/README.md @@ -0,0 +1 @@ +# TODO diff --git a/src/lightning_lite/__about__.py b/src/lightning_lite/__about__.py new file mode 100644 index 0000000000000..deb737cdf46ed --- /dev/null +++ b/src/lightning_lite/__about__.py @@ -0,0 +1,23 @@ +import time + +__author__ = "Lightning AI et al." +__author_email__ = "pytorch@lightning.ai" +__license__ = "Apache-2.0" +__copyright__ = f"Copyright (c) 2022-{time.strftime('%Y')}, {__author__}." 
+__homepage__ = "https://github.com/Lightning-AI/lightning" +__docs_url__ = "https://pytorch-lightning.readthedocs.io/en/stable/" +# TODO +__docs__ = "" +__long_docs__ = """ + +""" + +__all__ = [ + "__author__", + "__author_email__", + "__copyright__", + "__docs__", + "__docs_url__", + "__homepage__", + "__license__", +] diff --git a/src/lightning_lite/__init__.py b/src/lightning_lite/__init__.py new file mode 100644 index 0000000000000..5e0d0ad5cb20a --- /dev/null +++ b/src/lightning_lite/__init__.py @@ -0,0 +1,4 @@ +"""Root package info.""" + +from lightning_lite.__about__ import * # noqa: F401, F403 +from lightning_lite.__version__ import version as __version__ # noqa: F401 diff --git a/src/lightning_lite/__setup__.py b/src/lightning_lite/__setup__.py new file mode 100644 index 0000000000000..5c0c8ae660de3 --- /dev/null +++ b/src/lightning_lite/__setup__.py @@ -0,0 +1,98 @@ +import os +from importlib.util import module_from_spec, spec_from_file_location +from types import ModuleType +from typing import Any, Dict + +from setuptools import find_packages + +_PROJECT_ROOT = "." +_SOURCE_ROOT = os.path.join(_PROJECT_ROOT, "src") +_PACKAGE_ROOT = os.path.join(_SOURCE_ROOT, "lightning_lite") +_PATH_REQUIREMENTS = os.path.join("requirements", "lite") +_FREEZE_REQUIREMENTS = bool(int(os.environ.get("FREEZE_REQUIREMENTS", 0))) + + +def _load_py_module(name: str, location: str) -> ModuleType: + spec = spec_from_file_location(name, location) + assert spec, f"Failed to load module {name} from {location}" + py = module_from_spec(spec) + assert spec.loader, f"ModuleSpec.loader is None for {name} from {location}" + spec.loader.exec_module(py) + return py + + +def _adjust_manifest(**__: Any) -> None: + manifest_path = os.path.join(_PROJECT_ROOT, "MANIFEST.in") + assert os.path.isfile(manifest_path) + with open(manifest_path) as fp: + lines = fp.readlines() + lines += [ + "recursive-exclude src *.md" + os.linesep, + "recursive-exclude requirements *.txt" + os.linesep, + "recursive-include requirements/lite *.txt" + os.linesep, + "recursive-include src/lightning_lite *.md" + os.linesep, + ] + + # TODO: remove this once lightning-ui package is ready as a dependency + lines += ["recursive-include src/lightning_app/ui *" + os.linesep] + + with open(manifest_path, "w") as fp: + fp.writelines(lines) + + +def _setup_args(**__: Any) -> Dict[str, Any]: + _path_setup_tools = os.path.join(_PROJECT_ROOT, ".actions", "setup_tools.py") + _setup_tools = _load_py_module("setup_tools", _path_setup_tools) + _about = _load_py_module("about", os.path.join(_PACKAGE_ROOT, "__about__.py")) + _version = _load_py_module("version", os.path.join(_PACKAGE_ROOT, "__version__.py")) + _long_description = _setup_tools.load_readme_description( + _PACKAGE_ROOT, homepage=_about.__homepage__, version=_version.version + ) + + return dict( + name="lightning-lite", + version=_version.version, # todo: consider using date version + branch for installation from source + description=_about.__docs__, + author=_about.__author__, + author_email=_about.__author_email__, + url=_about.__homepage__, + download_url="https://github.com/Lightning-AI/lightning", + license=_about.__license__, + packages=find_packages(where="src", include=["lightning_lite", "lightning_lite.*"]), + package_dir={"": "src"}, + long_description=_long_description, + long_description_content_type="text/markdown", + include_package_data=True, + zip_safe=False, + keywords=["deep learning", "pytorch", "AI"], + python_requires=">=3.7", + setup_requires=["wheel"], + 
install_requires=_setup_tools.load_requirements(_PATH_REQUIREMENTS, unfreeze=not _FREEZE_REQUIREMENTS), + # extras_require=_prepare_extras(), # todo + project_urls={ + "Bug Tracker": "https://github.com/Lightning-AI/lightning/issues", + "Documentation": "https://pytorch-lightning.rtfd.io/en/latest/", + "Source Code": "https://github.com/Lightning-AI/lightning", + }, + classifiers=[ + "Environment :: Console", + "Natural Language :: English", + # How mature is this project? Common values are + # 3 - Alpha, 4 - Beta, 5 - Production/Stable + "Development Status :: 4 - Beta", + # Indicate who your project is intended for + "Intended Audience :: Developers", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Scientific/Engineering :: Information Analysis", + # Pick your license as you wish + # 'License :: OSI Approved :: BSD License', + "Operating System :: OS Independent", + # Specify the Python versions you support here. In particular, ensure + # that you indicate whether you support Python 2, Python 3 or both. + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + ], + ) diff --git a/src/lightning_lite/__version__.py b/src/lightning_lite/__version__.py new file mode 100644 index 0000000000000..61767daea3270 --- /dev/null +++ b/src/lightning_lite/__version__.py @@ -0,0 +1 @@ +version = "0.0.0dev" diff --git a/src/lightning_lite/lite.py b/src/lightning_lite/lite.py new file mode 100644 index 0000000000000..65fee1bf09834 --- /dev/null +++ b/src/lightning_lite/lite.py @@ -0,0 +1,3 @@ +class LightningLite: + # Placeholder for real implementation + pass diff --git a/tests/tests_lite/__init__.py b/tests/tests_lite/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/tests_lite/helpers/__init__.py b/tests/tests_lite/helpers/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/tests_lite/helpers/runif.py b/tests/tests_lite/helpers/runif.py new file mode 100644 index 0000000000000..280af6a96f9c4 --- /dev/null +++ b/tests/tests_lite/helpers/runif.py @@ -0,0 +1,75 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +import pytest +import torch + + +# TODO(lite): Add all RunIf conditions once the relevant utilities have moved to lite source dir +class RunIf: + """RunIf wrapper for simple marking specific cases, fully compatible with pytest.mark:: + + @RunIf(min_torch="0.0") + @pytest.mark.parametrize("arg1", [1, 2.0]) + def test_wrapper(arg1): + assert arg1 > 0.0 + """ + + def __new__( + self, + *args, + min_cuda_gpus: int = 0, + standalone: bool = False, + **kwargs, + ): + """ + Args: + *args: Any :class:`pytest.mark.skipif` arguments. + min_cuda_gpus: Require this number of gpus and that the ``PL_RUN_CUDA_TESTS=1`` environment variable is set. 
+ standalone: Mark the test as standalone, our CI will run it in a separate process. + This requires that the ``PL_RUN_STANDALONE_TESTS=1`` environment variable is set. + **kwargs: Any :class:`pytest.mark.skipif` keyword arguments. + """ + conditions = [] + reasons = [] + + if min_cuda_gpus: + conditions.append(torch.cuda.device_count() < min_cuda_gpus) + reasons.append(f"GPUs>={min_cuda_gpus}") + # used in conftest.py::pytest_collection_modifyitems + kwargs["min_cuda_gpus"] = True + + if standalone: + env_flag = os.getenv("PL_RUN_STANDALONE_TESTS", "0") + conditions.append(env_flag != "1") + reasons.append("Standalone execution") + # used in conftest.py::pytest_collection_modifyitems + kwargs["standalone"] = True + + reasons = [rs for cond, rs in zip(conditions, reasons) if cond] + return pytest.mark.skipif( + *args, condition=any(conditions), reason=f"Requires: [{' + '.join(reasons)}]", **kwargs + ) + + +@RunIf(min_torch="99") +def test_always_skip(): + exit(1) + + +@pytest.mark.parametrize("arg1", [0.5, 1.0, 2.0]) +@RunIf(min_torch="0.0") +def test_wrapper(arg1: float): + assert arg1 > 0.0 diff --git a/tests/tests_lite/run_standalone_tests.sh b/tests/tests_lite/run_standalone_tests.sh new file mode 100644 index 0000000000000..9f91f4bc3f80c --- /dev/null +++ b/tests/tests_lite/run_standalone_tests.sh @@ -0,0 +1,92 @@ +#!/bin/bash +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e +# THIS FILE ASSUMES IT IS RUN INSIDE THE tests/tests_ DIRECTORY + +# Batch size for testing: Determines how many standalone test invocations run in parallel +# It can be set through the env variable PL_STANDALONE_TESTS_BATCH_SIZE and defaults to 6 if not set +test_batch_size="${PL_STANDALONE_TESTS_BATCH_SIZE:-6}" +source="${PL_STANDALONE_TESTS_SOURCE}" + +# this environment variable allows special tests to run +export PL_RUN_STANDALONE_TESTS=1 +# python arguments +defaults="-m coverage run --source $source --append -m pytest --no-header" + +# find tests marked as `@RunIf(standalone=True)`. done manually instead of with pytest because it is faster +grep_output=$(grep --recursive --word-regexp . --regexp 'standalone=True' --include '*.py') + +# file paths, remove duplicates +files=$(echo "$grep_output" | cut -f1 -d: | sort | uniq) + +# get the list of parametrizations. we need to call them separately. the last two lines are removed. 
+# note: if there's a syntax error, this will fail with some garbled output +if [[ "$OSTYPE" == "darwin"* ]]; then + parametrizations=$(python -m pytest $files --collect-only --quiet "$@" | tail -r | sed -e '1,3d' | tail -r) +else + parametrizations=$(python -m pytest $files --collect-only --quiet "$@" | head -n -2) +fi +# remove the "tests/tests_lite" path suffixes +parametrizations=${parametrizations//"tests/tests_lite/"/} +parametrizations_arr=($parametrizations) + +# tests to skip - space separated +blocklist='utilities/test_warnings.py' +report='' + +rm -f standalone_test_output.txt # in case it exists, remove it +function show_batched_output { + if [ -f standalone_test_output.txt ]; then # if exists + cat standalone_test_output.txt + rm standalone_test_output.txt + fi +} +trap show_batched_output EXIT # show the output on exit + +for i in "${!parametrizations_arr[@]}"; do + parametrization=${parametrizations_arr[$i]} + + # check blocklist + if echo $blocklist | grep -F "${parametrization}"; then + report+="Skipped\t$parametrization\n" + # do not continue the loop because we might need to wait for batched jobs + else + echo "Running $parametrization" + # execute the test in the background + # redirect to a log file that buffers test output. since the tests will run in the background, we cannot let them + # output to std{out,err} because the outputs would be garbled together + python ${defaults} "$parametrization" &>> standalone_test_output.txt & + # save the PID in an array + pids[${i}]=$! + # add row to the final report + report+="Ran\t$parametrization\n" + fi + + if ((($i + 1) % $test_batch_size == 0)); then + # wait for running tests + for pid in ${pids[*]}; do wait $pid; done + unset pids # empty the array + show_batched_output + fi +done +# wait for leftover tests +for pid in ${pids[*]}; do wait $pid; done +show_batched_output + +# echo test report +printf '=%.s' {1..80} +printf "\n$report" +printf '=%.s' {1..80} +printf '\n' diff --git a/tests/tests_lite/test_lite.py b/tests/tests_lite/test_lite.py new file mode 100644 index 0000000000000..a7df3089cb5ac --- /dev/null +++ b/tests/tests_lite/test_lite.py @@ -0,0 +1,12 @@ +from tests_lite.helpers.runif import RunIf + +from lightning_lite.lite import LightningLite # noqa: F401 + + +def test_placeholder(tmpdir): + assert True + + +@RunIf(min_cuda_gpus=2, standalone=True) +def test_placeholder_standalone(tmpdir): + assert True diff --git a/tests/tests_pytorch/run_standalone_tests.sh b/tests/tests_pytorch/run_standalone_tests.sh index 7e9292f4458db..1443c6885c69d 100644 --- a/tests/tests_pytorch/run_standalone_tests.sh +++ b/tests/tests_pytorch/run_standalone_tests.sh @@ -13,16 +13,17 @@ # See the License for the specific language governing permissions and # limitations under the License. set -e -# THIS FILE ASSUMES IT IS RUN INSIDE THE tests/tests_pytorch DIRECTORY +# THIS FILE ASSUMES IT IS RUN INSIDE THE tests/tests_ DIRECTORY # Batch size for testing: Determines how many standalone test invocations run in parallel # It can be set through the env variable PL_STANDALONE_TESTS_BATCH_SIZE and defaults to 6 if not set test_batch_size="${PL_STANDALONE_TESTS_BATCH_SIZE:-6}" +source="${PL_STANDALONE_TESTS_SOURCE}" # this environment variable allows special tests to run export PL_RUN_STANDALONE_TESTS=1 # python arguments -defaults='-m coverage run --source pytorch_lightning --append -m pytest --no-header' +defaults="-m coverage run --source $source --append -m pytest --no-header" # find tests marked as `@RunIf(standalone=True)`. 
done manually instead of with pytest because it is faster grep_output=$(grep --recursive --word-regexp . --regexp 'standalone=True' --include '*.py') From b19b15546b9e11026a11be7743f6d42c702f63d7 Mon Sep 17 00:00:00 2001 From: Laverne Henderson Date: Fri, 2 Sep 2022 01:53:29 -0700 Subject: [PATCH 041/193] Updated basic debugging (#14488) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Rohit Gupta Co-authored-by: Adrian Wälchli --- docs/source-pytorch/debug/debugging_basic.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/source-pytorch/debug/debugging_basic.rst b/docs/source-pytorch/debug/debugging_basic.rst index 147285f9fe798..14d059af1067c 100644 --- a/docs/source-pytorch/debug/debugging_basic.rst +++ b/docs/source-pytorch/debug/debugging_basic.rst @@ -128,6 +128,16 @@ To add the child modules to the summary add a :class:`~pytorch_lightning.callbac trainer = Trainer(callbacks=[ModelSummary(max_depth=-1)]) +To print the model summary if ``.fit()`` is not called: + +.. code-block:: python + + from pytorch_lightning.utilities.model_summary import ModelSummary + + model = LitModel() + summary = ModelSummary(model, max_depth=-1) + print(summary) + To turn off the autosummary use: .. code:: python From fed1d8dc6a091bd712d2709df43807ee3adb359e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Sat, 3 Sep 2022 16:38:30 +0200 Subject: [PATCH 042/193] Pin protobuf (#14512) --- requirements/pytorch/extra.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/pytorch/extra.txt b/requirements/pytorch/extra.txt index f211f5654adec..b547e7a62f2ed 100644 --- a/requirements/pytorch/extra.txt +++ b/requirements/pytorch/extra.txt @@ -8,3 +8,4 @@ hydra-core>=1.0.5, <1.3.0 jsonargparse[signatures]>=4.12.0, <=4.12.0 gcsfs>=2021.5.0, <2022.8.0 rich>=10.14.0, !=10.15.0.a, <13.0.0 +protobuf<=3.20.1 # strict # an extra is updating protobuf, this pin prevents TensorBoard failure From ce0bde702a82db218c419cd582e688046b4364d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Sun, 4 Sep 2022 13:29:56 +0200 Subject: [PATCH 043/193] Remove deprecated `test_tube` dependency (#14513) --- requirements/pytorch/loggers.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements/pytorch/loggers.txt b/requirements/pytorch/loggers.txt index 905823451973b..573daaa541ced 100644 --- a/requirements/pytorch/loggers.txt +++ b/requirements/pytorch/loggers.txt @@ -6,5 +6,4 @@ neptune-client>=0.10.0, <0.16.4 comet-ml>=3.1.12, <3.31.8 mlflow>=1.0.0, <1.29.0 -test_tube>=0.7.5, <=0.7.5 wandb>=0.10.22, <0.13.2 From 693116f3b5f4ddf99f14c988135c6cb76dd82baa Mon Sep 17 00:00:00 2001 From: Dmitry Frolov Date: Sun, 4 Sep 2022 14:03:38 -0400 Subject: [PATCH 044/193] [CLI] Fix status message on cluster creation (#14477) * Fix message on BYOC cluster creation Co-authored-by: thomas chaton --- src/lightning_app/cli/cmd_clusters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning_app/cli/cmd_clusters.py b/src/lightning_app/cli/cmd_clusters.py index 371aaa0f76415..a41d44ceba383 100644 --- a/src/lightning_app/cli/cmd_clusters.py +++ b/src/lightning_app/cli/cmd_clusters.py @@ -91,7 +91,7 @@ def create( if wait: _wait_for_cluster_state(self.api_client, resp.id, V1ClusterState.RUNNING) - click.echo(f"${resp.id} cluster is ${resp.status.phase}") + click.echo(f"{resp.id} cluster is in {resp.status.phase} state") def get_clusters(self): resp = 
self.api_client.cluster_service_list_clusters(phase_not_in=[V1ClusterState.DELETED]) From 4235eff71242e3d6e240915c3b913d88ec7f8cde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Sun, 4 Sep 2022 20:57:28 +0200 Subject: [PATCH 045/193] Use a standalone test symlink for Lite (#14502) --- .azure/gpu-tests-lite.yml | 3 +- tests/tests_lite/conftest.py | 52 ++++++++++++ tests/tests_lite/run_standalone_tests.sh | 93 +-------------------- tests/tests_pytorch/conftest.py | 2 +- tests/tests_pytorch/run_standalone_tests.sh | 5 +- 5 files changed, 59 insertions(+), 96 deletions(-) create mode 100644 tests/tests_lite/conftest.py mode change 100644 => 120000 tests/tests_lite/run_standalone_tests.sh diff --git a/.azure/gpu-tests-lite.yml b/.azure/gpu-tests-lite.yml index 3260fdd702811..5ceccade964da 100644 --- a/.azure/gpu-tests-lite.yml +++ b/.azure/gpu-tests-lite.yml @@ -18,6 +18,8 @@ trigger: - "requirements/lite/**" - "src/lightning_lite/**" - "tests/tests_lite/**" + - "tests/tests_pytorch/run_standalone_tests.sh" + - "tests/tests_lite/run_standalone_tests.sh" # a symlink to the one above pr: - "master" @@ -78,7 +80,6 @@ jobs: - bash: bash run_standalone_tests.sh workingDirectory: tests/tests_lite env: - PL_USE_MOCKED_MNIST: "1" PL_RUN_CUDA_TESTS: "1" PL_STANDALONE_TESTS_SOURCE: "lightning_lite" displayName: 'Testing: Lite standalone tests' diff --git a/tests/tests_lite/conftest.py b/tests/tests_lite/conftest.py new file mode 100644 index 0000000000000..fab4ff7e17901 --- /dev/null +++ b/tests/tests_lite/conftest.py @@ -0,0 +1,52 @@ +import os +from typing import List + +import pytest + + +def pytest_collection_modifyitems(items: List[pytest.Function], config: pytest.Config) -> None: + """An adaptation of `tests/tests_pytorch/conftest.py::pytest_collection_modifyitems`""" + initial_size = len(items) + conditions = [] + filtered, skipped = 0, 0 + + options = dict( + standalone="PL_RUN_STANDALONE_TESTS", + min_cuda_gpus="PL_RUN_CUDA_TESTS", + ipu="PL_RUN_IPU_TESTS", + tpu="PL_RUN_TPU_TESTS", + ) + if os.getenv(options["standalone"], "0") == "1" and os.getenv(options["min_cuda_gpus"], "0") == "1": + # special case: we don't have a CPU job for standalone tests, so we shouldn't run only cuda tests. 
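Editorial note (not part of the patch): the special case commented around this point is easy to misread. A tiny, illustrative sketch of the intended effect, assuming only the two environment variables used here:

.. code-block:: python

    import os

    options = {"standalone": "PL_RUN_STANDALONE_TESTS", "min_cuda_gpus": "PL_RUN_CUDA_TESTS"}
    os.environ.update({"PL_RUN_STANDALONE_TESTS": "1", "PL_RUN_CUDA_TESTS": "1"})

    if os.getenv(options["standalone"], "0") == "1" and os.getenv(options["min_cuda_gpus"], "0") == "1":
        # without this, every CPU-only standalone test would be filtered out,
        # because it does not carry `@RunIf(min_cuda_gpus=...)`
        del options["min_cuda_gpus"]

    print(sorted(options))  # ['standalone'] -> only the `standalone` kwarg is required to keep a test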
+ # by deleting the key, we avoid filtering out the CPU tests + del options["min_cuda_gpus"] + + for kwarg, env_var in options.items(): + # this will compute the intersection of all tests selected per environment variable + if os.getenv(env_var, "0") == "1": + conditions.append(env_var) + for i, test in reversed(list(enumerate(items))): # loop in reverse, since we are going to pop items + already_skipped = any(marker.name == "skip" for marker in test.own_markers) + if already_skipped: + # the test was going to be skipped anyway, filter it out + items.pop(i) + skipped += 1 + continue + has_runif_with_kwarg = any( + marker.name == "skipif" and marker.kwargs.get(kwarg) for marker in test.own_markers + ) + if not has_runif_with_kwarg: + # the test has `@RunIf(kwarg=True)`, filter it out + items.pop(i) + filtered += 1 + + if config.option.verbose >= 0 and (filtered or skipped): + writer = config.get_terminal_writer() + writer.write( + f"\nThe number of tests has been filtered from {initial_size} to {initial_size - filtered} after the" + f" filters {conditions}.\n{skipped} tests are marked as unconditional skips.\nIn total, {len(items)} tests" + " will run.\n", + flush=True, + bold=True, + purple=True, # oh yeah, branded pytest messages + ) diff --git a/tests/tests_lite/run_standalone_tests.sh b/tests/tests_lite/run_standalone_tests.sh deleted file mode 100644 index 9f91f4bc3f80c..0000000000000 --- a/tests/tests_lite/run_standalone_tests.sh +++ /dev/null @@ -1,92 +0,0 @@ -#!/bin/bash -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -# THIS FILE ASSUMES IT IS RUN INSIDE THE tests/tests_ DIRECTORY - -# Batch size for testing: Determines how many standalone test invocations run in parallel -# It can be set through the env variable PL_STANDALONE_TESTS_BATCH_SIZE and defaults to 6 if not set -test_batch_size="${PL_STANDALONE_TESTS_BATCH_SIZE:-6}" -source="${PL_STANDALONE_TESTS_SOURCE}" - -# this environment variable allows special tests to run -export PL_RUN_STANDALONE_TESTS=1 -# python arguments -defaults="-m coverage run --source $source --append -m pytest --no-header" - -# find tests marked as `@RunIf(standalone=True)`. done manually instead of with pytest because it is faster -grep_output=$(grep --recursive --word-regexp . --regexp 'standalone=True' --include '*.py') - -# file paths, remove duplicates -files=$(echo "$grep_output" | cut -f1 -d: | sort | uniq) - -# get the list of parametrizations. we need to call them separately. the last two lines are removed. 
-# note: if there's a syntax error, this will fail with some garbled output -if [[ "$OSTYPE" == "darwin"* ]]; then - parametrizations=$(python -m pytest $files --collect-only --quiet "$@" | tail -r | sed -e '1,3d' | tail -r) -else - parametrizations=$(python -m pytest $files --collect-only --quiet "$@" | head -n -2) -fi -# remove the "tests/tests_lite" path suffixes -parametrizations=${parametrizations//"tests/tests_lite/"/} -parametrizations_arr=($parametrizations) - -# tests to skip - space separated -blocklist='utilities/test_warnings.py' -report='' - -rm -f standalone_test_output.txt # in case it exists, remove it -function show_batched_output { - if [ -f standalone_test_output.txt ]; then # if exists - cat standalone_test_output.txt - rm standalone_test_output.txt - fi -} -trap show_batched_output EXIT # show the output on exit - -for i in "${!parametrizations_arr[@]}"; do - parametrization=${parametrizations_arr[$i]} - - # check blocklist - if echo $blocklist | grep -F "${parametrization}"; then - report+="Skipped\t$parametrization\n" - # do not continue the loop because we might need to wait for batched jobs - else - echo "Running $parametrization" - # execute the test in the background - # redirect to a log file that buffers test output. since the tests will run in the background, we cannot let them - # output to std{out,err} because the outputs would be garbled together - python ${defaults} "$parametrization" &>> standalone_test_output.txt & - # save the PID in an array - pids[${i}]=$! - # add row to the final report - report+="Ran\t$parametrization\n" - fi - - if ((($i + 1) % $test_batch_size == 0)); then - # wait for running tests - for pid in ${pids[*]}; do wait $pid; done - unset pids # empty the array - show_batched_output - fi -done -# wait for leftover tests -for pid in ${pids[*]}; do wait $pid; done -show_batched_output - -# echo test report -printf '=%.s' {1..80} -printf "\n$report" -printf '=%.s' {1..80} -printf '\n' diff --git a/tests/tests_lite/run_standalone_tests.sh b/tests/tests_lite/run_standalone_tests.sh new file mode 120000 index 0000000000000..23049489b7160 --- /dev/null +++ b/tests/tests_lite/run_standalone_tests.sh @@ -0,0 +1 @@ +../tests_pytorch/run_standalone_tests.sh \ No newline at end of file diff --git a/tests/tests_pytorch/conftest.py b/tests/tests_pytorch/conftest.py index 745067cc2f9f1..95586af186da3 100644 --- a/tests/tests_pytorch/conftest.py +++ b/tests/tests_pytorch/conftest.py @@ -170,7 +170,7 @@ def single_process_pg(): os.environ.update(orig_environ) -def pytest_collection_modifyitems(items: List[pytest.Function], config: pytest.Config): +def pytest_collection_modifyitems(items: List[pytest.Function], config: pytest.Config) -> None: initial_size = len(items) conditions = [] filtered, skipped = 0, 0 diff --git a/tests/tests_pytorch/run_standalone_tests.sh b/tests/tests_pytorch/run_standalone_tests.sh index 1443c6885c69d..fa6bda6706bc8 100644 --- a/tests/tests_pytorch/run_standalone_tests.sh +++ b/tests/tests_pytorch/run_standalone_tests.sh @@ -38,8 +38,9 @@ if [[ "$OSTYPE" == "darwin"* ]]; then else parametrizations=$(python -m pytest $files --collect-only --quiet "$@" | head -n -2) fi -# remove the "tests/tests_pytorch" path suffixes -parametrizations=${parametrizations//"tests/tests_pytorch/"/} +# remove the "tests/tests_pytorch/" path suffixes +path_suffix=$(basename "$(dirname "$(pwd)")")/$(basename "$(pwd)")"/" # https://stackoverflow.com/a/8223345 +parametrizations=${parametrizations//$path_suffix/} 
parametrizations_arr=($parametrizations) # tests to skip - space separated From ed0164a3d235526abe7d50f233b8bab2a07501fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Roberto=20de=20Moura=20Estev=C3=A3o=20Filho?= Date: Mon, 5 Sep 2022 06:05:21 -0300 Subject: [PATCH 046/193] Estimate stepping batches with max_steps if max_epochs is not set (#14317) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Roberto Estevão Co-authored-by: Rohit Gupta Co-authored-by: Jirka Borovec Co-authored-by: Adrian Wälchli --- src/pytorch_lightning/CHANGELOG.md | 3 +++ src/pytorch_lightning/trainer/trainer.py | 4 ++-- .../trainer/properties/test_estimated_stepping_batches.py | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 9d4323548cb7e..ea0e8f76461da 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -162,6 +162,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed an issue to avoid the impact of sanity check on `reload_dataloaders_every_n_epochs` for validation ([#13964](https://github.com/Lightning-AI/lightning/pull/13964)) +- Fixed `Trainer.estimated_stepping_batches` when maximum number of epochs is not set ([#14317](https://github.com/Lightning-AI/lightning/pull/14317)) + + ## [1.7.2] - 2022-08-17 ### Added diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index 08fade4021a8b..963c44dde21b9 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -2729,8 +2729,8 @@ def configure_optimizers(self): ) # infinite training - if self.max_epochs == -1 and self.max_steps == -1: - return float("inf") + if self.max_epochs == -1: + return float("inf") if self.max_steps == -1 else self.max_steps if self.train_dataloader is None: rank_zero_info("Loading `train_dataloader` to estimate number of stepping batches.") diff --git a/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py b/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py index 177d2034a0273..72c07ec0790c2 100644 --- a/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py +++ b/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py @@ -94,9 +94,9 @@ def test_num_stepping_batches_infinite_training(): assert trainer.estimated_stepping_batches == float("inf") -def test_num_stepping_batches_with_max_steps(): +@pytest.mark.parametrize("max_steps", [2, 100]) +def test_num_stepping_batches_with_max_steps(max_steps): """Test stepping batches with `max_steps`.""" - max_steps = 2 trainer = Trainer(max_steps=max_steps) model = BoringModel() trainer.fit(model) From 23f0e20209fb01c075b71983c9b9d465465f2fbf Mon Sep 17 00:00:00 2001 From: Tianshu Wang Date: Mon, 5 Sep 2022 18:12:43 +0800 Subject: [PATCH 047/193] Fixed `WandbLogger` `save_dir` is not set after creation (#12748) (#14326) Co-authored-by: Jirka Borovec --- src/pytorch_lightning/CHANGELOG.md | 6 +++--- src/pytorch_lightning/loggers/wandb.py | 6 +++--- tests/tests_pytorch/loggers/test_wandb.py | 8 +++++++- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index ea0e8f76461da..ba8bf05f49bb3 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -136,6 +136,9 @@ The format is based on [Keep a 
Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed an issue to keep downscaling the batch size in case there hasn't been even a single successful optimal batch size with `mode="power"` ([#14372](https://github.com/Lightning-AI/lightning/pull/14372)) +- Fixed `WandbLogger` `save_dir` is not set after creation ([#14326](https://github.com/Lightning-AI/lightning/pull/14326)) + + ## [1.7.4] - 2022-08-31 @@ -162,9 +165,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed an issue to avoid the impact of sanity check on `reload_dataloaders_every_n_epochs` for validation ([#13964](https://github.com/Lightning-AI/lightning/pull/13964)) -- Fixed `Trainer.estimated_stepping_batches` when maximum number of epochs is not set ([#14317](https://github.com/Lightning-AI/lightning/pull/14317)) - - ## [1.7.2] - 2022-08-17 ### Added diff --git a/src/pytorch_lightning/loggers/wandb.py b/src/pytorch_lightning/loggers/wandb.py index baf4bc9092774..3198e46b1a586 100644 --- a/src/pytorch_lightning/loggers/wandb.py +++ b/src/pytorch_lightning/loggers/wandb.py @@ -223,7 +223,7 @@ def __init__(self, *args, **kwarg): Args: name: Display name for the run. - save_dir: Path where data is saved (wandb dir by default). + save_dir: Path where data is saved. offline: Run offline (data can be streamed later to wandb servers). id: Sets the version, mainly used to resume a previous run. version: Same as id. @@ -255,7 +255,7 @@ def __init__(self, *args, **kwarg): def __init__( self, name: Optional[str] = None, - save_dir: Optional[str] = None, + save_dir: str = ".", offline: bool = False, id: Optional[str] = None, anonymous: Optional[bool] = None, @@ -300,7 +300,7 @@ def __init__( name=name, project=project, id=version or id, - dir=save_dir, + dir=save_dir or kwargs.pop("dir"), resume="allow", anonymous=("allow" if anonymous else None), ) diff --git a/tests/tests_pytorch/loggers/test_wandb.py b/tests/tests_pytorch/loggers/test_wandb.py index 648e1a8f38ec8..b408046c9e5d2 100644 --- a/tests/tests_pytorch/loggers/test_wandb.py +++ b/tests/tests_pytorch/loggers/test_wandb.py @@ -58,9 +58,15 @@ def test_wandb_logger_init(wandb, monkeypatch): wandb.init.reset_mock() WandbLogger(project="test_project").experiment wandb.init.assert_called_once_with( - name=None, dir=None, id=None, project="test_project", resume="allow", anonymous=None + name=None, dir=".", id=None, project="test_project", resume="allow", anonymous=None ) + # test wandb.init set save_dir correctly after created + wandb.run = None + wandb.init.reset_mock() + logger = WandbLogger(project="test_project") + assert logger.save_dir is not None + # test wandb.init and setting logger experiment externally wandb.run = None run = wandb.init() From f3d4d83462fccf83d953f3a8f6bdf32ec8b7bcd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Mon, 5 Sep 2022 14:16:26 +0200 Subject: [PATCH 048/193] Add path filters for some non-required jobs (#14539) --- .azure/app-cloud-e2e.yml | 6 ++++ .azure/hpu-tests.yml | 7 ++++ .github/file-filters.yml | 9 ----- .github/workflows/ci-pytorch-test-slow.yml | 38 +++++----------------- 4 files changed, 22 insertions(+), 38 deletions(-) delete mode 100644 .github/file-filters.yml diff --git a/.azure/app-cloud-e2e.yml b/.azure/app-cloud-e2e.yml index 3fd92512b7d40..1511284cad94b 100644 --- a/.azure/app-cloud-e2e.yml +++ b/.azure/app-cloud-e2e.yml @@ -12,6 +12,12 @@ trigger: - "master" - "release/*" - "refs/tags/*" + paths: + include: + - ".azure/app-cloud-e2e.yml" + - 
"requirements/app/**" + - "src/lightning_app/**" + - "examples/app_*" pr: - "master" diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index b128dbc5433b1..785a9b0f0115c 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -9,6 +9,13 @@ trigger: - "master" - "release/*" - "refs/tags/*" + paths: + include: + - ".azure/hpu-tests.yml" + - "examples/pl_hpu/mnist_sample.py" + - "requirements/pytorch/**" + - "src/pytorch_lightning/**" + - "tests/tests_pytorch/**" pr: - "master" diff --git a/.github/file-filters.yml b/.github/file-filters.yml deleted file mode 100644 index e621cd83881e4..0000000000000 --- a/.github/file-filters.yml +++ /dev/null @@ -1,9 +0,0 @@ -# This file contains filters to be used in the CI to detect file changes and run the required CI jobs. - -app_examples: - - "src/lightning_app/**" - - "tests/tests_app_examples/**" - - "requirements/app/**" - - "examples/app_*" - - "setup.py" - - "src/pytorch_lightning/__version__.py" diff --git a/.github/workflows/ci-pytorch-test-slow.yml b/.github/workflows/ci-pytorch-test-slow.yml index 126eaaf17da1a..f808ce561a5b1 100644 --- a/.github/workflows/ci-pytorch-test-slow.yml +++ b/.github/workflows/ci-pytorch-test-slow.yml @@ -7,6 +7,12 @@ on: # Trigger the workflow on push or pull request, but only for the master bra pull_request: branches: [master, "release/*"] types: [opened, reopened, ready_for_review, synchronize] + paths: + - "requirements/pytorch/**" + - "src/pytorch_lightning/**" + - "tests/tests_pytorch/**" + - "setup.cfg" # includes pytest config + - ".github/workflows/ci-pytorch-test-slow.yml" concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} @@ -28,43 +34,19 @@ jobs: steps: - uses: actions/checkout@v2 - - name: Get changed files - id: changed-files - uses: tj-actions/changed-files@v29.0.1 - - - name: Decide if the test should be skipped - id: skip - shell: bash -l {0} - run: | - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' - echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt - MATCHES=$(cat changed_files.txt | grep -E $FILTER) - echo $MATCHES - if [ -z "$MATCHES" ]; then - echo "Skip" - echo "::set-output name=continue::0" - else - echo "Continue" - echo "::set-output name=continue::1" - fi - - uses: actions/setup-python@v4 - if: ${{ (steps.skip.outputs.continue == '1') }} with: python-version: ${{ matrix.python-version }} - name: Reset caching - if: ${{ (steps.skip.outputs.continue == '1') }} run: python -c "import time; days = time.time() / 60 / 60 / 24; print(f'TIME_PERIOD=d{int(days / 2) * 2}')" >> $GITHUB_ENV - name: Get pip cache - if: ${{ (steps.skip.outputs.continue == '1') }} id: pip-cache run: | python -c "from pip._internal.locations import USER_CACHE_DIR; print('::set-output name=dir::' + USER_CACHE_DIR)" - name: Cache pip - if: ${{ (steps.skip.outputs.continue == '1') }} uses: actions/cache@v3 with: path: ${{ steps.pip-cache.outputs.dir }} @@ -73,7 +55,6 @@ jobs: ${{ runner.os }}-pip-td${{ env.TIME_PERIOD }}-py${{ matrix.python-version }}- - name: Install dependencies - if: ${{ (steps.skip.outputs.continue == '1') }} env: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 @@ -85,21 +66,20 @@ jobs: shell: bash - name: Testing PyTorch - if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: tests/tests_pytorch run: coverage run --source pytorch_lightning -m pytest -v --junitxml=results-${{ runner.os }}-py${{ matrix.python-version }}.xml env: PL_RUN_SLOW_TESTS: 1 - name: Upload 
pytest test results - if: ${{ (failure()) && (steps.skip.outputs.continue == '1') }} + if: failure() uses: actions/upload-artifact@v3 with: name: unittest-results-${{ runner.os }}-py${{ matrix.python-version }} path: tests/tests_pytorch/results-${{ runner.os }}-py${{ matrix.python-version }}.xml - name: Statistics - if: ${{ (success()) && (steps.skip.outputs.continue == '1') }} + if: success() working-directory: tests/tests_pytorch run: | coverage report @@ -107,7 +87,7 @@ jobs: - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 - if: ${{ (success()) && (steps.skip.outputs.continue == '1') }} + if: success() # see: https://github.com/actions/toolkit/issues/399 continue-on-error: true with: From 6773df938713997a0c6d16a9dbf6fb94970000f5 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 5 Sep 2022 15:18:38 +0200 Subject: [PATCH 049/193] pkg: include lite in PL (#14536) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * pkg: include lite in PL * Apply suggestions from code review * ci: nb dirs Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Carlos Mocholí --- .github/actions/pkg-check/action.yml | 6 +++++- .github/workflows/ci-pkg-install.yml | 2 ++ src/pytorch_lightning/__setup__.py | 14 +++++++++++++- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/.github/actions/pkg-check/action.yml b/.github/actions/pkg-check/action.yml index 26ae8ddc88a7c..bfd1602e69af3 100644 --- a/.github/actions/pkg-check/action.yml +++ b/.github/actions/pkg-check/action.yml @@ -5,6 +5,10 @@ inputs: pkg-name: description: package name inside lightning.* required: true + nb-dirs: + description: nb of packages in the wrap/distribution + required: false + default: "1" runs: using: "composite" @@ -54,7 +58,7 @@ runs: # list folders without ending .egg-info dirs = [d for d in glob.glob(os.path.join("*", "src", "*")) if not d.endswith(".egg-info")] print(dirs) - assert len(dirs) == 1 + assert len(dirs) == ${{ inputs.nb-dirs }} # cleaning shutil.rmtree(pathlib.Path(dirs[0]).parent.parent) shell: python diff --git a/.github/workflows/ci-pkg-install.yml b/.github/workflows/ci-pkg-install.yml index 6eedad253941a..7a83cd46ece61 100644 --- a/.github/workflows/ci-pkg-install.yml +++ b/.github/workflows/ci-pkg-install.yml @@ -49,9 +49,11 @@ jobs: path: pypi - run: ls -lh pypi/ + - run: python -c "print('NB_DIRS=' + str(2 if '${{ matrix.pkg }}' == 'pytorch' else 1))" >> $GITHUB_ENV - uses: ./.github/actions/pkg-check with: pkg-name: ${{ matrix.pkg }} + nb-dirs: ${{ env.NB_DIRS }} - uses: actions/upload-artifact@v3 with: diff --git a/src/pytorch_lightning/__setup__.py b/src/pytorch_lightning/__setup__.py index 8fd93c77eeab1..8085574a5d2ff 100644 --- a/src/pytorch_lightning/__setup__.py +++ b/src/pytorch_lightning/__setup__.py @@ -53,6 +53,10 @@ def _adjust_manifest(**__: Any) -> None: lines += [ "recursive-exclude src *.md" + os.linesep, "recursive-exclude requirements *.txt" + os.linesep, + # TODO: remove after the first standalone Lite release + "recursive-include requirements/lite *.txt" + os.linesep, + # TODO: remove after the first standalone Lite release + "recursive-include src/lightning_lite *.md" + os.linesep, "recursive-include src/pytorch_lightning *.md" + os.linesep, "recursive-include requirements/pytorch *.txt" + os.linesep, "include src/pytorch_lightning/py.typed" + os.linesep, # marker file for PEP 561 @@ -78,7 +82,15 @@ def _setup_args(**__: Any) -> Dict[str, Any]: 
url=_about.__homepage__, download_url="https://github.com/Lightning-AI/lightning", license=_about.__license__, - packages=find_packages(where="src", include=["pytorch_lightning", "pytorch_lightning.*"]), + packages=find_packages( + where="src", + include=[ + "pytorch_lightning", + "pytorch_lightning.*", + "lightning_lite", # TODO: remove after the first standalone Lite release + "lightning_lite.*", # TODO: remove after the first standalone Lite release + ], + ), package_dir={"": "src"}, include_package_data=True, long_description=_long_description, From ce702fd40e5301cdb27a17bc9c22aac1b8299a41 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Mon, 5 Sep 2022 19:31:51 +0530 Subject: [PATCH 050/193] Squeeze tensor while logging (#14489) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- src/pytorch_lightning/CHANGELOG.md | 3 +++ src/pytorch_lightning/core/module.py | 12 +++++------- .../trainer/logging_/test_train_loop_logging.py | 11 +++++++++++ 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index ba8bf05f49bb3..cae69427acdf0 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -136,6 +136,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed an issue to keep downscaling the batch size in case there hasn't been even a single successful optimal batch size with `mode="power"` ([#14372](https://github.com/Lightning-AI/lightning/pull/14372)) +- Squeezed tensor values when logging with `LightningModule.log` ([#14489](https://github.com/Lightning-AI/lightning/pull/14489)) + + - Fixed `WandbLogger` `save_dir` is not set after creation ([#14326](https://github.com/Lightning-AI/lightning/pull/14326)) diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py index a479beadc7931..a8fea8c210959 100644 --- a/src/pytorch_lightning/core/module.py +++ b/src/pytorch_lightning/core/module.py @@ -423,8 +423,7 @@ def log( " but it should not contain information about `dataloader_idx`" ) - value = apply_to_collection(value, numbers.Number, self.__to_tensor) - apply_to_collection(value, torch.Tensor, self.__check_numel_1, name) + value = apply_to_collection(value, (torch.Tensor, numbers.Number), self.__to_tensor, name) if self.trainer._logger_connector.should_reset_tensors(self._current_fx_name): # if we started a new epoch (running its first batch) the hook name has changed @@ -556,16 +555,15 @@ def __check_not_nested(value: dict, name: str) -> None: def __check_allowed(v: Any, name: str, value: Any) -> None: raise ValueError(f"`self.log({name}, {value})` was called, but `{type(v).__name__}` values cannot be logged") - def __to_tensor(self, value: numbers.Number) -> Tensor: - return torch.tensor(value, device=self.device) - - @staticmethod - def __check_numel_1(value: Tensor, name: str) -> None: + def __to_tensor(self, value: Union[torch.Tensor, numbers.Number], name: str) -> Tensor: + value = torch.tensor(value, device=self.device) if not torch.numel(value) == 1: raise ValueError( f"`self.log({name}, {value})` was called, but the tensor must have a single element." f" You can try doing `self.log({name}, {value}.mean())`" ) + value = value.squeeze() + return value def log_grad_norm(self, grad_norm_dict: Dict[str, float]) -> None: """Override this method to change the default behaviour of ``log_grad_norm``. 
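Editorial note (not part of the patch): with the ``__to_tensor`` change above, a single-element tensor passed to ``self.log`` is stored as a scalar (0-dim) tensor in ``trainer.callback_metrics``, while multi-element tensors still raise. A short usage sketch, assuming a minimal module based on ``BoringModel`` whose ``training_step`` logs a shape ``(1,)`` tensor; the class name is illustrative only:

.. code-block:: python

    import torch

    from pytorch_lightning import Trainer
    from pytorch_lightning.demos.boring_classes import BoringModel


    class SqueezeLoggingModel(BoringModel):
        def training_step(self, batch, batch_idx):
            out = super().training_step(batch, batch_idx)
            self.log("foo", torch.tensor([1.2]))  # shape (1,) -> stored as a 0-dim scalar
            # self.log("bar", torch.tensor([1.2, 3.4]))  # would raise: tensor must have a single element
            return out


    if __name__ == "__main__":
        trainer = Trainer(fast_dev_run=True)
        trainer.fit(SqueezeLoggingModel())
        assert trainer.callback_metrics["foo"].ndim == 0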
diff --git a/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py b/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py index 85ed3d8e3471d..cd7f83ddc7bfe 100644 --- a/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py +++ b/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py @@ -29,6 +29,7 @@ from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, TQDMProgressBar from pytorch_lightning.core.module import LightningModule from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset, RandomDictDataset +from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests_pytorch.helpers.runif import RunIf @@ -836,3 +837,13 @@ def on_train_start(self): assert mock_log_metrics.mock_calls == [call(metrics={"foo": 123.0, "epoch": 0}, step=0)] assert trainer.max_epochs > 1 + + +def test_unsqueezed_tensor_logging(): + model = BoringModel() + trainer = Trainer() + trainer.state.stage = RunningStage.TRAINING + model._current_fx_name = "training_step" + model.trainer = trainer + model.log("foo", torch.Tensor([1.2])) + assert trainer.callback_metrics["foo"].ndim == 0 From 3cf57ff11cf650dac03d330baeb05612982ce911 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sat, 3 Sep 2022 12:06:51 +0200 Subject: [PATCH 051/193] Move pl/core/mixins/device_dtype_mixin.py to lite/utilities/device_dtype_mixin.py (#14511) --- src/lightning_lite/utilities/__init__.py | 0 .../mixins => lightning_lite/utilities}/device_dtype_mixin.py | 0 src/pytorch_lightning/core/mixins/__init__.py | 2 +- 3 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 src/lightning_lite/utilities/__init__.py rename src/{pytorch_lightning/core/mixins => lightning_lite/utilities}/device_dtype_mixin.py (100%) diff --git a/src/lightning_lite/utilities/__init__.py b/src/lightning_lite/utilities/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/src/pytorch_lightning/core/mixins/device_dtype_mixin.py b/src/lightning_lite/utilities/device_dtype_mixin.py similarity index 100% rename from src/pytorch_lightning/core/mixins/device_dtype_mixin.py rename to src/lightning_lite/utilities/device_dtype_mixin.py diff --git a/src/pytorch_lightning/core/mixins/__init__.py b/src/pytorch_lightning/core/mixins/__init__.py index a48dd82429a45..3671ced07aa93 100644 --- a/src/pytorch_lightning/core/mixins/__init__.py +++ b/src/pytorch_lightning/core/mixins/__init__.py @@ -12,5 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from pytorch_lightning.core.mixins.device_dtype_mixin import DeviceDtypeModuleMixin # noqa: F401 +from lightning_lite.utilities.device_dtype_mixin import DeviceDtypeModuleMixin # noqa: F401 from pytorch_lightning.core.mixins.hparams_mixin import HyperparametersMixin # noqa: F401 From cefe2fa1232b17f9d9bb9ee80af9d92cd7e1f3db Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sat, 3 Sep 2022 12:19:57 +0200 Subject: [PATCH 052/193] Move test_dtype_device_mixin to lite (#14511) --- tests/tests_lite/helpers/runif.py | 31 ++++++ tests/tests_lite/utilities/__init__.py | 0 .../utilities/test_device_dtype_mixin.py | 94 +++++++++++++++++++ .../utilities/test_dtype_device_mixin.py | 76 +-------------- 4 files changed, 126 insertions(+), 75 deletions(-) create mode 100644 tests/tests_lite/utilities/__init__.py create mode 100644 tests/tests_lite/utilities/test_device_dtype_mixin.py diff --git a/tests/tests_lite/helpers/runif.py b/tests/tests_lite/helpers/runif.py index 280af6a96f9c4..fcdca0f9a6d22 100644 --- a/tests/tests_lite/helpers/runif.py +++ b/tests/tests_lite/helpers/runif.py @@ -12,9 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. import os +import sys +from typing import Optional import pytest import torch +from packaging.version import Version +from pkg_resources import get_distribution # TODO(lite): Add all RunIf conditions once the relevant utilities have moved to lite source dir @@ -31,6 +35,10 @@ def __new__( self, *args, min_cuda_gpus: int = 0, + min_torch: Optional[str] = None, + max_torch: Optional[str] = None, + min_python: Optional[str] = None, + skip_windows: bool = False, standalone: bool = False, **kwargs, ): @@ -38,6 +46,10 @@ def __new__( Args: *args: Any :class:`pytest.mark.skipif` arguments. min_cuda_gpus: Require this number of gpus and that the ``PL_RUN_CUDA_TESTS=1`` environment variable is set. + min_torch: Require that PyTorch is greater or equal than this version. + max_torch: Require that PyTorch is less than this version. + min_python: Require that Python is greater or equal than this version. + skip_windows: Skip for Windows platform. standalone: Mark the test as standalone, our CI will run it in a separate process. This requires that the ``PL_RUN_STANDALONE_TESTS=1`` environment variable is set. **kwargs: Any :class:`pytest.mark.skipif` keyword arguments. 
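Editorial note (not part of the patch): the new ``min_torch``, ``max_torch`` and ``min_python`` options documented above are plain ``packaging.version`` comparisons against the installed distributions, as the next hunk shows. A small sketch of the same check outside of ``RunIf``; the version thresholds below are illustrative, not taken from the patch:

.. code-block:: python

    import sys

    from packaging.version import Version
    from pkg_resources import get_distribution

    torch_version = Version(get_distribution("torch").version)
    py_version = Version(f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}")

    # the decorated test only runs when neither skip condition is true
    skip_because_torch_too_old = torch_version < Version("1.10")
    skip_because_python_too_old = py_version < Version("3.8")
    print(skip_because_torch_too_old, skip_because_python_too_old)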
@@ -51,6 +63,25 @@ def __new__( # used in conftest.py::pytest_collection_modifyitems kwargs["min_cuda_gpus"] = True + if min_torch: + torch_version = get_distribution("torch").version + conditions.append(Version(torch_version) < Version(min_torch)) + reasons.append(f"torch>={min_torch}") + + if max_torch: + torch_version = get_distribution("torch").version + conditions.append(Version(torch_version) >= Version(max_torch)) + reasons.append(f"torch<{max_torch}") + + if min_python: + py_version = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}" + conditions.append(Version(py_version) < Version(min_python)) + reasons.append(f"python>={min_python}") + + if skip_windows: + conditions.append(sys.platform == "win32") + reasons.append("unimplemented on Windows") + if standalone: env_flag = os.getenv("PL_RUN_STANDALONE_TESTS", "0") conditions.append(env_flag != "1") diff --git a/tests/tests_lite/utilities/__init__.py b/tests/tests_lite/utilities/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/tests_lite/utilities/test_device_dtype_mixin.py b/tests/tests_lite/utilities/test_device_dtype_mixin.py new file mode 100644 index 0000000000000..28e7021a9eac4 --- /dev/null +++ b/tests/tests_lite/utilities/test_device_dtype_mixin.py @@ -0,0 +1,94 @@ +import pytest +import torch +from tests_lite.helpers.runif import RunIf +from torch import nn as nn + +from lightning_lite.utilities.device_dtype_mixin import DeviceDtypeModuleMixin + + +class SubSubModule(DeviceDtypeModuleMixin): + pass + + +class SubModule(nn.Module): + def __init__(self): + super().__init__() + self.module = SubSubModule() + + +class TopModule(DeviceDtypeModuleMixin): + def __init__(self) -> None: + super().__init__() + self.module = SubModule() + + +@pytest.mark.parametrize( + "dst_device_str,dst_dtype", + [ + ("cpu", torch.half), + ("cpu", torch.float), + ("cpu", torch.double), + pytest.param("cuda:0", torch.half, marks=RunIf(min_cuda_gpus=1)), + pytest.param("cuda:0", torch.float, marks=RunIf(min_cuda_gpus=1)), + pytest.param("cuda:0", torch.double, marks=RunIf(min_cuda_gpus=1)), + pytest.param("mps:0", torch.float, marks=RunIf(mps=True)), # double and half are not yet supported. + ], +) +@RunIf(min_cuda_gpus=1) +def test_submodules_device_and_dtype(dst_device_str, dst_dtype): + """Test that the device and dtype property updates propagate through mixed nesting of regular nn.Modules and + the special modules of type DeviceDtypeModuleMixin (e.g. 
Metric or LightningModule).""" + + dst_device = torch.device(dst_device_str) + + model = TopModule() + assert model.device == torch.device("cpu") + model = model.to(device=dst_device, dtype=dst_dtype) + # nn.Module does not have these attributes + assert not hasattr(model.module, "_device") + assert not hasattr(model.module, "_dtype") + # device and dtype change should propagate down into all children + assert model.device == model.module.module.device == dst_device + assert model.dtype == model.module.module.dtype == dst_dtype + + +@pytest.mark.parametrize( + "device", + [ + None, # explicitly call without an index to see if the returning device contains an index + 0, + torch.device("cuda", 0), + ], +) +@RunIf(min_cuda_gpus=1) +def test_cuda_device(device): + model = TopModule() + + model.cuda(device) + + device = model.device + assert device.type == "cuda" + assert device.index is not None + assert device.index == torch.cuda.current_device() + + +@RunIf(min_cuda_gpus=2) +def test_cuda_current_device(): + """Test that calling .cuda() moves the model to the correct device and respects current cuda device setting.""" + + class CudaModule(DeviceDtypeModuleMixin): + def __init__(self): + super().__init__() + self.layer = nn.Linear(1, 1) + + model = CudaModule() + + torch.cuda.set_device(0) + model.cuda(1) + assert model.device == torch.device("cuda", 1) + assert model.layer.weight.device == torch.device("cuda", 1) + + torch.cuda.set_device(1) + model.cuda() # model is already on device 1, and calling .cuda() without device index should not move model + assert model.device == torch.device("cuda", 1) + assert model.layer.weight.device == torch.device("cuda", 1) diff --git a/tests/tests_pytorch/utilities/test_dtype_device_mixin.py b/tests/tests_pytorch/utilities/test_dtype_device_mixin.py index 7c17b3d9f7642..68aad3257beb5 100644 --- a/tests/tests_pytorch/utilities/test_dtype_device_mixin.py +++ b/tests/tests_pytorch/utilities/test_dtype_device_mixin.py @@ -11,12 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import pytest -import torch import torch.nn as nn +from lightning_lite.utilities.device_dtype_mixin import DeviceDtypeModuleMixin from pytorch_lightning import Callback, Trainer -from pytorch_lightning.core.mixins import DeviceDtypeModuleMixin from pytorch_lightning.demos.boring_classes import BoringModel from tests_pytorch.helpers.runif import RunIf @@ -46,36 +44,6 @@ def on_train_batch_start(self, trainer, model, batch, batch_idx): assert model.device == model.module.module.device -@pytest.mark.parametrize( - "dst_device_str,dst_dtype", - [ - ("cpu", torch.half), - ("cpu", torch.float), - ("cpu", torch.double), - pytest.param("cuda:0", torch.half, marks=RunIf(min_cuda_gpus=1)), - pytest.param("cuda:0", torch.float, marks=RunIf(min_cuda_gpus=1)), - pytest.param("cuda:0", torch.double, marks=RunIf(min_cuda_gpus=1)), - pytest.param("mps:0", torch.float, marks=RunIf(mps=True)), # double and half are not yet supported. - ], -) -@RunIf(min_cuda_gpus=1) -def test_submodules_device_and_dtype(dst_device_str, dst_dtype): - """Test that the device and dtype property updates propagate through mixed nesting of regular nn.Modules and - the special modules of type DeviceDtypeModuleMixin (e.g. 
Metric or LightningModule).""" - - dst_device = torch.device(dst_device_str) - - model = TopModule() - assert model.device == torch.device("cpu") - model = model.to(device=dst_device, dtype=dst_dtype) - # nn.Module does not have these attributes - assert not hasattr(model.module, "_device") - assert not hasattr(model.module, "_dtype") - # device and dtype change should propagate down into all children - assert model.device == model.module.module.device == dst_device - assert model.dtype == model.module.module.dtype == dst_dtype - - @RunIf(min_cuda_gpus=2) def test_submodules_multi_gpu_dp(tmpdir): model = TopModule() @@ -102,45 +70,3 @@ def test_submodules_multi_gpu_ddp_spawn(tmpdir): max_steps=1, ) trainer.fit(model) - - -@pytest.mark.parametrize( - "device", - [ - None, # explicitly call without an index to see if the returning device contains an index - 0, - torch.device("cuda", 0), - ], -) -@RunIf(min_cuda_gpus=1) -def test_cuda_device(device): - model = TopModule() - - model.cuda(device) - - device = model.device - assert device.type == "cuda" - assert device.index is not None - assert device.index == torch.cuda.current_device() - - -@RunIf(min_cuda_gpus=2) -def test_cuda_current_device(): - """Test that calling .cuda() moves the model to the correct device and respects current cuda device setting.""" - - class CudaModule(DeviceDtypeModuleMixin): - def __init__(self): - super().__init__() - self.layer = nn.Linear(1, 1) - - model = CudaModule() - - torch.cuda.set_device(0) - model.cuda(1) - assert model.device == torch.device("cuda", 1) - assert model.layer.weight.device == torch.device("cuda", 1) - - torch.cuda.set_device(1) - model.cuda() # model is already on device 1, and calling .cuda() without device index should not move model - assert model.device == torch.device("cuda", 1) - assert model.layer.weight.device == torch.device("cuda", 1) From c2879c20daa9f87b3d08ffba9bac2b32522dbb99 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sat, 3 Sep 2022 14:16:28 +0200 Subject: [PATCH 053/193] Deprecate pl/core/mixins/device_dtype_mixin and update imports (#14511) --- src/pytorch_lightning/CHANGELOG.md | 3 +++ .../core/mixins/device_dtype_mixin.py | 26 +++++++++++++++++++ src/pytorch_lightning/core/module.py | 3 ++- src/pytorch_lightning/lite/wrappers.py | 2 +- src/pytorch_lightning/overrides/base.py | 2 +- .../connectors/logger_connector/result.py | 2 +- .../deprecated_api/test_remove_1-10.py | 9 +++++++ tests/tests_pytorch/lite/test_wrappers.py | 2 +- 8 files changed, 44 insertions(+), 5 deletions(-) create mode 100644 src/pytorch_lightning/core/mixins/device_dtype_mixin.py diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index cae69427acdf0..28063a041ec5c 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -72,6 +72,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Deprecated the `on_colab_kaggle` function ([#14247](https://github.com/Lightning-AI/lightning/pull/14247)) +- Deprecated the `pl.core.mixins.DeviceDtypeModuleMixin` in favor of `lightning_lite.utilities.DeviceDtypeModuleMixin` ([#14511](https://github.com/Lightning-AI/lightning/pull/14511)) + + ### Removed - Removed the deprecated `Trainer.training_type_plugin` property in favor of `Trainer.strategy` ([#14011](https://github.com/Lightning-AI/lightning/pull/14011)) diff --git a/src/pytorch_lightning/core/mixins/device_dtype_mixin.py b/src/pytorch_lightning/core/mixins/device_dtype_mixin.py new file mode 100644 index 0000000000000..9e85fb7742b19 --- /dev/null +++ b/src/pytorch_lightning/core/mixins/device_dtype_mixin.py @@ -0,0 +1,26 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from lightning_lite.utilities.device_dtype_mixin import DeviceDtypeModuleMixin as NewDeviceDtypeModuleMixin +from pytorch_lightning.utilities import rank_zero_deprecation + + +class DeviceDtypeModuleMixin(NewDeviceDtypeModuleMixin): + def __init__(self) -> None: + rank_zero_deprecation( + "`pytorch_lightning.core.mixins.DeviceDtypeModuleMixin` has been deprecated in v1.8.0 and will be" + " removed in v1.10.0. Please use `lightning_lite.utilities.device_dtype_mixin.DeviceDtypeModuleMixin`" + " instead." 
+ ) + super().__init__() diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py index a8fea8c210959..8cbb67317c96c 100644 --- a/src/pytorch_lightning/core/module.py +++ b/src/pytorch_lightning/core/module.py @@ -32,9 +32,10 @@ from typing_extensions import Literal import pytorch_lightning as pl +from lightning_lite.utilities.device_dtype_mixin import DeviceDtypeModuleMixin from pytorch_lightning.callbacks.callback import Callback from pytorch_lightning.core.hooks import CheckpointHooks, DataHooks, ModelHooks -from pytorch_lightning.core.mixins import DeviceDtypeModuleMixin, HyperparametersMixin +from pytorch_lightning.core.mixins import HyperparametersMixin from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.core.saving import ModelIO from pytorch_lightning.loggers import Logger, LoggerCollection diff --git a/src/pytorch_lightning/lite/wrappers.py b/src/pytorch_lightning/lite/wrappers.py index 96d6e6233115c..2675f87b9442a 100644 --- a/src/pytorch_lightning/lite/wrappers.py +++ b/src/pytorch_lightning/lite/wrappers.py @@ -19,7 +19,7 @@ from torch.optim import Optimizer from torch.utils.data import DataLoader -from pytorch_lightning.core.mixins import DeviceDtypeModuleMixin +from lightning_lite.utilities.device_dtype_mixin import DeviceDtypeModuleMixin from pytorch_lightning.plugins import PrecisionPlugin from pytorch_lightning.strategies import Strategy from pytorch_lightning.utilities.apply_func import apply_to_collection, move_data_to_device diff --git a/src/pytorch_lightning/overrides/base.py b/src/pytorch_lightning/overrides/base.py index 07f30c271b207..c169431a2d2fd 100644 --- a/src/pytorch_lightning/overrides/base.py +++ b/src/pytorch_lightning/overrides/base.py @@ -19,7 +19,7 @@ from torch.nn.parallel import DistributedDataParallel import pytorch_lightning as pl -from pytorch_lightning.core.mixins import DeviceDtypeModuleMixin +from lightning_lite.utilities.device_dtype_mixin import DeviceDtypeModuleMixin from pytorch_lightning.utilities import rank_zero_deprecation diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py index 7fb56c21c2edf..6b9d9ce8268f7 100644 --- a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -20,7 +20,7 @@ from torchmetrics import Metric from typing_extensions import TypedDict -from pytorch_lightning.core.mixins import DeviceDtypeModuleMixin +from lightning_lite.utilities.device_dtype_mixin import DeviceDtypeModuleMixin from pytorch_lightning.utilities.apply_func import apply_to_collection, apply_to_collections, move_data_to_device from pytorch_lightning.utilities.data import extract_batch_size from pytorch_lightning.utilities.distributed import distributed_available diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-10.py b/tests/tests_pytorch/deprecated_api/test_remove_1-10.py index 40a4069001505..c60c86c72a33c 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-10.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-10.py @@ -15,6 +15,7 @@ import pytest from pytorch_lightning import Trainer +from pytorch_lightning.core.mixins.device_dtype_mixin import DeviceDtypeModuleMixin from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.overrides import LightningDistributedModule, LightningParallelModule from pytorch_lightning.overrides.base import 
unwrap_lightning_module @@ -69,3 +70,11 @@ def test_v1_10_deprecated_unwrap_lightning_module_sharded(): def test_v1_10_deprecated_on_colab_kaggle_func(): with pytest.deprecated_call(match="The function `on_colab_kaggle` has been deprecated in v1.8.0"): on_colab_kaggle() + + +def test_v1_10_deprecated_device_dtype_module_mixin(): + class MyModule(DeviceDtypeModuleMixin): + pass + + with pytest.deprecated_call(match="mixins.DeviceDtypeModuleMixin` has been deprecated in v1.8.0"): + MyModule() diff --git a/tests/tests_pytorch/lite/test_wrappers.py b/tests/tests_pytorch/lite/test_wrappers.py index 1098d2f0f8459..03799babf94c6 100644 --- a/tests/tests_pytorch/lite/test_wrappers.py +++ b/tests/tests_pytorch/lite/test_wrappers.py @@ -17,7 +17,7 @@ import torch from torch.utils.data.dataloader import DataLoader -from pytorch_lightning.core.mixins import DeviceDtypeModuleMixin +from lightning_lite.utilities.device_dtype_mixin import DeviceDtypeModuleMixin from pytorch_lightning.lite import LightningLite from pytorch_lightning.lite.wrappers import _LiteDataLoader, _LiteModule, _LiteOptimizer from tests_pytorch.helpers.runif import RunIf From 75d5a2d046974f57748019d42c78862f980110dc Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sat, 3 Sep 2022 14:04:27 +0200 Subject: [PATCH 054/193] move pl/utilities/xla_device.py to lite/utilities/xla_device.py (#14514) --- .../utilities/xla_device.py | 0 src/pytorch_lightning/utilities/imports.py | 2 +- .../utilities/test_xla_device_utils.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename src/{pytorch_lightning => lightning_lite}/utilities/xla_device.py (100%) rename tests/{tests_pytorch => tests_lite}/utilities/test_xla_device_utils.py (96%) diff --git a/src/pytorch_lightning/utilities/xla_device.py b/src/lightning_lite/utilities/xla_device.py similarity index 100% rename from src/pytorch_lightning/utilities/xla_device.py rename to src/lightning_lite/utilities/xla_device.py diff --git a/src/pytorch_lightning/utilities/imports.py b/src/pytorch_lightning/utilities/imports.py index b04aec50bc1ee..fafc693228b0b 100644 --- a/src/pytorch_lightning/utilities/imports.py +++ b/src/pytorch_lightning/utilities/imports.py @@ -147,7 +147,7 @@ def __repr__(self) -> str: _XLA_AVAILABLE: bool = _package_available("torch_xla") -from pytorch_lightning.utilities.xla_device import XLADeviceUtils # noqa: E402 +from lightning_lite.utilities.xla_device import XLADeviceUtils # noqa: E402 _TPU_AVAILABLE = XLADeviceUtils.tpu_device_exists() diff --git a/tests/tests_pytorch/utilities/test_xla_device_utils.py b/tests/tests_lite/utilities/test_xla_device_utils.py similarity index 96% rename from tests/tests_pytorch/utilities/test_xla_device_utils.py rename to tests/tests_lite/utilities/test_xla_device_utils.py index 3485a217dfb05..d8f6003c6a55a 100644 --- a/tests/tests_pytorch/utilities/test_xla_device_utils.py +++ b/tests/tests_lite/utilities/test_xla_device_utils.py @@ -16,7 +16,7 @@ import pytest -import pytorch_lightning.utilities.xla_device as xla_utils +import lightning_lite.utilities.xla_device as xla_utils from pytorch_lightning.utilities import _XLA_AVAILABLE from tests_pytorch.helpers.runif import RunIf From 165427a506c3ee25ae3d159c1ca5e10f905a737e Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sun, 4 Sep 2022 21:43:08 +0200 Subject: [PATCH 055/193] Deprecate pl/utilities/xla_device (#14514) --- src/pytorch_lightning/CHANGELOG.md | 4 ++ src/pytorch_lightning/utilities/xla_device.py | 62 +++++++++++++++++++ .../deprecated_api/test_remove_1-10.py | 20 ++++++ 3 files 
changed, 86 insertions(+) create mode 100644 src/pytorch_lightning/utilities/xla_device.py diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 28063a041ec5c..61a6b806c35ae 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -75,6 +75,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Deprecated the `pl.core.mixins.DeviceDtypeModuleMixin` in favor of `lightning_lite.utilities.DeviceDtypeModuleMixin` ([#14511](https://github.com/Lightning-AI/lightning/pull/14511)) +- Deprecated all functions in `pytorch_lightning.utilities.xla_device` in favor of `lightning_lite.utilities.xla_device` ([#14514](https://github.com/Lightning-AI/lightning/pull/14514)) + + + ### Removed - Removed the deprecated `Trainer.training_type_plugin` property in favor of `Trainer.strategy` ([#14011](https://github.com/Lightning-AI/lightning/pull/14011)) diff --git a/src/pytorch_lightning/utilities/xla_device.py b/src/pytorch_lightning/utilities/xla_device.py new file mode 100644 index 0000000000000..1d6347c6e6a25 --- /dev/null +++ b/src/pytorch_lightning/utilities/xla_device.py @@ -0,0 +1,62 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from multiprocessing import Queue +from typing import Any, Callable + +from lightning_lite.utilities.xla_device import inner_f as new_inner_f +from lightning_lite.utilities.xla_device import pl_multi_process as new_pl_multi_process +from lightning_lite.utilities.xla_device import XLADeviceUtils as NewXLADeviceUtils +from pytorch_lightning.utilities import rank_zero_deprecation # TODO(lite): update to lightning_lite.utilities + + +def inner_f(queue: Queue, func: Callable, *args: Any, **kwargs: Any) -> None: # pragma: no cover + rank_zero_deprecation( + "`pytorch_lightning.utilities.xla_device.inner_f` has been deprecated in v1.8.0 and will be" + " removed in v1.10.0. Please use `lightning_lite.utilities.xla_device.inner_f` instead." + ) + return new_inner_f(queue, func, *args, **kwargs) + + +def pl_multi_process(func: Callable) -> Callable: + rank_zero_deprecation( + "`pytorch_lightning.utilities.xla_device.pl_multi_process` has been deprecated in v1.8.0 and will be" + " removed in v1.10.0. Please use `lightning_lite.utilities.xla_device.pl_multi_process` instead." + ) + return new_pl_multi_process(func) + + +class XLADeviceUtils(NewXLADeviceUtils): + def __init__(self) -> None: + rank_zero_deprecation( + "`pytorch_lightning.utilities.xla_device.XLADeviceUtils` has been deprecated in v1.8.0 and will be" + " removed in v1.10.0. Please use `lightning_lite.utilities.xla_device.XLADeviceUtils` instead." + ) + super().__init__() + + @staticmethod + def xla_available() -> bool: + rank_zero_deprecation( + "`pytorch_lightning.utilities.xla_device.XLADeviceUtils` has been deprecated in v1.8.0 and will be" + " removed in v1.10.0. Please use `lightning_lite.utilities.xla_device.XLADeviceUtils` instead." 
+ ) + return NewXLADeviceUtils.xla_available() + + @staticmethod + def tpu_device_exists() -> bool: + rank_zero_deprecation( + "`pytorch_lightning.utilities.xla_device.XLADeviceUtils` has been deprecated in v1.8.0 and will be" + " removed in v1.10.0. Please use `lightning_lite.utilities.xla_device.XLADeviceUtils` instead." + ) + return NewXLADeviceUtils.tpu_device_exists() diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-10.py b/tests/tests_pytorch/deprecated_api/test_remove_1-10.py index c60c86c72a33c..d9bf33a28dcd4 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-10.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-10.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Test deprecated functionality which will be removed in v1.10.0.""" +from unittest import mock + import pytest from pytorch_lightning import Trainer @@ -24,6 +26,7 @@ from pytorch_lightning.strategies.deepspeed import LightningDeepSpeedModule from pytorch_lightning.strategies.ipu import LightningIPUModule from pytorch_lightning.strategies.utils import on_colab_kaggle +from pytorch_lightning.utilities.xla_device import inner_f, pl_multi_process, XLADeviceUtils from tests_pytorch.helpers.runif import RunIf from tests_pytorch.helpers.utils import no_warning_call @@ -78,3 +81,20 @@ class MyModule(DeviceDtypeModuleMixin): with pytest.deprecated_call(match="mixins.DeviceDtypeModuleMixin` has been deprecated in v1.8.0"): MyModule() + + +def test_v1_10_deprecated_xla_device_utilities(): + with pytest.deprecated_call(match="xla_device.inner_f` has been deprecated in v1.8.0"): + inner_f(mock.Mock(), mock.Mock()) + + with pytest.deprecated_call(match="xla_device.pl_multi_process` has been deprecated in v1.8.0"): + pl_multi_process(mock.Mock) + + with pytest.deprecated_call(match="xla_device.XLADeviceUtils` has been deprecated in v1.8.0"): + XLADeviceUtils() + + with pytest.deprecated_call(match="xla_device.XLADeviceUtils` has been deprecated in v1.8.0"): + XLADeviceUtils.xla_available() + + with pytest.deprecated_call(match="xla_device.XLADeviceUtils` has been deprecated in v1.8.0"): + XLADeviceUtils.tpu_device_exists() From def654859642cf2df99e8d5c98d4a3014c69398a Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sat, 3 Sep 2022 14:27:34 +0200 Subject: [PATCH 056/193] move pl/utilities/cloud_io.py to lite/utilities/cloud_io.py (#14515) --- .../utilities/cloud_io.py | 0 src/pytorch_lightning/callbacks/model_checkpoint.py | 2 +- src/pytorch_lightning/cli.py | 2 +- src/pytorch_lightning/core/module.py | 2 +- src/pytorch_lightning/core/saving.py | 4 ++-- src/pytorch_lightning/loggers/tensorboard.py | 2 +- src/pytorch_lightning/plugins/environments/lsf_environment.py | 2 +- src/pytorch_lightning/plugins/io/hpu_plugin.py | 2 +- src/pytorch_lightning/plugins/io/torch_plugin.py | 4 ++-- src/pytorch_lightning/plugins/io/xla_plugin.py | 2 +- src/pytorch_lightning/profilers/profiler.py | 2 +- src/pytorch_lightning/strategies/ipu.py | 2 +- .../trainer/connectors/checkpoint_connector.py | 2 +- src/pytorch_lightning/trainer/trainer.py | 2 +- .../{tests_pytorch => tests_lite}/utilities/test_cloud_io.py | 2 +- tests/tests_pytorch/checkpointing/test_model_checkpoint.py | 2 +- tests/tests_pytorch/lite/test_parity.py | 2 +- tests/tests_pytorch/models/test_torchscript.py | 2 +- tests/tests_pytorch/trainer/test_trainer.py | 2 +- 19 files changed, 20 insertions(+), 20 deletions(-) rename src/{pytorch_lightning => lightning_lite}/utilities/cloud_io.py 
(100%) rename tests/{tests_pytorch => tests_lite}/utilities/test_cloud_io.py (94%) diff --git a/src/pytorch_lightning/utilities/cloud_io.py b/src/lightning_lite/utilities/cloud_io.py similarity index 100% rename from src/pytorch_lightning/utilities/cloud_io.py rename to src/lightning_lite/utilities/cloud_io.py diff --git a/src/pytorch_lightning/callbacks/model_checkpoint.py b/src/pytorch_lightning/callbacks/model_checkpoint.py index 3362d07902184..a442459769ab7 100644 --- a/src/pytorch_lightning/callbacks/model_checkpoint.py +++ b/src/pytorch_lightning/callbacks/model_checkpoint.py @@ -34,8 +34,8 @@ from torch import Tensor import pytorch_lightning as pl +from lightning_lite.utilities.cloud_io import get_filesystem from pytorch_lightning.callbacks import Checkpoint -from pytorch_lightning.utilities.cloud_io import get_filesystem from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_info, rank_zero_warn from pytorch_lightning.utilities.types import _PATH, STEP_OUTPUT diff --git a/src/pytorch_lightning/cli.py b/src/pytorch_lightning/cli.py index 700307b6ef1bc..27b9c0487ce6b 100644 --- a/src/pytorch_lightning/cli.py +++ b/src/pytorch_lightning/cli.py @@ -20,8 +20,8 @@ from torch.optim import Optimizer import pytorch_lightning as pl +from lightning_lite.utilities.cloud_io import get_filesystem from pytorch_lightning import Callback, LightningDataModule, LightningModule, seed_everything, Trainer -from pytorch_lightning.utilities.cloud_io import get_filesystem from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _RequirementAvailable from pytorch_lightning.utilities.model_helpers import is_overridden diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py index 8cbb67317c96c..87eddc0eaef62 100644 --- a/src/pytorch_lightning/core/module.py +++ b/src/pytorch_lightning/core/module.py @@ -32,6 +32,7 @@ from typing_extensions import Literal import pytorch_lightning as pl +from lightning_lite.utilities.cloud_io import get_filesystem from lightning_lite.utilities.device_dtype_mixin import DeviceDtypeModuleMixin from pytorch_lightning.callbacks.callback import Callback from pytorch_lightning.core.hooks import CheckpointHooks, DataHooks, ModelHooks @@ -42,7 +43,6 @@ from pytorch_lightning.trainer.connectors.logger_connector.fx_validator import _FxValidator from pytorch_lightning.utilities import _IS_WINDOWS, _TORCH_GREATER_EQUAL_1_10, GradClipAlgorithmType from pytorch_lightning.utilities.apply_func import apply_to_collection, convert_to_tensors -from pytorch_lightning.utilities.cloud_io import get_filesystem from pytorch_lightning.utilities.distributed import distributed_available, sync_ddp from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_11, _TORCH_GREATER_EQUAL_1_13 diff --git a/src/pytorch_lightning/core/saving.py b/src/pytorch_lightning/core/saving.py index 380338f5c0312..9ebd5efc7b891 100644 --- a/src/pytorch_lightning/core/saving.py +++ b/src/pytorch_lightning/core/saving.py @@ -26,10 +26,10 @@ import yaml import pytorch_lightning as pl +from lightning_lite.utilities.cloud_io import get_filesystem +from lightning_lite.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE, AttributeDict from pytorch_lightning.utilities.apply_func import 
apply_to_collection -from pytorch_lightning.utilities.cloud_io import get_filesystem -from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.migration import pl_legacy_patch from pytorch_lightning.utilities.parsing import parse_class_init_keys from pytorch_lightning.utilities.rank_zero import rank_zero_warn diff --git a/src/pytorch_lightning/loggers/tensorboard.py b/src/pytorch_lightning/loggers/tensorboard.py index 25e1fa8be5193..7ff19b8c38c89 100644 --- a/src/pytorch_lightning/loggers/tensorboard.py +++ b/src/pytorch_lightning/loggers/tensorboard.py @@ -27,9 +27,9 @@ from torch.utils.tensorboard.summary import hparams import pytorch_lightning as pl +from lightning_lite.utilities.cloud_io import get_filesystem from pytorch_lightning.core.saving import save_hparams_to_yaml from pytorch_lightning.loggers.logger import Logger, rank_zero_experiment -from pytorch_lightning.utilities.cloud_io import get_filesystem from pytorch_lightning.utilities.imports import _OMEGACONF_AVAILABLE from pytorch_lightning.utilities.logger import _add_prefix, _convert_params, _flatten_dict from pytorch_lightning.utilities.logger import _sanitize_params as _utils_sanitize_params diff --git a/src/pytorch_lightning/plugins/environments/lsf_environment.py b/src/pytorch_lightning/plugins/environments/lsf_environment.py index 359add5137bad..b1f592bd9b04d 100644 --- a/src/pytorch_lightning/plugins/environments/lsf_environment.py +++ b/src/pytorch_lightning/plugins/environments/lsf_environment.py @@ -16,9 +16,9 @@ import socket from typing import Dict, List +from lightning_lite.utilities.cloud_io import get_filesystem from pytorch_lightning import _logger as log from pytorch_lightning.plugins.environments import ClusterEnvironment -from pytorch_lightning.utilities.cloud_io import get_filesystem class LSFEnvironment(ClusterEnvironment): diff --git a/src/pytorch_lightning/plugins/io/hpu_plugin.py b/src/pytorch_lightning/plugins/io/hpu_plugin.py index c72d1d9fcd112..b2a6893848309 100644 --- a/src/pytorch_lightning/plugins/io/hpu_plugin.py +++ b/src/pytorch_lightning/plugins/io/hpu_plugin.py @@ -17,9 +17,9 @@ import torch +from lightning_lite.utilities.cloud_io import atomic_save, get_filesystem from pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO from pytorch_lightning.utilities.apply_func import move_data_to_device -from pytorch_lightning.utilities.cloud_io import atomic_save, get_filesystem from pytorch_lightning.utilities.types import _PATH diff --git a/src/pytorch_lightning/plugins/io/torch_plugin.py b/src/pytorch_lightning/plugins/io/torch_plugin.py index 0e5cba3837de3..ccdc4874a197d 100644 --- a/src/pytorch_lightning/plugins/io/torch_plugin.py +++ b/src/pytorch_lightning/plugins/io/torch_plugin.py @@ -16,9 +16,9 @@ from typing import Any, Callable, Dict, Optional import pytorch_lightning as pl +from lightning_lite.utilities.cloud_io import atomic_save, get_filesystem +from lightning_lite.utilities.cloud_io import load as pl_load from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO -from pytorch_lightning.utilities.cloud_io import atomic_save, get_filesystem -from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.rank_zero import rank_zero_warn from pytorch_lightning.utilities.types import _PATH diff --git a/src/pytorch_lightning/plugins/io/xla_plugin.py b/src/pytorch_lightning/plugins/io/xla_plugin.py index 3868995eea2c7..6593843b37ef1 100644 --- a/src/pytorch_lightning/plugins/io/xla_plugin.py 
+++ b/src/pytorch_lightning/plugins/io/xla_plugin.py @@ -14,10 +14,10 @@ import os from typing import Any, Dict, Optional +from lightning_lite.utilities.cloud_io import get_filesystem from pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE, _TPU_AVAILABLE from pytorch_lightning.utilities.apply_func import apply_to_collection -from pytorch_lightning.utilities.cloud_io import get_filesystem from pytorch_lightning.utilities.types import _PATH if _TPU_AVAILABLE: diff --git a/src/pytorch_lightning/profilers/profiler.py b/src/pytorch_lightning/profilers/profiler.py index 755007ba743f1..1dc91efc83bb1 100644 --- a/src/pytorch_lightning/profilers/profiler.py +++ b/src/pytorch_lightning/profilers/profiler.py @@ -19,7 +19,7 @@ from pathlib import Path from typing import Any, Callable, Dict, Generator, Iterable, Optional, TextIO, Union -from pytorch_lightning.utilities.cloud_io import get_filesystem +from lightning_lite.utilities.cloud_io import get_filesystem from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation log = logging.getLogger(__name__) diff --git a/src/pytorch_lightning/strategies/ipu.py b/src/pytorch_lightning/strategies/ipu.py index b254c5df16ca5..05078ef8c57a1 100644 --- a/src/pytorch_lightning/strategies/ipu.py +++ b/src/pytorch_lightning/strategies/ipu.py @@ -20,6 +20,7 @@ from torch.utils.data import DataLoader, Sampler import pytorch_lightning as pl +from lightning_lite.utilities.cloud_io import get_filesystem from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO @@ -30,7 +31,6 @@ from pytorch_lightning.trainer.states import RunningStage, TrainerFn from pytorch_lightning.utilities import _IPU_AVAILABLE, _POPTORCH_AVAILABLE, rank_zero_warn from pytorch_lightning.utilities.apply_func import apply_to_collection -from pytorch_lightning.utilities.cloud_io import get_filesystem from pytorch_lightning.utilities.data import _get_dataloader_init_args_and_kwargs, _reinstantiate_wrapped_cls from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/src/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/src/pytorch_lightning/trainer/connectors/checkpoint_connector.py index f0b35e054d6d5..647d505eb3568 100644 --- a/src/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/src/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -23,10 +23,10 @@ from torchmetrics import Metric import pytorch_lightning as pl +from lightning_lite.utilities.cloud_io import get_filesystem from pytorch_lightning.plugins.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE -from pytorch_lightning.utilities.cloud_io import get_filesystem from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _fault_tolerant_training from pytorch_lightning.utilities.migration import pl_legacy_patch diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index 963c44dde21b9..19b3b1192248a 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ 
b/src/pytorch_lightning/trainer/trainer.py @@ -36,6 +36,7 @@ from torch.utils.data import DataLoader import pytorch_lightning as pl +from lightning_lite.utilities.cloud_io import get_filesystem from pytorch_lightning.accelerators import ( Accelerator, CUDAAccelerator, @@ -100,7 +101,6 @@ parse_env_variables, ) from pytorch_lightning.utilities.auto_restart import _add_capture_metadata_collate -from pytorch_lightning.utilities.cloud_io import get_filesystem from pytorch_lightning.utilities.data import _auto_add_worker_init_fn, has_len_all_ranks from pytorch_lightning.utilities.distributed import distributed_available from pytorch_lightning.utilities.exceptions import ExitGracefullyException, MisconfigurationException diff --git a/tests/tests_pytorch/utilities/test_cloud_io.py b/tests/tests_lite/utilities/test_cloud_io.py similarity index 94% rename from tests/tests_pytorch/utilities/test_cloud_io.py rename to tests/tests_lite/utilities/test_cloud_io.py index b2cbd5bead649..8148ce3501b92 100644 --- a/tests/tests_pytorch/utilities/test_cloud_io.py +++ b/tests/tests_lite/utilities/test_cloud_io.py @@ -16,7 +16,7 @@ import fsspec from fsspec.implementations.local import LocalFileSystem -from pytorch_lightning.utilities.cloud_io import get_filesystem +from lightning_lite.utilities.cloud_io import get_filesystem def test_get_filesystem_custom_filesystem(): diff --git a/tests/tests_pytorch/checkpointing/test_model_checkpoint.py b/tests/tests_pytorch/checkpointing/test_model_checkpoint.py index ebe0769d8df07..ccda0d7bcc408 100644 --- a/tests/tests_pytorch/checkpointing/test_model_checkpoint.py +++ b/tests/tests_pytorch/checkpointing/test_model_checkpoint.py @@ -32,11 +32,11 @@ import pytorch_lightning as pl import tests_pytorch.helpers.utils as tutils +from lightning_lite.utilities.cloud_io import load as pl_load from pytorch_lightning import seed_everything, Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.loggers import TensorBoardLogger -from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _OMEGACONF_AVAILABLE from tests_pytorch.helpers.runif import RunIf diff --git a/tests/tests_pytorch/lite/test_parity.py b/tests/tests_pytorch/lite/test_parity.py index e294094799196..d8f5df5f6e1e9 100644 --- a/tests/tests_pytorch/lite/test_parity.py +++ b/tests/tests_pytorch/lite/test_parity.py @@ -27,12 +27,12 @@ from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler +from lightning_lite.utilities.cloud_io import atomic_save from pytorch_lightning.demos.boring_classes import RandomDataset from pytorch_lightning.lite import LightningLite from pytorch_lightning.plugins.environments.lightning_environment import find_free_network_port from pytorch_lightning.strategies.ddp_spawn import DDPSpawnStrategy from pytorch_lightning.utilities.apply_func import apply_to_collection, move_data_to_device -from pytorch_lightning.utilities.cloud_io import atomic_save from tests_pytorch.helpers.runif import RunIf diff --git a/tests/tests_pytorch/models/test_torchscript.py b/tests/tests_pytorch/models/test_torchscript.py index 150ea86044be6..fc63d661ab5e7 100644 --- a/tests/tests_pytorch/models/test_torchscript.py +++ b/tests/tests_pytorch/models/test_torchscript.py @@ -19,8 +19,8 @@ import torch from fsspec.implementations.local import LocalFileSystem 
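
For downstream code, the practical effect of this rename is only an import-path change: `atomic_save`, `get_filesystem` and `load` keep their call patterns and move from `pytorch_lightning.utilities.cloud_io` to `lightning_lite.utilities.cloud_io`. A minimal round-trip sketch of the relocated helpers, assuming a local directory; the helper names are the ones being re-imported in the hunks above, while the exact calls are illustrative and not part of this patch:

    import os

    from lightning_lite.utilities.cloud_io import atomic_save, get_filesystem
    from lightning_lite.utilities.cloud_io import load as pl_load

    ckpt_dir = "checkpoints"                      # illustrative local directory
    fs = get_filesystem(ckpt_dir)                 # fsspec filesystem inferred from the path
    fs.makedirs(ckpt_dir, exist_ok=True)

    ckpt_path = os.path.join(ckpt_dir, "tmp.ckpt")
    atomic_save({"step": 0}, ckpt_path)           # write the checkpoint dict atomically
    state = pl_load(ckpt_path)                    # load it back through fsspec + torch.load
    assert state == {"step": 0}
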
+from lightning_lite.utilities.cloud_io import get_filesystem from pytorch_lightning.demos.boring_classes import BoringModel -from pytorch_lightning.utilities.cloud_io import get_filesystem from tests_pytorch.helpers.advanced_models import BasicGAN, ParityModuleRNN from tests_pytorch.helpers.runif import RunIf diff --git a/tests/tests_pytorch/trainer/test_trainer.py b/tests/tests_pytorch/trainer/test_trainer.py index 6febe1c7787cd..d1b1ef6cf9e68 100644 --- a/tests/tests_pytorch/trainer/test_trainer.py +++ b/tests/tests_pytorch/trainer/test_trainer.py @@ -35,6 +35,7 @@ import pytorch_lightning import tests_pytorch.helpers.utils as tutils +from lightning_lite.utilities.cloud_io import load as pl_load from pytorch_lightning import Callback, LightningDataModule, LightningModule, Trainer from pytorch_lightning.accelerators import CPUAccelerator, CUDAAccelerator from pytorch_lightning.callbacks import EarlyStopping, GradientAccumulationScheduler, ModelCheckpoint, Timer @@ -60,7 +61,6 @@ ) from pytorch_lightning.trainer.states import RunningStage, TrainerFn from pytorch_lightning.utilities import device_parser -from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.exceptions import DeadlockDetectedException, MisconfigurationException from pytorch_lightning.utilities.imports import _OMEGACONF_AVAILABLE, _TORCH_GREATER_EQUAL_1_12 from pytorch_lightning.utilities.seed import seed_everything From cfea2be13752415d25f769f3f38c01e6e520ab97 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sat, 3 Sep 2022 14:33:22 +0200 Subject: [PATCH 057/193] Deprecate pl/utilities/cloud_io.py (#14515) --- src/pytorch_lightning/CHANGELOG.md | 4 ++ src/pytorch_lightning/utilities/cloud_io.py | 45 +++++++++++++++++++ .../deprecated_api/test_remove_1-10.py | 12 +++++ 3 files changed, 61 insertions(+) create mode 100644 src/pytorch_lightning/utilities/cloud_io.py diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 61a6b806c35ae..08197e442f367 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -79,6 +79,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). +- Deprecated all functions in `pytorch_lightning.utilities.cloud_io` in favor of `lightning_lite.utilities.cloud_io` ([#14515](https://github.com/Lightning-AI/lightning/pull/14515)) + + + ### Removed - Removed the deprecated `Trainer.training_type_plugin` property in favor of `Trainer.strategy` ([#14011](https://github.com/Lightning-AI/lightning/pull/14011)) diff --git a/src/pytorch_lightning/utilities/cloud_io.py b/src/pytorch_lightning/utilities/cloud_io.py new file mode 100644 index 0000000000000..735b2e95ed1dc --- /dev/null +++ b/src/pytorch_lightning/utilities/cloud_io.py @@ -0,0 +1,45 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
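
The new module that follows keeps the old `pytorch_lightning.utilities.cloud_io` path importable: each function emits a deprecation message and then forwards to its `lightning_lite` counterpart. A small caller-side sketch of that behaviour, mirroring the test added at the end of this patch (the `pytest` usage here is illustrative):

    import pytest

    from pytorch_lightning.utilities.cloud_io import get_filesystem  # deprecated location still works

    with pytest.deprecated_call(match="cloud_io.get_filesystem` has been deprecated in v1.8.0"):
        fs = get_filesystem(".")  # result comes from lightning_lite.utilities.cloud_io.get_filesystem
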
+"""Utilities related to data saving/loading.""" + +from typing import Any + +from lightning_lite.utilities.cloud_io import atomic_save as new_atomic_save +from lightning_lite.utilities.cloud_io import get_filesystem as new_get_filesystem +from lightning_lite.utilities.cloud_io import load as new_load +from pytorch_lightning.utilities import rank_zero_deprecation # TODO(lite): change to lightning_lite.utilities + + +def atomic_save(*args: Any, **kwargs: Any) -> Any: + rank_zero_deprecation( + "`pytorch_lightning.utilities.cloud_io.atomic_save` has been deprecated in v1.8.0 and will be" + " removed in v1.10.0. Please use `lightning_lite.utilities.cloud_io.atomic_save` instead." + ) + return new_atomic_save(*args, **kwargs) + + +def get_filesystem(*args: Any, **kwargs: Any) -> Any: + rank_zero_deprecation( + "`pytorch_lightning.utilities.cloud_io.get_filesystem` has been deprecated in v1.8.0 and will be" + " removed in v1.10.0. Please use `lightning_lite.utilities.cloud_io.get_filesystem` instead." + ) + return new_get_filesystem(*args, **kwargs) + + +def load(*args: Any, **kwargs: Any) -> Any: + rank_zero_deprecation( + "`pytorch_lightning.utilities.cloud_io.load` has been deprecated in v1.8.0 and will be" + " removed in v1.10.0. Please use `lightning_lite.utilities.cloud_io.load` instead." + ) + return new_load(*args, **kwargs) diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-10.py b/tests/tests_pytorch/deprecated_api/test_remove_1-10.py index d9bf33a28dcd4..ffb6abfcc9e12 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-10.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-10.py @@ -26,6 +26,7 @@ from pytorch_lightning.strategies.deepspeed import LightningDeepSpeedModule from pytorch_lightning.strategies.ipu import LightningIPUModule from pytorch_lightning.strategies.utils import on_colab_kaggle +from pytorch_lightning.utilities.cloud_io import atomic_save, get_filesystem, load from pytorch_lightning.utilities.xla_device import inner_f, pl_multi_process, XLADeviceUtils from tests_pytorch.helpers.runif import RunIf from tests_pytorch.helpers.utils import no_warning_call @@ -98,3 +99,14 @@ def test_v1_10_deprecated_xla_device_utilities(): with pytest.deprecated_call(match="xla_device.XLADeviceUtils` has been deprecated in v1.8.0"): XLADeviceUtils.tpu_device_exists() + + +def test_v1_10_deprecated_cloud_io_utilities(tmpdir): + with pytest.deprecated_call(match="cloud_io.atomic_save` has been deprecated in v1.8.0"): + atomic_save({}, tmpdir / "atomic_save.ckpt") + + with pytest.deprecated_call(match="cloud_io.get_filesystem` has been deprecated in v1.8.0"): + get_filesystem(tmpdir) + + with pytest.deprecated_call(match="cloud_io.load` has been deprecated in v1.8.0"): + load(str(tmpdir / "atomic_save.ckpt")) From 875db199710f77fdff006a022cbd4d769d62b289 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Mon, 5 Sep 2022 19:11:12 +0200 Subject: [PATCH 058/193] Add path filters for azure PR jobs (#14544) --- .azure/app-cloud-e2e.yml | 12 ++++++++++-- .azure/gpu-tests-lite.yml | 14 ++++++++++++-- .azure/hpu-tests.yml | 13 +++++++++++-- 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/.azure/app-cloud-e2e.yml b/.azure/app-cloud-e2e.yml index 1511284cad94b..eef8a8b8bfff8 100644 --- a/.azure/app-cloud-e2e.yml +++ b/.azure/app-cloud-e2e.yml @@ -20,8 +20,16 @@ trigger: - "examples/app_*" pr: - - "master" - - "release/*" + branches: + include: + - "master" + - "release/*" + paths: + include: + - ".azure/app-cloud-e2e.yml" + - 
"requirements/app/**" + - "src/lightning_app/**" + - "examples/app_*" # variables are automatically exported as environment variables so this will override pip's default cache dir variables: diff --git a/.azure/gpu-tests-lite.yml b/.azure/gpu-tests-lite.yml index 5ceccade964da..66fc3951b9ce1 100644 --- a/.azure/gpu-tests-lite.yml +++ b/.azure/gpu-tests-lite.yml @@ -22,8 +22,18 @@ trigger: - "tests/tests_lite/run_standalone_tests.sh" # a symlink to the one above pr: - - "master" - - "release/*" + branches: + include: + - "master" + - "release/*" + paths: + include: + - ".azure/gpu-tests-lite.yml" + - "requirements/lite/**" + - "src/lightning_lite/**" + - "tests/tests_lite/**" + - "tests/tests_pytorch/run_standalone_tests.sh" + - "tests/tests_lite/run_standalone_tests.sh" # a symlink to the one above jobs: - job: testing diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 785a9b0f0115c..33206ef5c3e37 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -18,8 +18,17 @@ trigger: - "tests/tests_pytorch/**" pr: - - "master" - - "release/*" + branches: + include: + - "master" + - "release/*" + paths: + include: + - ".azure/hpu-tests.yml" + - "examples/pl_hpu/mnist_sample.py" + - "requirements/pytorch/**" + - "src/pytorch_lightning/**" + - "tests/tests_pytorch/**" jobs: - job: testing From dafa3e8d2eda210a21622083945c6685ba64dba0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 5 Sep 2022 17:11:54 +0000 Subject: [PATCH 059/193] Bump tj-actions/changed-files from 29.0.1 to 29.0.3 (#14541) Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 29.0.1 to 29.0.3. - [Release notes](https://github.com/tj-actions/changed-files/releases) - [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md) - [Commits](https://github.com/tj-actions/changed-files/compare/v29.0.1...v29.0.3) --- updated-dependencies: - dependency-name: tj-actions/changed-files dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/ci-pr-gatekeeper.yml | 2 +- .github/workflows/ci-pytorch-test-conda.yml | 2 +- .github/workflows/ci-pytorch-test-full.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci-pr-gatekeeper.yml b/.github/workflows/ci-pr-gatekeeper.yml index 8714bec926c23..5c235f151b59b 100644 --- a/.github/workflows/ci-pr-gatekeeper.yml +++ b/.github/workflows/ci-pr-gatekeeper.yml @@ -20,7 +20,7 @@ jobs: fetch-depth: "2" # To retrieve the preceding commit. 
- name: Get changed files using defaults id: changed-files - uses: tj-actions/changed-files@v29.0.1 + uses: tj-actions/changed-files@v29.0.3 - name: Determine changes id: touched run: | diff --git a/.github/workflows/ci-pytorch-test-conda.yml b/.github/workflows/ci-pytorch-test-conda.yml index 82c463a54169f..8f74a9ed96249 100644 --- a/.github/workflows/ci-pytorch-test-conda.yml +++ b/.github/workflows/ci-pytorch-test-conda.yml @@ -37,7 +37,7 @@ jobs: - name: Get changed files id: changed-files - uses: tj-actions/changed-files@v29.0.1 + uses: tj-actions/changed-files@v29.0.3 - name: Decide if the test should be skipped id: skip diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 987373b6ea2bf..3a532bec5b648 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -39,7 +39,7 @@ jobs: - name: Get changed files id: changed-files - uses: tj-actions/changed-files@v29.0.1 + uses: tj-actions/changed-files@v29.0.3 - name: Decide if the test should be skipped id: skip From e5395de9d3ae76e528f0a84856bb113d43094828 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 5 Sep 2022 19:13:08 +0200 Subject: [PATCH 060/193] Bump actions/checkout from 2 to 3 (#14540) Bumps [actions/checkout](https://github.com/actions/checkout) from 2 to 3. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/v2...v3) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/ci-app-cloud-e2e-test.yml | 2 +- .github/workflows/ci-app-examples.yml | 2 +- .github/workflows/ci-app-tests.yml | 2 +- .github/workflows/ci-lite-test-full.yml | 2 +- .github/workflows/ci-pkg-install.yml | 6 +++--- .github/workflows/ci-pytorch-test-conda.yml | 2 +- .github/workflows/ci-pytorch-test-full.yml | 2 +- .github/workflows/ci-pytorch-test-slow.yml | 2 +- .github/workflows/docs-checks.yml | 4 ++-- .github/workflows/docs-deploy.yml | 4 ++-- .github/workflows/events-nightly.yml | 2 +- .github/workflows/legacy-checkpoints.yml | 2 +- .github/workflows/release-docker.yml | 2 +- .github/workflows/release-pypi.yml | 14 +++++++------- 14 files changed, 24 insertions(+), 24 deletions(-) diff --git a/.github/workflows/ci-app-cloud-e2e-test.yml b/.github/workflows/ci-app-cloud-e2e-test.yml index c85e245696c23..7fe0d6b5c61e6 100644 --- a/.github/workflows/ci-app-cloud-e2e-test.yml +++ b/.github/workflows/ci-app-cloud-e2e-test.yml @@ -35,7 +35,7 @@ jobs: - commands_and_api timeout-minutes: 35 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: python-version: "3.8" diff --git a/.github/workflows/ci-app-examples.yml b/.github/workflows/ci-app-examples.yml index 22e8fc4454a37..0480322d2b39d 100644 --- a/.github/workflows/ci-app-examples.yml +++ b/.github/workflows/ci-app-examples.yml @@ -33,7 +33,7 @@ jobs: timeout-minutes: 10 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: diff --git a/.github/workflows/ci-app-tests.yml b/.github/workflows/ci-app-tests.yml 
index 91299fdc4e16b..4565cee2d36d2 100644 --- a/.github/workflows/ci-app-tests.yml +++ b/.github/workflows/ci-app-tests.yml @@ -30,7 +30,7 @@ jobs: timeout-minutes: 20 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: diff --git a/.github/workflows/ci-lite-test-full.yml b/.github/workflows/ci-lite-test-full.yml index 69c8a12069f26..896086b697d66 100644 --- a/.github/workflows/ci-lite-test-full.yml +++ b/.github/workflows/ci-lite-test-full.yml @@ -38,7 +38,7 @@ jobs: timeout-minutes: 40 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 diff --git a/.github/workflows/ci-pkg-install.yml b/.github/workflows/ci-pkg-install.yml index 7a83cd46ece61..7751dce429b3d 100644 --- a/.github/workflows/ci-pkg-install.yml +++ b/.github/workflows/ci-pkg-install.yml @@ -38,7 +38,7 @@ jobs: python-version: [3.8] # , 3.9 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} @@ -74,7 +74,7 @@ jobs: python-version: [3.8] # , 3.9 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} @@ -106,7 +106,7 @@ jobs: python-version: [3.8] # , 3.9 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/ci-pytorch-test-conda.yml b/.github/workflows/ci-pytorch-test-conda.yml index 8f74a9ed96249..64d06a22949d8 100644 --- a/.github/workflows/ci-pytorch-test-conda.yml +++ b/.github/workflows/ci-pytorch-test-conda.yml @@ -33,7 +33,7 @@ jobs: - name: Workaround for https://github.com/actions/checkout/issues/760 run: git config --global --add safe.directory /__w/lightning/lightning - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Get changed files id: changed-files diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 3a532bec5b648..fbdc81b91c0ed 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -35,7 +35,7 @@ jobs: timeout-minutes: 40 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Get changed files id: changed-files diff --git a/.github/workflows/ci-pytorch-test-slow.yml b/.github/workflows/ci-pytorch-test-slow.yml index f808ce561a5b1..091c3f606c3ca 100644 --- a/.github/workflows/ci-pytorch-test-slow.yml +++ b/.github/workflows/ci-pytorch-test-slow.yml @@ -32,7 +32,7 @@ jobs: timeout-minutes: 20 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: diff --git a/.github/workflows/docs-checks.yml b/.github/workflows/docs-checks.yml index 1de92849ed0c0..bea17f946a8ab 100644 --- a/.github/workflows/docs-checks.yml +++ b/.github/workflows/docs-checks.yml @@ -20,7 +20,7 @@ jobs: matrix: pkg: ["app", "pytorch"] # TODO: , "lit" steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: submodules: true - uses: actions/setup-python@v4 @@ -71,7 +71,7 @@ jobs: matrix: pkg: ["app", "pytorch", "lit"] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: submodules: true # lfs: true diff --git a/.github/workflows/docs-deploy.yml b/.github/workflows/docs-deploy.yml index 
f1df928fef569..d39f799859c71 100644 --- a/.github/workflows/docs-deploy.yml +++ b/.github/workflows/docs-deploy.yml @@ -9,8 +9,8 @@ jobs: runs-on: ubuntu-20.04 steps: - name: Checkout 🛎️ - uses: actions/checkout@v2 - # If you're using actions/checkout@v2 you must set persist-credentials to false in most cases for the deployment to work correctly. + uses: actions/checkout@v3 + # If you're using actions/checkout@v3 you must set persist-credentials to false in most cases for the deployment to work correctly. with: persist-credentials: false - uses: actions/setup-python@v4 diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml index 13d3895bf365d..2576b05e33566 100644 --- a/.github/workflows/events-nightly.yml +++ b/.github/workflows/events-nightly.yml @@ -15,7 +15,7 @@ jobs: steps: # does nightly releases from feature branch - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: python-version: 3.9 diff --git a/.github/workflows/legacy-checkpoints.yml b/.github/workflows/legacy-checkpoints.yml index 0856cfd3229a2..7a59b9446aab0 100644 --- a/.github/workflows/legacy-checkpoints.yml +++ b/.github/workflows/legacy-checkpoints.yml @@ -8,7 +8,7 @@ jobs: create-legacy-ckpts: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml index 2de330ea5ca75..67503ba2b2c0d 100644 --- a/.github/workflows/release-docker.yml +++ b/.github/workflows/release-docker.yml @@ -22,7 +22,7 @@ jobs: - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"} steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Get release version id: get_version diff --git a/.github/workflows/release-pypi.yml b/.github/workflows/release-pypi.yml index 97c3b8eca77d1..2c6f5da240f63 100644 --- a/.github/workflows/release-pypi.yml +++ b/.github/workflows/release-pypi.yml @@ -21,7 +21,7 @@ jobs: build-pkgs: ${{ steps.candidate.outputs.pkgs }} pull-pkgs: ${{ steps.download.outputs.pkgs }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: python-version: 3.9 @@ -60,7 +60,7 @@ jobs: max-parallel: 1 matrix: ${{ fromJSON(needs.releasing.outputs.build-pkgs) }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: actions/download-artifact@v3 with: name: dist-packages-${{ github.sha }} @@ -94,7 +94,7 @@ jobs: max-parallel: 1 matrix: ${{ fromJSON(needs.releasing.outputs.pull-pkgs) }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: actions/download-artifact@v3 with: name: pypi-packages-${{ github.sha }} @@ -118,7 +118,7 @@ jobs: needs: [build-package, download-package] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: actions/download-artifact@v3 with: name: dist-packages-${{ github.sha }} @@ -169,7 +169,7 @@ jobs: needs: build-meta-pkg if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: actions/download-artifact@v3 with: name: dist-packages-${{ github.sha }} @@ -188,7 +188,7 @@ jobs: needs: build-meta-pkg if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: actions/download-artifact@v3 with: name: dist-packages-${{ 
github.sha }} @@ -220,7 +220,7 @@ jobs: runs-on: ubuntu-20.04 needs: [build-package, publish-package] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: From 381600dcc328f639f307bbaa4a3efed5e49b6883 Mon Sep 17 00:00:00 2001 From: donlapark <10988155+donlapark@users.noreply.github.com> Date: Tue, 6 Sep 2022 01:09:20 +0700 Subject: [PATCH 061/193] fixes typing errors in auto_restart.py (#13904) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: otaj <6065855+otaj@users.noreply.github.com> Co-authored-by: rohitgr7 Co-authored-by: Carlos Mocholí --- pyproject.toml | 1 - .../utilities/auto_restart.py | 83 ++++++++++++------- src/pytorch_lightning/utilities/types.py | 15 ++-- 3 files changed, 63 insertions(+), 36 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index df89de3d092fc..5b62baf9ce6f3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,7 +56,6 @@ module = [ "pytorch_lightning.trainer.supporters", "pytorch_lightning.trainer.trainer", "pytorch_lightning.tuner.batch_size_scaling", - "pytorch_lightning.utilities.auto_restart", "pytorch_lightning.utilities.data", ] ignore_errors = "True" diff --git a/src/pytorch_lightning/utilities/auto_restart.py b/src/pytorch_lightning/utilities/auto_restart.py index 3877a1ab3944c..e90dcc7172690 100644 --- a/src/pytorch_lightning/utilities/auto_restart.py +++ b/src/pytorch_lightning/utilities/auto_restart.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from collections.abc import Sized from copy import deepcopy from dataclasses import dataclass, field from functools import partial, wraps @@ -24,6 +25,7 @@ DataLoader, IterableDataset, ) +from typing_extensions import TypedDict import pytorch_lightning as pl from pytorch_lightning.utilities.apply_func import apply_to_collection @@ -34,6 +36,21 @@ from pytorch_lightning.utilities.types import _Stateful +class _IteratorStateDict(TypedDict): + dataset_state: Dict[int, Any] + sampler_state: Dict[int, Any] + worker_id: int + num_workers: int + num_batches_fetched: int + name: Optional[str] + + +class _MergedIteratorStateDict(TypedDict): + state: Dict[str, Any] + latest_worker_id: int + represent_map_dataset: Optional[bool] + + class FastForwardSampler(Sampler): """This FastForwardSampler wraps a :class:`torch.utils.data.Sampler` and records the number of iterations performed during an epoch. @@ -45,7 +62,7 @@ class FastForwardSampler(Sampler): samples seen in the last iterations (for the current worker). """ - def __init__(self, sampler: Union[Sampler, Generator], attr_name: Optional[str] = None) -> None: + def __init__(self, sampler: Iterator, attr_name: Optional[str] = None) -> None: super().__init__(data_source=None) self._sampler = sampler self.restarting: bool = False @@ -79,7 +96,7 @@ def __iter__(self) -> Iterator[Any]: self._counter = 0 return self - def __next__(self): + def __next__(self) -> Any: # the `state dict` was cached as workers were unavailable before. 
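
For context on the annotations being tightened here: `FastForwardSampler` wraps an ordinary sampler, counts how many indices it has served, and exposes that progress through `state_dict`/`load_state_dict` so a restarted run can skip what was already consumed. A rough usage sketch, assuming a single-process `RandomSampler` and the per-worker state layout described in this file (the exact values are illustrative):

    from torch.utils.data import RandomSampler

    from pytorch_lightning.utilities.auto_restart import FastForwardSampler

    sampler = FastForwardSampler(RandomSampler(range(10)))
    it = iter(sampler)
    for _ in range(3):
        next(it)                        # consume a few indices

    state = sampler.state_dict()        # e.g. {0: {"current_iteration": 3}} for worker 0

    restored = FastForwardSampler(RandomSampler(range(10)))
    restored.load_state_dict(state)     # the next iteration fast-forwards past the consumed indices
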
if self._cached_state_dict is not None: self._load_non_random_state(self._cached_state_dict) @@ -109,6 +126,7 @@ def __next__(self): raise StopIteration def __len__(self) -> int: + assert isinstance(self._sampler, Sized) return len(self._sampler) def state_dict(self, num_batches_processed: Optional[int] = None) -> Dict[int, Dict[str, int]]: @@ -161,7 +179,7 @@ class IteratorState: name: Optional[str] = None @classmethod - def from_state_dict(cls, state_dict) -> "IteratorState": + def from_state_dict(cls, state_dict: _IteratorStateDict) -> "IteratorState": return cls(**state_dict) @@ -173,22 +191,22 @@ class MergedIteratorState: worker states in this merged iterator state. """ - state: Union[Dict[Union[int, str], Union[Dict[str, IteratorState], IteratorState]]] = field(default_factory=dict) + state: Dict = field(default_factory=dict) latest_worker_id: int = 0 represent_map_dataset: Optional[bool] = None def update(self, generator_name: Optional[str], new_state: IteratorState) -> None: # a map based dataset doesn't own a generator and therefore `generator_name` should be None. self.represent_map_dataset = generator_name is None - if self.represent_map_dataset: - state = self.state + latest_worker_id = new_state.worker_id + if generator_name is None: + self.state[latest_worker_id] = new_state else: if generator_name not in self.state: self.state[generator_name] = {} state = self.state[generator_name] + state[latest_worker_id] = new_state - latest_worker_id = new_state.worker_id - state[latest_worker_id] = new_state self.latest_worker_id = latest_worker_id @property @@ -202,7 +220,7 @@ def dataset_states(self) -> Dict[int, Any]: return {k: self.state[k].dataset_state[k] for k in self.state.keys()} @classmethod - def from_state_dict(cls, state_dict) -> "MergedIteratorState": + def from_state_dict(cls, state_dict: _MergedIteratorStateDict) -> "MergedIteratorState": if state_dict["represent_map_dataset"]: state_dict["state"] = { worker_id: IteratorState.from_state_dict(state) for worker_id, state in state_dict["state"].items() @@ -229,15 +247,15 @@ class CaptureMapDataset(Dataset): """ def __init__(self, dataset: Dataset) -> None: - self.dataset = dataset - self._cached_state_dict = None + self.dataset: Dataset = dataset + self._cached_state_dict: Optional[Dict[int, Any]] = None @property def worker_id(self) -> int: worker_info = get_worker_info() return worker_info.id if worker_info else 0 - def __getitem__(self, item) -> Tuple[Any, Dict[int, Dict]]: + def __getitem__(self, item: int) -> Tuple[Any, Dict[int, Dict]]: if self._cached_state_dict is not None: if self.worker_id in self._cached_state_dict: _set_rng_states(self._cached_state_dict[self.worker_id]["rng_states"]) @@ -246,6 +264,7 @@ def __getitem__(self, item) -> Tuple[Any, Dict[int, Dict]]: return self.dataset[item] def __len__(self) -> int: + assert isinstance(self.dataset, Sized) return len(self.dataset) def load_state_dict(self, state_dict: Dict[int, Any], latest_worker_id: int, num_workers: int) -> None: @@ -268,7 +287,7 @@ def __init__(self, dataset: IterableDataset) -> None: super().__init__() self.dataset = deepcopy(dataset) self.samplers: Optional[Dict[str, FastForwardSampler]] = None - self._state_dict: Optional[Dict[int, Any]] = None + self._state_dict: Optional[Dict[str, Any]] = None self._has_wrapped: bool = False @property @@ -276,9 +295,10 @@ def sampler(self) -> Sampler: return self.dataset.sampler def state_dict(self) -> Dict[str, Any]: + assert self.samplers is not None return {k: v.state_dict() for k, v in 
self.samplers.items()} - def load_state_dict(self, state_dict: Dict[int, Any]) -> None: + def load_state_dict(self, state_dict: Dict[str, Any]) -> None: self._state_dict = deepcopy(state_dict) def _wrap_generator_samplers(self) -> None: @@ -311,7 +331,7 @@ def _wrap_generator_samplers(self) -> None: self.reset_on_epoch() - def reset_on_epoch(self): + def reset_on_epoch(self) -> None: self._state_dict = None def __iter__(self) -> Iterator: @@ -371,8 +391,8 @@ def _cycle_to_next_worker_and_reset(dataloader: DataLoader, state_dict: Dict[str for _ in range(state_dict["previous_worker"] - 1): next(iter_dataloader._worker_queue_idx_cycle) - # we can finally call reset and apply prefecthing. - iter_dataloader._reset = iter_dataloader._original_reset + # we can finally call reset and apply prefetching. + iter_dataloader._reset = iter_dataloader._original_reset # type: ignore[assignment] iter_dataloader._reset(dataloader, first_iter=True) # return the iterator return iter_dataloader @@ -445,6 +465,7 @@ def wrapper() -> Any: ] elif isinstance(dataset, CaptureMapDataset): ff_sampler = _find_fast_forward_samplers(dl) + assert ff_sampler is not None state = [ IteratorState( num_workers=dl.num_workers, @@ -519,6 +540,7 @@ def _reload_dataloader_state_dict_automatic_map_dataset(dataloader: DataLoader, # reload sampler state ff_sampler = _find_fast_forward_samplers(dataloader) + assert ff_sampler is not None ff_sampler.load_state_dict(iterator_state.sampler_state) # reload dataset state @@ -610,18 +632,20 @@ def _rotate_worker_indices(state: Dict[int, Any], latest_worker_id: int, num_wor return {new_id: state[old_id] for old_id, new_id in old_to_new_worker_id_map if old_id in state} -class _StatefulDataLoaderIter: +class _StatefulDataLoaderIter(_BaseDataLoaderIter): """This mixin is used to make PyTorch DataLoaderIter stateful.""" - def __accumulate_state(self, sampler_state: Dict[str, Any]) -> None: + def __accumulate_state(self, sampler_state: Dict[int, Any]) -> None: # store sampler state within a queue alongside its idx. 
- self._sampler_state_idx = getattr(self, "_sampler_state_idx", 0) + 1 + self._sampler_state_idx: int = getattr(self, "_sampler_state_idx", 0) + 1 self._sampler_state.append((sampler_state, self._sampler_state_idx)) def _store_sampler_state(self) -> None: """This function is used to extract the sampler states if any.""" - sampler_state = { - k: v.state_dict() for k, v in self._loader.__dict__.items() if isinstance(v, _Stateful) and k != "dataset" + sampler_state: Dict[int, Any] = { + k: v.state_dict() # type: ignore[misc] + for k, v in self._loader.__dict__.items() + if isinstance(v, _Stateful) and k != "dataset" } self.__accumulate_state(sampler_state) @@ -630,12 +654,12 @@ def _next_index(self) -> Any: self._store_sampler_state() return indexes - def _prepare_loader(self, loader): + def _prepare_loader(self, loader: DataLoader) -> None: _add_capture_metadata_collate(loader) self._loader = loader self._data_fetcher: "pl.utilities.fetching.AbstractDataFetcher" = loader._lightning_fetcher self.num_batches_fetched = 0 - self._sampler_state = [] + self._sampler_state: List[Tuple[Dict[int, Any], int]] = [] self._sampler_state_idx = 0 def __del__(self) -> None: @@ -680,7 +704,7 @@ def __init__(self, loader: DataLoader): super().__init__(loader) -def _get_iterator(self) -> "_BaseDataLoaderIter": +def _get_iterator(self: DataLoader) -> "_BaseDataLoaderIter": if not hasattr(self, "_lightning_fetcher"): raise MisconfigurationException( "A stateful iterator should be used only when a DataFetcher has been attached to the DataLoader." @@ -699,7 +723,7 @@ def _patch_dataloader_get_iterators() -> None: return if not hasattr(DataLoader, "_ori_get_iterator"): DataLoader._ori_get_iterator = DataLoader._get_iterator - DataLoader._get_iterator = _get_iterator + DataLoader._get_iterator = _get_iterator # type: ignore[assignment] def _teardown_dataloader_get_iterators() -> None: @@ -707,7 +731,7 @@ def _teardown_dataloader_get_iterators() -> None: # cleanup the get_iterator replacement in case of Fault-tolerance. 
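
`_patch_dataloader_get_iterators` and the teardown helper around this hunk follow the usual save-and-restore monkey-patching pattern: the original `DataLoader._get_iterator` is stashed once under `_ori_get_iterator`, swapped for the stateful variant, and later put back. A generic sketch of that pattern with an illustrative class (not a Lightning API):

    class Widget:
        def get(self) -> str:
            return "original"


    def _patched_get(self: Widget) -> str:
        return "patched"


    def patch_widget() -> None:
        if not hasattr(Widget, "_ori_get"):
            Widget._ori_get = Widget.get       # stash the original exactly once
        Widget.get = _patched_get


    def teardown_widget() -> None:
        original = getattr(Widget, "_ori_get", None)
        if original is not None:
            Widget.get = original              # restore the original implementation
            del Widget._ori_get


    patch_widget()
    assert Widget().get() == "patched"
    teardown_widget()
    assert Widget().get() == "original"
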
get_iterator = getattr(DataLoader, "_ori_get_iterator", None) if get_iterator: - DataLoader._get_iterator = get_iterator + DataLoader._get_iterator = get_iterator # type: ignore[assignment] del DataLoader._ori_get_iterator @@ -781,16 +805,17 @@ def flatten_dataloader(dataloader: Union[DataLoader, CycleIterator, Iterable]) - raise ValueError("Fault-tolerance supports only a single dataloader.") for dataloader in dl_loaders: + assert isinstance(dataloader, DataLoader) validator_fn = ( _validate_iterable_dataset if isinstance(dataloader.dataset, IterableDataset) else _validate_map_dataset ) validator_fn(dataloader) -def _collect_states_on_rank_zero_over_collection(state_dict: Any, key: str = "state") -> Any: +def _collect_states_on_rank_zero_over_collection(state_dict: Dict, key: str = "state") -> Dict: """This utility collects the state across processes for a collection of state.""" - def fn(state: Dict): + def fn(state: Dict) -> Dict: if key in state: return _collect_states_on_rank_zero(state) return {k: apply_to_collection(v, Dict, fn) for k, v in state.items()} diff --git a/src/pytorch_lightning/utilities/types.py b/src/pytorch_lightning/utilities/types.py index 7ab3d6948854c..c90657b34e868 100644 --- a/src/pytorch_lightning/utilities/types.py +++ b/src/pytorch_lightning/utilities/types.py @@ -20,7 +20,7 @@ from contextlib import contextmanager from dataclasses import dataclass from pathlib import Path -from typing import Any, Callable, Dict, Generator, Iterator, List, Mapping, Optional, Sequence, Type, Union +from typing import Any, Callable, Dict, Generator, Iterator, List, Mapping, Optional, Sequence, Type, TypeVar, Union import torch from torch import Tensor @@ -90,21 +90,24 @@ def predict_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: ... +_DictKey = TypeVar("_DictKey") + + @runtime_checkable -class _Stateful(Protocol): +class _Stateful(Protocol[_DictKey]): """This class is used to detect if an object is stateful using `isinstance(obj, _Stateful)`.""" - def state_dict(self) -> Dict[str, Any]: + def state_dict(self) -> Dict[_DictKey, Any]: ... - def load_state_dict(self, state_dict: Dict[str, Any]) -> None: + def load_state_dict(self, state_dict: Dict[_DictKey, Any]) -> None: ... 
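
The reworked `_Stateful` protocol is both generic (parameterised by the key type of its state dict) and runtime checkable, so structural `isinstance` checks such as the one in `_store_sampler_state` above keep working. A small self-contained sketch of that behaviour, using an illustrative `Counter` class that is not part of this patch:

    from typing import Any, Dict, TypeVar

    from typing_extensions import Protocol, runtime_checkable

    _DictKey = TypeVar("_DictKey")


    @runtime_checkable
    class Stateful(Protocol[_DictKey]):
        def state_dict(self) -> Dict[_DictKey, Any]:
            ...

        def load_state_dict(self, state_dict: Dict[_DictKey, Any]) -> None:
            ...


    class Counter:  # satisfies the protocol structurally; no inheritance needed
        def __init__(self) -> None:
            self.count = 0

        def state_dict(self) -> Dict[str, Any]:
            return {"count": self.count}

        def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
            self.count = state_dict["count"]


    assert isinstance(Counter(), Stateful)  # runtime check only verifies that the method names exist
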
# Inferred from `torch.optim.lr_scheduler.pyi` # Missing attributes were added to improve typing @runtime_checkable -class _LRScheduler(_Stateful, Protocol): +class _LRScheduler(_Stateful[str], Protocol): optimizer: Optimizer base_lrs: List[float] @@ -118,7 +121,7 @@ def step(self, epoch: Optional[int] = None) -> None: # Inferred from `torch.optim.lr_scheduler.pyi` # Missing attributes were added to improve typing @runtime_checkable -class ReduceLROnPlateau(_Stateful, Protocol): +class ReduceLROnPlateau(_Stateful[str], Protocol): in_cooldown: bool optimizer: Optimizer From ea3d62d4753fff866513b159a2f0a9cd82820aa9 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 5 Sep 2022 20:23:53 +0200 Subject: [PATCH 062/193] Setup: add requirement freeze for next major version (#14480) --- .actions/setup_tools.py | 108 +++++++++++++++++++++-------- setup.py | 2 +- src/lightning/__setup__.py | 2 +- src/lightning_app/__setup__.py | 6 +- src/pytorch_lightning/__setup__.py | 6 +- 5 files changed, 90 insertions(+), 34 deletions(-) diff --git a/.actions/setup_tools.py b/.actions/setup_tools.py index 0b84cec001e6d..3706dda307a84 100644 --- a/.actions/setup_tools.py +++ b/.actions/setup_tools.py @@ -21,11 +21,14 @@ import tempfile import urllib.request from datetime import datetime +from distutils.version import LooseVersion from importlib.util import module_from_spec, spec_from_file_location from itertools import chain, groupby from types import ModuleType from typing import List +from pkg_resources import parse_requirements + _PROJECT_ROOT = os.path.dirname(os.path.dirname(__file__)) _PACKAGE_MAPPING = {"pytorch": "pytorch_lightning", "app": "lightning_app"} @@ -42,45 +45,92 @@ def _load_py_module(name: str, location: str) -> ModuleType: return py +def _augment_requirement(ln: str, comment_char: str = "#", unfreeze: str = "all") -> str: + """Adjust the upper version contrains. 
+ + Args: + ln: raw line from requirement + comment_char: charter marking comment + unfreeze: Enum or "all"|"major"|"" + + Returns: + adjusted requirement + + >>> _augment_requirement("arrow>=1.2.0, <=1.2.2 # anything", unfreeze="") + 'arrow>=1.2.0, <=1.2.2' + >>> _augment_requirement("arrow>=1.2.0, <=1.2.2 # strict", unfreeze="") + 'arrow>=1.2.0, <=1.2.2 # strict' + >>> _augment_requirement("arrow>=1.2.0, <=1.2.2 # my name", unfreeze="all") + 'arrow>=1.2.0' + >>> _augment_requirement("arrow>=1.2.0, <=1.2.2 # strict", unfreeze="all") + 'arrow>=1.2.0, <=1.2.2 # strict' + >>> _augment_requirement("arrow", unfreeze="all") + 'arrow' + >>> _augment_requirement("arrow>=1.2.0, <=1.2.2 # cool", unfreeze="major") + 'arrow>=1.2.0, <2.0 # strict' + >>> _augment_requirement("arrow>=1.2.0, <=1.2.2 # strict", unfreeze="major") + 'arrow>=1.2.0, <=1.2.2 # strict' + >>> _augment_requirement("arrow>=1.2.0", unfreeze="major") + 'arrow>=1.2.0, <2.0 # strict' + >>> _augment_requirement("arrow", unfreeze="major") + 'arrow' + """ + # filer all comments + if comment_char in ln: + comment = ln[ln.index(comment_char) :] + ln = ln[: ln.index(comment_char)] + is_strict = "strict" in comment + else: + is_strict = False + req = ln.strip() + # skip directly installed dependencies + if not req or req.startswith("http") or "@http" in req: + return "" + # extract the major version from all listed versions + if unfreeze == "major": + req_ = list(parse_requirements([req]))[0] + vers = [LooseVersion(v) for s, v in req_.specs if s not in ("==", "~=")] + ver_major = sorted(vers)[-1].version[0] if vers else None + else: + ver_major = None + + # remove version restrictions unless they are strict + if unfreeze and "<" in req and not is_strict: + req = re.sub(r",? *<=? *[\d\.\*]+", "", req).strip() + if ver_major is not None and not is_strict: + # add , only if there are already some versions + req += f"{',' if any(c in req for c in '<=>') else ''} <{int(ver_major) + 1}.0" + + # adding strict back to the comment + if is_strict or ver_major is not None: + req += " # strict" + + return req + + def load_requirements( - path_dir: str, file_name: str = "base.txt", comment_char: str = "#", unfreeze: bool = True + path_dir: str, file_name: str = "base.txt", comment_char: str = "#", unfreeze: str = "all" ) -> List[str]: """Loading requirements from a file. >>> path_req = os.path.join(_PROJECT_ROOT, "requirements") - >>> load_requirements(path_req) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE - ['numpy...', 'torch...', ...] + >>> load_requirements(path_req, unfreeze="major") # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + ['pytorch_lightning...', 'lightning_app...'] """ with open(os.path.join(path_dir, file_name)) as file: lines = [ln.strip() for ln in file.readlines()] reqs = [] for ln in lines: - # filer all comments - comment = "" - if comment_char in ln: - comment = ln[ln.index(comment_char) :] - ln = ln[: ln.index(comment_char)] - req = ln.strip() - # skip directly installed dependencies - if not req or req.startswith("http") or "@http" in req: - continue - # remove version restrictions unless they are strict - if unfreeze and "<" in req and "strict" not in comment: - req = re.sub(r",? *<=? 
*[\d\.\*]+", "", req).strip() - - # adding strict back to the comment - if "strict" in comment: - req += " # strict" - - reqs.append(req) - return reqs + reqs.append(_augment_requirement(ln, comment_char=comment_char, unfreeze=unfreeze)) + # filter empty lines + return [str(req) for req in reqs if req] def load_readme_description(path_dir: str, homepage: str, version: str) -> str: """Load readme as decribtion. >>> load_readme_description(_PROJECT_ROOT, "", "") # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE - '
...' + '...' """ path_readme = os.path.join(path_dir, "README.md") text = open(path_readme, encoding="utf-8").read() @@ -439,12 +489,14 @@ def _download_frontend(root: str = _PROJECT_ROOT): print("The Lightning UI downloading has failed!") -def _adjust_require_versions(source_dir: str = "src", req_dir: str = "requirements") -> None: - """Parse the base requirements and append as version adjustments if needed `pkg>=X1.Y1.Z1,==X2.Y2.*`.""" +def _relax_require_versions(source_dir: str = "src", req_dir: str = "requirements") -> None: + """Parse the base requirements and append as version adjustments if needed `pkg>=X1.Y1.Z1,==X2.Y2.*`. + + >>> _relax_require_versions("../src", "../requirements") + """ reqs = load_requirements(req_dir, file_name="base.txt") - for i, req in enumerate(reqs): - pkg_name = req[: min(req.index(c) for c in ">=" if c in req)] - ver_ = parse_version_from_file(os.path.join(source_dir, pkg_name)) + for i, req in enumerate(parse_requirements(reqs)): + ver_ = parse_version_from_file(os.path.join(source_dir, req.name)) if not ver_: continue ver2 = ".".join(ver_.split(".")[:2] + ["*"]) diff --git a/setup.py b/setup.py index 3048f8a1aed4e..d558a24e0bdac 100755 --- a/setup.py +++ b/setup.py @@ -95,7 +95,7 @@ def _load_py_module(name: str, location: str) -> ModuleType: _SETUP_TOOLS = _load_py_module(name="setup_tools", location=os.path.join(".actions", "setup_tools.py")) if _PACKAGE_NAME == "lightning": # install just the meta package - _SETUP_TOOLS._adjust_require_versions(_PATH_SRC, _PATH_REQUIRE) + _SETUP_TOOLS._relax_require_versions(_PATH_SRC, _PATH_REQUIRE) elif _PACKAGE_NAME not in _PACKAGE_MAPPING: # install everything _SETUP_TOOLS._load_aggregate_requirements(_PATH_REQUIRE, _FREEZE_REQUIREMENTS) diff --git a/src/lightning/__setup__.py b/src/lightning/__setup__.py index e3ada2d7e93df..cc3a4f8d11e78 100644 --- a/src/lightning/__setup__.py +++ b/src/lightning/__setup__.py @@ -85,7 +85,7 @@ def _setup_args(**kwargs: Any) -> Dict[str, Any]: ], }, setup_requires=[], - install_requires=_SETUP_TOOLS.load_requirements(_PATH_REQUIREMENTS, unfreeze=True), + install_requires=_SETUP_TOOLS.load_requirements(_PATH_REQUIREMENTS, unfreeze="all"), extras_require={}, # todo: consider porting all other packages extras with prefix project_urls={ "Bug Tracker": "https://github.com/Lightning-AI/lightning/issues", diff --git a/src/lightning_app/__setup__.py b/src/lightning_app/__setup__.py index 9fe01a0ebe0e5..9fd5a5d969bbf 100644 --- a/src/lightning_app/__setup__.py +++ b/src/lightning_app/__setup__.py @@ -28,7 +28,7 @@ def _prepare_extras(**kwargs: Any) -> Dict[str, Any]: # Define package extras. These are only installed if you specify them. 
# From remote, use like `pip install pytorch-lightning[dev, docs]` # From local copy of repo, use like `pip install ".[dev, docs]"` - common_args = dict(path_dir=_PATH_REQUIREMENTS, unfreeze=not _FREEZE_REQUIREMENTS) + common_args = dict(path_dir=_PATH_REQUIREMENTS, unfreeze="major" if _FREEZE_REQUIREMENTS else "all") extras = { # 'docs': load_requirements(file_name='docs.txt'), "cloud": _setup_tools.load_requirements(file_name="cloud.txt", **common_args), @@ -95,7 +95,9 @@ def _setup_args(**__: Any) -> Dict[str, Any]: ], }, setup_requires=["wheel"], - install_requires=_setup_tools.load_requirements(_PATH_REQUIREMENTS, unfreeze=not _FREEZE_REQUIREMENTS), + install_requires=_setup_tools.load_requirements( + _PATH_REQUIREMENTS, unfreeze="major" if _FREEZE_REQUIREMENTS else "all" + ), extras_require=_prepare_extras(), project_urls={ "Bug Tracker": "https://github.com/Lightning-AI/lightning/issues", diff --git a/src/pytorch_lightning/__setup__.py b/src/pytorch_lightning/__setup__.py index 8085574a5d2ff..1bbf2a2735789 100644 --- a/src/pytorch_lightning/__setup__.py +++ b/src/pytorch_lightning/__setup__.py @@ -29,7 +29,7 @@ def _prepare_extras(**kwargs: Any) -> Dict[str, Any]: # Define package extras. These are only installed if you specify them. # From remote, use like `pip install pytorch-lightning[dev, docs]` # From local copy of repo, use like `pip install ".[dev, docs]"` - common_args = dict(path_dir=_PATH_REQUIREMENTS, unfreeze=not _FREEZE_REQUIREMENTS) + common_args = dict(path_dir=_PATH_REQUIREMENTS, unfreeze="" if _FREEZE_REQUIREMENTS else "all") extras = { # 'docs': load_requirements(file_name='docs.txt'), "examples": _setup_tools.load_requirements(file_name="examples.txt", **common_args), @@ -99,7 +99,9 @@ def _setup_args(**__: Any) -> Dict[str, Any]: keywords=["deep learning", "pytorch", "AI"], python_requires=">=3.7", setup_requires=[], - install_requires=_setup_tools.load_requirements(_PATH_REQUIREMENTS, unfreeze=not _FREEZE_REQUIREMENTS), + install_requires=_setup_tools.load_requirements( + _PATH_REQUIREMENTS, unfreeze="" if _FREEZE_REQUIREMENTS else "all" + ), extras_require=_prepare_extras(), project_urls={ "Bug Tracker": "https://github.com/Lightning-AI/lightning/issues", From 9fea2ed9d5356feb6e7aec853b5337ddb4019972 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sat, 3 Sep 2022 14:57:48 +0200 Subject: [PATCH 063/193] move pl/utilities/apply_func.py to pl/utilities/apply_func.py (#14516) --- .../utilities/apply_func.py | 0 src/pytorch_lightning/callbacks/pruning.py | 2 +- src/pytorch_lightning/core/module.py | 2 +- src/pytorch_lightning/core/saving.py | 2 +- src/pytorch_lightning/lite/lite.py | 2 +- src/pytorch_lightning/lite/wrappers.py | 2 +- src/pytorch_lightning/loops/dataloader/evaluation_loop.py | 2 +- src/pytorch_lightning/loops/epoch/prediction_epoch_loop.py | 2 +- src/pytorch_lightning/loops/epoch/training_epoch_loop.py | 2 +- src/pytorch_lightning/overrides/data_parallel.py | 2 +- src/pytorch_lightning/plugins/io/hpu_plugin.py | 2 +- src/pytorch_lightning/plugins/io/xla_plugin.py | 2 +- src/pytorch_lightning/plugins/precision/double.py | 2 +- src/pytorch_lightning/strategies/deepspeed.py | 2 +- src/pytorch_lightning/strategies/dp.py | 2 +- src/pytorch_lightning/strategies/ipu.py | 2 +- src/pytorch_lightning/strategies/launchers/multiprocessing.py | 2 +- src/pytorch_lightning/strategies/launchers/xla.py | 2 +- src/pytorch_lightning/strategies/strategy.py | 2 +- src/pytorch_lightning/strategies/tpu_spawn.py | 2 +- 
src/pytorch_lightning/trainer/connectors/data_connector.py | 2 +- .../trainer/connectors/logger_connector/logger_connector.py | 2 +- .../trainer/connectors/logger_connector/result.py | 2 +- src/pytorch_lightning/trainer/supporters.py | 2 +- src/pytorch_lightning/trainer/trainer.py | 2 +- src/pytorch_lightning/utilities/__init__.py | 2 +- src/pytorch_lightning/utilities/auto_restart.py | 2 +- src/pytorch_lightning/utilities/data.py | 2 +- src/pytorch_lightning/utilities/fetching.py | 2 +- src/pytorch_lightning/utilities/memory.py | 2 +- src/pytorch_lightning/utilities/metrics.py | 2 +- src/pytorch_lightning/utilities/optimizer.py | 2 +- .../{tests_pytorch => tests_lite}/utilities/test_apply_func.py | 2 +- tests/tests_pytorch/lite/test_parity.py | 2 +- tests/tests_pytorch/trainer/test_supporters.py | 2 +- 35 files changed, 34 insertions(+), 34 deletions(-) rename src/{pytorch_lightning => lightning_lite}/utilities/apply_func.py (100%) rename tests/{tests_pytorch => tests_lite}/utilities/test_apply_func.py (99%) diff --git a/src/pytorch_lightning/utilities/apply_func.py b/src/lightning_lite/utilities/apply_func.py similarity index 100% rename from src/pytorch_lightning/utilities/apply_func.py rename to src/lightning_lite/utilities/apply_func.py diff --git a/src/pytorch_lightning/callbacks/pruning.py b/src/pytorch_lightning/callbacks/pruning.py index 878fe674b85a1..14fc1acd424e3 100644 --- a/src/pytorch_lightning/callbacks/pruning.py +++ b/src/pytorch_lightning/callbacks/pruning.py @@ -26,9 +26,9 @@ from typing_extensions import TypedDict import pytorch_lightning as pl +from lightning_lite.utilities.apply_func import apply_to_collection from pytorch_lightning.callbacks.callback import Callback from pytorch_lightning.core.module import LightningModule -from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.rank_zero import rank_zero_debug, rank_zero_only diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py index 87eddc0eaef62..5ee5b36d8b9f6 100644 --- a/src/pytorch_lightning/core/module.py +++ b/src/pytorch_lightning/core/module.py @@ -32,6 +32,7 @@ from typing_extensions import Literal import pytorch_lightning as pl +from lightning_lite.utilities.apply_func import apply_to_collection, convert_to_tensors from lightning_lite.utilities.cloud_io import get_filesystem from lightning_lite.utilities.device_dtype_mixin import DeviceDtypeModuleMixin from pytorch_lightning.callbacks.callback import Callback @@ -42,7 +43,6 @@ from pytorch_lightning.loggers import Logger, LoggerCollection from pytorch_lightning.trainer.connectors.logger_connector.fx_validator import _FxValidator from pytorch_lightning.utilities import _IS_WINDOWS, _TORCH_GREATER_EQUAL_1_10, GradClipAlgorithmType -from pytorch_lightning.utilities.apply_func import apply_to_collection, convert_to_tensors from pytorch_lightning.utilities.distributed import distributed_available, sync_ddp from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_11, _TORCH_GREATER_EQUAL_1_13 diff --git a/src/pytorch_lightning/core/saving.py b/src/pytorch_lightning/core/saving.py index 9ebd5efc7b891..5b2f54114e404 100644 --- a/src/pytorch_lightning/core/saving.py +++ b/src/pytorch_lightning/core/saving.py @@ -26,10 +26,10 @@ import yaml import pytorch_lightning as pl +from lightning_lite.utilities.apply_func 
import apply_to_collection from lightning_lite.utilities.cloud_io import get_filesystem from lightning_lite.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE, AttributeDict -from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.migration import pl_legacy_patch from pytorch_lightning.utilities.parsing import parse_class_init_keys from pytorch_lightning.utilities.rank_zero import rank_zero_warn diff --git a/src/pytorch_lightning/lite/lite.py b/src/pytorch_lightning/lite/lite.py index ca45a4011fcdd..b87e690ec8862 100644 --- a/src/pytorch_lightning/lite/lite.py +++ b/src/pytorch_lightning/lite/lite.py @@ -24,6 +24,7 @@ from torch.optim import Optimizer from torch.utils.data import BatchSampler, DataLoader, DistributedSampler +from lightning_lite.utilities.apply_func import apply_to_collection, convert_to_tensors from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.lite.wrappers import _LiteDataLoader, _LiteModule, _LiteOptimizer from pytorch_lightning.overrides.distributed import DistributedSamplerWrapper @@ -32,7 +33,6 @@ from pytorch_lightning.strategies.strategy import TBroadcast from pytorch_lightning.trainer.connectors.accelerator_connector import AcceleratorConnector from pytorch_lightning.utilities import _AcceleratorType, _StrategyType, move_data_to_device -from pytorch_lightning.utilities.apply_func import apply_to_collection, convert_to_tensors from pytorch_lightning.utilities.data import ( _auto_add_worker_init_fn, _replace_dunder_methods, diff --git a/src/pytorch_lightning/lite/wrappers.py b/src/pytorch_lightning/lite/wrappers.py index 2675f87b9442a..29a2cffe931a0 100644 --- a/src/pytorch_lightning/lite/wrappers.py +++ b/src/pytorch_lightning/lite/wrappers.py @@ -19,10 +19,10 @@ from torch.optim import Optimizer from torch.utils.data import DataLoader +from lightning_lite.utilities.apply_func import apply_to_collection, move_data_to_device from lightning_lite.utilities.device_dtype_mixin import DeviceDtypeModuleMixin from pytorch_lightning.plugins import PrecisionPlugin from pytorch_lightning.strategies import Strategy -from pytorch_lightning.utilities.apply_func import apply_to_collection, move_data_to_device def _do_nothing_closure() -> None: diff --git a/src/pytorch_lightning/loops/dataloader/evaluation_loop.py b/src/pytorch_lightning/loops/dataloader/evaluation_loop.py index 5760dd63a0d57..d041d371ddfaa 100644 --- a/src/pytorch_lightning/loops/dataloader/evaluation_loop.py +++ b/src/pytorch_lightning/loops/dataloader/evaluation_loop.py @@ -21,6 +21,7 @@ from torch.utils.data.dataloader import DataLoader import pytorch_lightning as pl +from lightning_lite.utilities.apply_func import apply_to_collection from pytorch_lightning.accelerators import CUDAAccelerator from pytorch_lightning.callbacks.progress.rich_progress import _RICH_AVAILABLE from pytorch_lightning.loops.dataloader import DataLoaderLoop @@ -28,7 +29,6 @@ from pytorch_lightning.loops.utilities import _set_sampler_epoch from pytorch_lightning.trainer.connectors.logger_connector.result import _OUT_DICT, _ResultCollection from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.fetching import ( AbstractDataFetcher, diff --git a/src/pytorch_lightning/loops/epoch/prediction_epoch_loop.py 
b/src/pytorch_lightning/loops/epoch/prediction_epoch_loop.py index 39717929787bb..cd47f31870062 100644 --- a/src/pytorch_lightning/loops/epoch/prediction_epoch_loop.py +++ b/src/pytorch_lightning/loops/epoch/prediction_epoch_loop.py @@ -3,10 +3,10 @@ import torch +from lightning_lite.utilities.apply_func import move_data_to_device from pytorch_lightning.loops.loop import Loop from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper from pytorch_lightning.trainer.progress import Progress -from pytorch_lightning.utilities.apply_func import move_data_to_device from pytorch_lightning.utilities.warnings import WarningCache warning_cache = WarningCache() diff --git a/src/pytorch_lightning/loops/epoch/training_epoch_loop.py b/src/pytorch_lightning/loops/epoch/training_epoch_loop.py index 6be4956b9bfd5..edc020cd72c3d 100644 --- a/src/pytorch_lightning/loops/epoch/training_epoch_loop.py +++ b/src/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -19,6 +19,7 @@ import torch import pytorch_lightning as pl +from lightning_lite.utilities.apply_func import apply_to_collection from pytorch_lightning import loops # import as loops to avoid circular imports from pytorch_lightning.loops.batch import TrainingBatchLoop from pytorch_lightning.loops.batch.training_batch_loop import _OUTPUTS_TYPE as _BATCH_OUTPUTS_TYPE @@ -26,7 +27,6 @@ from pytorch_lightning.trainer.connectors.logger_connector.result import _ResultCollection from pytorch_lightning.trainer.progress import BatchProgress, SchedulerProgress from pytorch_lightning.trainer.supporters import CombinedLoader -from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.auto_restart import _collect_states_on_rank_zero_over_collection from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.fetching import AbstractDataFetcher, DataLoaderIterDataFetcher diff --git a/src/pytorch_lightning/overrides/data_parallel.py b/src/pytorch_lightning/overrides/data_parallel.py index b296d1d8697f4..f3feb95f5eea8 100644 --- a/src/pytorch_lightning/overrides/data_parallel.py +++ b/src/pytorch_lightning/overrides/data_parallel.py @@ -19,8 +19,8 @@ from torch import Tensor import pytorch_lightning as pl +from lightning_lite.utilities.apply_func import apply_to_collection from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase -from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.rank_zero import rank_zero_warn diff --git a/src/pytorch_lightning/plugins/io/hpu_plugin.py b/src/pytorch_lightning/plugins/io/hpu_plugin.py index b2a6893848309..59dfa93219413 100644 --- a/src/pytorch_lightning/plugins/io/hpu_plugin.py +++ b/src/pytorch_lightning/plugins/io/hpu_plugin.py @@ -17,9 +17,9 @@ import torch +from lightning_lite.utilities.apply_func import move_data_to_device from lightning_lite.utilities.cloud_io import atomic_save, get_filesystem from pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO -from pytorch_lightning.utilities.apply_func import move_data_to_device from pytorch_lightning.utilities.types import _PATH diff --git a/src/pytorch_lightning/plugins/io/xla_plugin.py b/src/pytorch_lightning/plugins/io/xla_plugin.py index 6593843b37ef1..9430ee5a8d176 100644 --- a/src/pytorch_lightning/plugins/io/xla_plugin.py +++ b/src/pytorch_lightning/plugins/io/xla_plugin.py @@ -14,10 +14,10 @@ import os from typing import Any, Dict, Optional +from 
lightning_lite.utilities.apply_func import apply_to_collection from lightning_lite.utilities.cloud_io import get_filesystem from pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE, _TPU_AVAILABLE -from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.types import _PATH if _TPU_AVAILABLE: diff --git a/src/pytorch_lightning/plugins/precision/double.py b/src/pytorch_lightning/plugins/precision/double.py index 814ff659a9dc9..5f124e8f997fb 100644 --- a/src/pytorch_lightning/plugins/precision/double.py +++ b/src/pytorch_lightning/plugins/precision/double.py @@ -20,9 +20,9 @@ from torch.optim import Optimizer import pytorch_lightning as pl +from lightning_lite.utilities.apply_func import apply_to_collection from pytorch_lightning.overrides.base import _LightningPrecisionModuleWrapperBase from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin -from pytorch_lightning.utilities.apply_func import apply_to_collection class LightningDoublePrecisionModule(_LightningPrecisionModuleWrapperBase): diff --git a/src/pytorch_lightning/strategies/deepspeed.py b/src/pytorch_lightning/strategies/deepspeed.py index 4a70eb983fd86..2fb4deeb76dd9 100644 --- a/src/pytorch_lightning/strategies/deepspeed.py +++ b/src/pytorch_lightning/strategies/deepspeed.py @@ -27,6 +27,7 @@ from torch.optim import Optimizer import pytorch_lightning as pl +from lightning_lite.utilities.apply_func import apply_to_collection from pytorch_lightning.accelerators.cuda import CUDAAccelerator from pytorch_lightning.core.optimizer import _init_optimizers_and_lr_schedulers from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase @@ -36,7 +37,6 @@ from pytorch_lightning.strategies.utils import _fp_to_half from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities import GradClipAlgorithmType -from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.distributed import ( _get_process_group_backend_from_env, get_default_process_group_backend_for_device, diff --git a/src/pytorch_lightning/strategies/dp.py b/src/pytorch_lightning/strategies/dp.py index 5ab5021b8ac50..a144630fb0fd1 100644 --- a/src/pytorch_lightning/strategies/dp.py +++ b/src/pytorch_lightning/strategies/dp.py @@ -18,13 +18,13 @@ from torch.nn import DataParallel, Module import pytorch_lightning as pl +from lightning_lite.utilities.apply_func import apply_to_collection from pytorch_lightning.overrides.base import _LightningPrecisionModuleWrapperBase from pytorch_lightning.overrides.data_parallel import LightningParallelModule from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.parallel import ParallelStrategy from pytorch_lightning.strategies.strategy import TBroadcast, TReduce -from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.distributed import ReduceOp from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.types import STEP_OUTPUT diff --git a/src/pytorch_lightning/strategies/ipu.py b/src/pytorch_lightning/strategies/ipu.py index 05078ef8c57a1..de3b2877528df 100644 --- a/src/pytorch_lightning/strategies/ipu.py +++ b/src/pytorch_lightning/strategies/ipu.py @@ -20,6 +20,7 @@ from 
torch.utils.data import DataLoader, Sampler import pytorch_lightning as pl +from lightning_lite.utilities.apply_func import apply_to_collection from lightning_lite.utilities.cloud_io import get_filesystem from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment @@ -30,7 +31,6 @@ from pytorch_lightning.strategies.utils import _fp_to_half from pytorch_lightning.trainer.states import RunningStage, TrainerFn from pytorch_lightning.utilities import _IPU_AVAILABLE, _POPTORCH_AVAILABLE, rank_zero_warn -from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.data import _get_dataloader_init_args_and_kwargs, _reinstantiate_wrapped_cls from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/src/pytorch_lightning/strategies/launchers/multiprocessing.py b/src/pytorch_lightning/strategies/launchers/multiprocessing.py index cee3ed5ee7b2a..90a32d7a5a5e5 100644 --- a/src/pytorch_lightning/strategies/launchers/multiprocessing.py +++ b/src/pytorch_lightning/strategies/launchers/multiprocessing.py @@ -25,10 +25,10 @@ from typing_extensions import Literal import pytorch_lightning as pl +from lightning_lite.utilities.apply_func import apply_to_collection, move_data_to_device from pytorch_lightning.strategies.launchers.base import _Launcher from pytorch_lightning.strategies.strategy import Strategy from pytorch_lightning.trainer.states import TrainerFn, TrainerState -from pytorch_lightning.utilities.apply_func import apply_to_collection, move_data_to_device from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_11 from pytorch_lightning.utilities.rank_zero import rank_zero_debug from pytorch_lightning.utilities.seed import _collect_rng_states, _set_rng_states diff --git a/src/pytorch_lightning/strategies/launchers/xla.py b/src/pytorch_lightning/strategies/launchers/xla.py index 064d952f71a8f..c5d8ac8a6afc6 100644 --- a/src/pytorch_lightning/strategies/launchers/xla.py +++ b/src/pytorch_lightning/strategies/launchers/xla.py @@ -21,6 +21,7 @@ from torch.multiprocessing import ProcessContext import pytorch_lightning as pl +from lightning_lite.utilities.apply_func import move_data_to_device from pytorch_lightning.strategies.launchers.multiprocessing import ( _FakeQueue, _GlobalStateSnapshot, @@ -29,7 +30,6 @@ ) from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities import _TPU_AVAILABLE -from pytorch_lightning.utilities.apply_func import move_data_to_device from pytorch_lightning.utilities.rank_zero import rank_zero_debug if _TPU_AVAILABLE: diff --git a/src/pytorch_lightning/strategies/strategy.py b/src/pytorch_lightning/strategies/strategy.py index 0abc5fe516273..0d89529a8d115 100644 --- a/src/pytorch_lightning/strategies/strategy.py +++ b/src/pytorch_lightning/strategies/strategy.py @@ -23,6 +23,7 @@ from torch.utils.data import DataLoader import pytorch_lightning as pl +from lightning_lite.utilities.apply_func import move_data_to_device from pytorch_lightning.core.optimizer import _init_optimizers_and_lr_schedulers, LightningOptimizer from pytorch_lightning.plugins import TorchCheckpointIO from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO @@ -30,7 +31,6 @@ from pytorch_lightning.plugins.precision import PrecisionPlugin from 
pytorch_lightning.strategies.launchers.base import _Launcher from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities.apply_func import move_data_to_device from pytorch_lightning.utilities.distributed import ReduceOp from pytorch_lightning.utilities.optimizer import optimizer_to_device, optimizers_to_device from pytorch_lightning.utilities.types import ( diff --git a/src/pytorch_lightning/strategies/tpu_spawn.py b/src/pytorch_lightning/strategies/tpu_spawn.py index 5ca8db74c4620..3c46fe10964ad 100644 --- a/src/pytorch_lightning/strategies/tpu_spawn.py +++ b/src/pytorch_lightning/strategies/tpu_spawn.py @@ -21,6 +21,7 @@ from torch.utils.data import DataLoader import pytorch_lightning as pl +from lightning_lite.utilities.apply_func import apply_to_collection from pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.plugins.environments import XLAEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO @@ -33,7 +34,6 @@ from pytorch_lightning.trainer.connectors.data_connector import DataConnector from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities import _TPU_AVAILABLE, find_shared_parameters, set_shared_parameters -from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.data import has_len from pytorch_lightning.utilities.distributed import ReduceOp from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/src/pytorch_lightning/trainer/connectors/data_connector.py b/src/pytorch_lightning/trainer/connectors/data_connector.py index d8665843945f7..f308d772033c1 100644 --- a/src/pytorch_lightning/trainer/connectors/data_connector.py +++ b/src/pytorch_lightning/trainer/connectors/data_connector.py @@ -21,12 +21,12 @@ from torch.utils.data.distributed import DistributedSampler import pytorch_lightning as pl +from lightning_lite.utilities.apply_func import apply_to_collection from pytorch_lightning.accelerators.ipu import IPUAccelerator from pytorch_lightning.overrides.distributed import DistributedSamplerWrapper, UnrepeatedDistributedSamplerWrapper from pytorch_lightning.strategies import DDPSpawnStrategy from pytorch_lightning.trainer.states import RunningStage, TrainerFn from pytorch_lightning.trainer.supporters import CombinedLoader, CycleIterator -from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.auto_restart import _validate_fault_tolerant_automatic from pytorch_lightning.utilities.data import ( _auto_add_worker_init_fn, diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/src/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index 02e17a8d93494..897fe8a988605 100644 --- a/src/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -16,10 +16,10 @@ from torch import Tensor import pytorch_lightning as pl +from lightning_lite.utilities.apply_func import apply_to_collection, move_data_to_device from pytorch_lightning.loggers import Logger, TensorBoardLogger from pytorch_lightning.plugins.environments.slurm_environment import SLURMEnvironment from pytorch_lightning.trainer.connectors.logger_connector.result import _METRICS, _OUT_DICT, _PBAR_DICT -from pytorch_lightning.utilities.apply_func import apply_to_collection, move_data_to_device from 
pytorch_lightning.utilities.metrics import metrics_to_scalars from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py index 6b9d9ce8268f7..1909ade8a9cd9 100644 --- a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -20,8 +20,8 @@ from torchmetrics import Metric from typing_extensions import TypedDict +from lightning_lite.utilities.apply_func import apply_to_collection, apply_to_collections, move_data_to_device from lightning_lite.utilities.device_dtype_mixin import DeviceDtypeModuleMixin -from pytorch_lightning.utilities.apply_func import apply_to_collection, apply_to_collections, move_data_to_device from pytorch_lightning.utilities.data import extract_batch_size from pytorch_lightning.utilities.distributed import distributed_available from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/src/pytorch_lightning/trainer/supporters.py b/src/pytorch_lightning/trainer/supporters.py index 63213dcbed4b7..84b12bd5b9fb4 100644 --- a/src/pytorch_lightning/trainer/supporters.py +++ b/src/pytorch_lightning/trainer/supporters.py @@ -20,7 +20,7 @@ from torch.utils.data.dataloader import _BaseDataLoaderIter, _MultiProcessingDataLoaderIter, DataLoader from torch.utils.data.dataset import IterableDataset -from pytorch_lightning.utilities.apply_func import apply_to_collection, apply_to_collections +from lightning_lite.utilities.apply_func import apply_to_collection, apply_to_collections from pytorch_lightning.utilities.auto_restart import ( _reload_dataloader_state_dict, MergedIteratorState, diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index 19b3b1192248a..494d56f6c7a1f 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -36,6 +36,7 @@ from torch.utils.data import DataLoader import pytorch_lightning as pl +from lightning_lite.utilities.apply_func import apply_to_collection from lightning_lite.utilities.cloud_io import get_filesystem from pytorch_lightning.accelerators import ( Accelerator, @@ -92,7 +93,6 @@ GradClipAlgorithmType, parsing, ) -from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.argparse import ( _defaults_from_env_vars, add_argparse_args, diff --git a/src/pytorch_lightning/utilities/__init__.py b/src/pytorch_lightning/utilities/__init__.py index 0b4b074a43768..127794f35fdba 100644 --- a/src/pytorch_lightning/utilities/__init__.py +++ b/src/pytorch_lightning/utilities/__init__.py @@ -15,7 +15,7 @@ import numpy -from pytorch_lightning.utilities.apply_func import move_data_to_device # noqa: F401 +from lightning_lite.utilities.apply_func import move_data_to_device # noqa: F401 from pytorch_lightning.utilities.distributed import AllGatherGrad # noqa: F401 from pytorch_lightning.utilities.enums import ( # noqa: F401 _AcceleratorType, diff --git a/src/pytorch_lightning/utilities/auto_restart.py b/src/pytorch_lightning/utilities/auto_restart.py index e90dcc7172690..df6f0508281ce 100644 --- a/src/pytorch_lightning/utilities/auto_restart.py +++ b/src/pytorch_lightning/utilities/auto_restart.py @@ -28,7 +28,7 @@ from typing_extensions import TypedDict import pytorch_lightning as pl -from 
pytorch_lightning.utilities.apply_func import apply_to_collection +from lightning_lite.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.distributed import _collect_states_on_rank_zero from pytorch_lightning.utilities.enums import _FaultTolerantMode, AutoRestartBatchKeys from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/src/pytorch_lightning/utilities/data.py b/src/pytorch_lightning/utilities/data.py index adb425127a81e..6c5c32a6bf811 100644 --- a/src/pytorch_lightning/utilities/data.py +++ b/src/pytorch_lightning/utilities/data.py @@ -33,9 +33,9 @@ ) import pytorch_lightning as pl +from lightning_lite.utilities.apply_func import _is_dataclass_instance from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper from pytorch_lightning.trainer.states import RunningStage -from pytorch_lightning.utilities.apply_func import _is_dataclass_instance from pytorch_lightning.utilities.auto_restart import CaptureIterableDataset, CaptureMapDataset, FastForwardSampler from pytorch_lightning.utilities.enums import _FaultTolerantMode, LightningEnum from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/src/pytorch_lightning/utilities/fetching.py b/src/pytorch_lightning/utilities/fetching.py index fa7e395fbb82b..f0b6e98b8872b 100644 --- a/src/pytorch_lightning/utilities/fetching.py +++ b/src/pytorch_lightning/utilities/fetching.py @@ -19,8 +19,8 @@ import torch from torch.utils.data.dataloader import DataLoader +from lightning_lite.utilities.apply_func import apply_to_collection, apply_to_collections from pytorch_lightning.trainer.supporters import CombinedLoader, CycleIterator -from pytorch_lightning.utilities.apply_func import apply_to_collection, apply_to_collections from pytorch_lightning.utilities.auto_restart import ( _add_capture_metadata_collate, _patch_dataloader_get_iterators, diff --git a/src/pytorch_lightning/utilities/memory.py b/src/pytorch_lightning/utilities/memory.py index 573dd6ed0f129..3480f2e2da50b 100644 --- a/src/pytorch_lightning/utilities/memory.py +++ b/src/pytorch_lightning/utilities/memory.py @@ -24,7 +24,7 @@ from torch import Tensor from torch.nn import Module -from pytorch_lightning.utilities.apply_func import apply_to_collection +from lightning_lite.utilities.apply_func import apply_to_collection def recursive_detach(in_dict: Any, to_cpu: bool = False) -> Any: diff --git a/src/pytorch_lightning/utilities/metrics.py b/src/pytorch_lightning/utilities/metrics.py index 0a9f0dd5f027e..d0752029edcc3 100644 --- a/src/pytorch_lightning/utilities/metrics.py +++ b/src/pytorch_lightning/utilities/metrics.py @@ -18,7 +18,7 @@ from torch import Tensor -from pytorch_lightning.utilities.apply_func import apply_to_collection +from lightning_lite.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/src/pytorch_lightning/utilities/optimizer.py b/src/pytorch_lightning/utilities/optimizer.py index b4cc49627a52c..f3f4734b1c3e9 100644 --- a/src/pytorch_lightning/utilities/optimizer.py +++ b/src/pytorch_lightning/utilities/optimizer.py @@ -17,7 +17,7 @@ from torch import Tensor from torch.optim import Optimizer -from pytorch_lightning.utilities.apply_func import apply_to_collection, move_data_to_device +from lightning_lite.utilities.apply_func import apply_to_collection, move_data_to_device from pytorch_lightning.utilities.types import _DEVICE diff --git 
a/tests/tests_pytorch/utilities/test_apply_func.py b/tests/tests_lite/utilities/test_apply_func.py similarity index 99% rename from tests/tests_pytorch/utilities/test_apply_func.py rename to tests/tests_lite/utilities/test_apply_func.py index 88a4a3fdac734..bbc9b57e0b622 100644 --- a/tests/tests_pytorch/utilities/test_apply_func.py +++ b/tests/tests_lite/utilities/test_apply_func.py @@ -21,7 +21,7 @@ import pytest import torch -from pytorch_lightning.utilities.apply_func import apply_to_collection, apply_to_collections, move_data_to_device +from lightning_lite.utilities.apply_func import apply_to_collection, apply_to_collections, move_data_to_device from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/tests/tests_pytorch/lite/test_parity.py b/tests/tests_pytorch/lite/test_parity.py index d8f5df5f6e1e9..2edca121343e0 100644 --- a/tests/tests_pytorch/lite/test_parity.py +++ b/tests/tests_pytorch/lite/test_parity.py @@ -27,12 +27,12 @@ from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler +from lightning_lite.utilities.apply_func import apply_to_collection, move_data_to_device from lightning_lite.utilities.cloud_io import atomic_save from pytorch_lightning.demos.boring_classes import RandomDataset from pytorch_lightning.lite import LightningLite from pytorch_lightning.plugins.environments.lightning_environment import find_free_network_port from pytorch_lightning.strategies.ddp_spawn import DDPSpawnStrategy -from pytorch_lightning.utilities.apply_func import apply_to_collection, move_data_to_device from tests_pytorch.helpers.runif import RunIf diff --git a/tests/tests_pytorch/trainer/test_supporters.py b/tests/tests_pytorch/trainer/test_supporters.py index 324070fa87602..92be556d198e3 100644 --- a/tests/tests_pytorch/trainer/test_supporters.py +++ b/tests/tests_pytorch/trainer/test_supporters.py @@ -23,6 +23,7 @@ from torch.utils.data.distributed import DistributedSampler from torch.utils.data.sampler import RandomSampler, Sampler, SequentialSampler +from lightning_lite.utilities.apply_func import apply_to_collection from pytorch_lightning import Trainer from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset from pytorch_lightning.trainer.supporters import ( @@ -33,7 +34,6 @@ CycleIterator, TensorRunningAccum, ) -from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.auto_restart import CaptureMapDataset, FastForwardSampler from pytorch_lightning.utilities.data import get_len from pytorch_lightning.utilities.exceptions import MisconfigurationException From 7f148b2c47813987f41b5b2b8a934acea3c1547d Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sat, 3 Sep 2022 15:03:39 +0200 Subject: [PATCH 064/193] Deprecate pl/utilities/apply_func (#14516) --- src/pytorch_lightning/CHANGELOG.md | 4 + src/pytorch_lightning/utilities/apply_func.py | 82 +++++++++++++++++++ .../deprecated_api/test_remove_1-10.py | 37 +++++++++ 3 files changed, 123 insertions(+) create mode 100644 src/pytorch_lightning/utilities/apply_func.py diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 08197e442f367..f517d56fb0465 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -83,6 +83,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
+- Deprecated the functions in `pytorch_lightning.utilities.apply_func` in favor of `lightning_lite.utilities.apply_func` ([#14516](https://github.com/Lightning-AI/lightning/pull/14516)) + + + ### Removed - Removed the deprecated `Trainer.training_type_plugin` property in favor of `Trainer.strategy` ([#14011](https://github.com/Lightning-AI/lightning/pull/14011)) diff --git a/src/pytorch_lightning/utilities/apply_func.py b/src/pytorch_lightning/utilities/apply_func.py new file mode 100644 index 0000000000000..bc516d3fb3d6f --- /dev/null +++ b/src/pytorch_lightning/utilities/apply_func.py @@ -0,0 +1,82 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Utilities used for collections.""" + +from typing import Any + +from lightning_lite.utilities.apply_func import apply_to_collection as new_apply_to_collection +from lightning_lite.utilities.apply_func import apply_to_collections as new_apply_to_collections +from lightning_lite.utilities.apply_func import convert_to_tensors as new_convert_to_tensors +from lightning_lite.utilities.apply_func import from_numpy as new_from_numpy +from lightning_lite.utilities.apply_func import move_data_to_device as new_move_data_to_device +from lightning_lite.utilities.apply_func import to_dtype_tensor as new_to_dtype_tensor +from lightning_lite.utilities.apply_func import TransferableDataType as NewTransferableDataType +from pytorch_lightning.utilities import rank_zero_deprecation + + +def apply_to_collection(*args: Any, **kwargs: Any) -> Any: + rank_zero_deprecation( + "`pytorch_lightning.utilities.apply_func.apply_to_collection` has been deprecated in v1.8.0 and will be" + " removed in v1.10.0. Please use `lightning_lite.utilities.apply_func.apply_to_collection` instead." + ) + return new_apply_to_collection(*args, **kwargs) + + +def apply_to_collections(*args: Any, **kwargs: Any) -> Any: + rank_zero_deprecation( + "`pytorch_lightning.utilities.apply_func.apply_to_collections` has been deprecated in v1.8.0 and will be" + " removed in v1.10.0. Please use `lightning_lite.utilities.apply_func.apply_to_collections` instead." + ) + return new_apply_to_collections(*args, **kwargs) + + +def convert_to_tensors(*args: Any, **kwargs: Any) -> Any: + rank_zero_deprecation( + "`pytorch_lightning.utilities.apply_func.convert_to_tensors` has been deprecated in v1.8.0 and will be" + " removed in v1.10.0. Please use `lightning_lite.utilities.apply_func.convert_to_tensors` instead." + ) + return new_convert_to_tensors(*args, **kwargs) + + +def from_numpy(*args: Any, **kwargs: Any) -> Any: + rank_zero_deprecation( + "`pytorch_lightning.utilities.apply_func.from_numpy` has been deprecated in v1.8.0 and will be" + " removed in v1.10.0. Please use `lightning_lite.utilities.apply_func.from_numpy` instead." 
+ ) + return new_from_numpy(*args, **kwargs) + + +def move_data_to_device(*args: Any, **kwargs: Any) -> Any: + rank_zero_deprecation( + "`pytorch_lightning.utilities.apply_func.move_data_to_device` has been deprecated in v1.8.0 and will be" + " removed in v1.10.0. Please use `lightning_lite.utilities.apply_func.move_data_to_device` instead." + ) + return new_move_data_to_device(*args, **kwargs) + + +def to_dtype_tensor(*args: Any, **kwargs: Any) -> Any: + rank_zero_deprecation( + "`pytorch_lightning.utilities.apply_func.to_dtype_tensor` has been deprecated in v1.8.0 and will be" + " removed in v1.10.0. Please use `lightning_lite.utilities.apply_func.to_dtype_tensor` instead." + ) + return new_to_dtype_tensor(*args, **kwargs) + + +class TransferableDataType(NewTransferableDataType): + def __init__(self) -> None: + rank_zero_deprecation( + "`pytorch_lightning.utilities.apply_func.TransferableDataType` has been deprecated in v1.8.0 and will be" + " removed in v1.10.0. Please use `lightning_lite.utilities.apply_func.TransferableDataType` instead." + ) + super().__init__() diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-10.py b/tests/tests_pytorch/deprecated_api/test_remove_1-10.py index ffb6abfcc9e12..2193085255fb9 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-10.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-10.py @@ -14,7 +14,9 @@ """Test deprecated functionality which will be removed in v1.10.0.""" from unittest import mock +import numpy import pytest +import torch from pytorch_lightning import Trainer from pytorch_lightning.core.mixins.device_dtype_mixin import DeviceDtypeModuleMixin @@ -26,6 +28,15 @@ from pytorch_lightning.strategies.deepspeed import LightningDeepSpeedModule from pytorch_lightning.strategies.ipu import LightningIPUModule from pytorch_lightning.strategies.utils import on_colab_kaggle +from pytorch_lightning.utilities.apply_func import ( + apply_to_collection, + apply_to_collections, + convert_to_tensors, + from_numpy, + move_data_to_device, + to_dtype_tensor, + TransferableDataType, +) from pytorch_lightning.utilities.cloud_io import atomic_save, get_filesystem, load from pytorch_lightning.utilities.xla_device import inner_f, pl_multi_process, XLADeviceUtils from tests_pytorch.helpers.runif import RunIf @@ -110,3 +121,29 @@ def test_v1_10_deprecated_cloud_io_utilities(tmpdir): with pytest.deprecated_call(match="cloud_io.load` has been deprecated in v1.8.0"): load(str(tmpdir / "atomic_save.ckpt")) + + +def test_v1_10_deprecated_apply_func_utilities(): + with pytest.deprecated_call(match="apply_func.apply_to_collection` has been deprecated in v1.8.0"): + apply_to_collection([], dtype=object, function=(lambda x: x)) + + with pytest.deprecated_call(match="apply_func.apply_to_collections` has been deprecated in v1.8.0"): + apply_to_collections([], [], dtype=object, function=(lambda x, y: x)) + + with pytest.deprecated_call(match="apply_func.convert_to_tensors` has been deprecated in v1.8.0"): + convert_to_tensors([], torch.device("cpu")) + + with pytest.deprecated_call(match="apply_func.from_numpy` has been deprecated in v1.8.0"): + from_numpy(numpy.zeros(2), torch.device("cpu")) + + with pytest.deprecated_call(match="apply_func.move_data_to_device` has been deprecated in v1.8.0"): + move_data_to_device(torch.tensor(2), torch.device("cpu")) + + with pytest.deprecated_call(match="apply_func.to_dtype_tensor` has been deprecated in v1.8.0"): + to_dtype_tensor(torch.tensor(2), dtype=torch.float32, device=torch.device("cpu")) + + class 
MyModule(TransferableDataType): + pass + + with pytest.deprecated_call(match="apply_func.TransferableDataType` has been deprecated in v1.8.0"): + MyModule() From 8c6119fbcedccbc17300df1680f41ac30b4b1c79 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Tue, 6 Sep 2022 00:37:26 +0530 Subject: [PATCH 065/193] Add auto wrapping support for `DDPFullyShardedStrategy` (#14383) --- .../advanced/model_parallel.rst | 105 +++++++++++------ docs/source-pytorch/extensions/strategy.rst | 4 +- src/pytorch_lightning/CHANGELOG.md | 3 + .../callbacks/stochastic_weight_avg.py | 7 +- .../strategies/fully_sharded.py | 97 ++++++++++++---- src/pytorch_lightning/strategies/strategy.py | 2 +- .../callbacks/test_stochastic_weight_avg.py | 1 + ..._ddp_fully_sharded_with_full_state_dict.py | 106 +++++++++++++++--- 8 files changed, 248 insertions(+), 77 deletions(-) diff --git a/docs/source-pytorch/advanced/model_parallel.rst b/docs/source-pytorch/advanced/model_parallel.rst index 50ae2cd2827d0..757b7dffa4580 100644 --- a/docs/source-pytorch/advanced/model_parallel.rst +++ b/docs/source-pytorch/advanced/model_parallel.rst @@ -1,7 +1,8 @@ .. _model-parallel: +################################## Train 1 trillion+ parameter models -================================== +################################## When training large models, fitting larger batch sizes, or trying to increase throughput using multi-GPU compute, Lightning provides advanced optimized distributed training strategies to support these cases and offer substantial improvements in memory usage. @@ -19,8 +20,9 @@ Check out this amazing video explaining model parallelism and how it works behin allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen> +********************************************* Choosing an Advanced Distributed GPU Strategy -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +********************************************* If you would like to stick with PyTorch DDP, see :ref:`ddp-optimizations`. @@ -29,7 +31,7 @@ Unlike :class:`~torch.nn.parallel.DistributedDataParallel` (DDP) where the maxim There are many considerations when choosing a strategy as described below. In addition, check out the visualization of various strategy benchmarks using `minGPT `__ `here `__. Pre-training vs Fine-tuning -""""""""""""""""""""""""""" +=========================== When fine-tuning, we often use a magnitude less data compared to pre-training a model. This is important when choosing a distributed strategy as usually for pre-training, **we are compute-bound**. This means we cannot sacrifice throughput as much as if we were fine-tuning, because in fine-tuning the data requirement is smaller. @@ -45,7 +47,7 @@ For example when using 128 GPUs, you can **pre-train** large 10 to 20 Billion pa But for **fine-tuning** a model, you can reach 10 to 20 Billion parameter models using :ref:`deepspeed-zero-stage-3-offload` on a **single GPU**. This does come with a significant throughput hit, which needs to be weighed accordingly. When Shouldn't I use an Optimized Distributed Strategy? -""""""""""""""""""""""""""""""""""""""""""""""""""""""" +======================================================= Sharding techniques help when model sizes are fairly large; roughly 500M+ parameters is where we've seen benefits. However, in the following cases, we recommend sticking to ordinary distributed strategies * When your model is small (ResNet50 of around 80M Parameters), unless you are using unusually large batch sizes or inputs. 
@@ -55,8 +57,10 @@ Sharding techniques help when model sizes are fairly large; roughly 500M+ parame .. _sharded-training: -Sharded Training -^^^^^^^^^^^^^^^^ +************************** +FairScale Sharded Training +************************** + Lightning integration of optimizer sharded training provided by `FairScale `_. The technique can be found within `DeepSpeed ZeRO `_ and `ZeRO-2 `_, @@ -94,7 +98,7 @@ Internally we re-initialize your optimizers and shard them across your machines .. _fully-sharded-training: FairScale Fully Sharded Training -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +================================ .. warning:: FairScale Fully Sharded Training is in BETA and the API is subject to change. Please create an `issue `_ if you run into any problems. @@ -104,7 +108,7 @@ FairScale Fully Sharded Training Fully Sharded Training alleviates the need to worry about balancing layers onto specific devices using some form of pipe parallelism, and optimizes for distributed communication with minimal effort. Shard Parameters to Reach 10+ Billion Parameters -"""""""""""""""""""""""""""""""""""""""""""""""" +------------------------------------------------ To reach larger parameter sizes and to be memory efficient, we have to shard parameters. There are various ways to enable this. @@ -114,9 +118,37 @@ To reach larger parameter sizes and to be memory efficient, we have to shard par This is a limitation of Fully Sharded Training that will be resolved in the future. Enabling Module Sharding for Maximum Memory Efficiency -"""""""""""""""""""""""""""""""""""""""""""""""""""""" +------------------------------------------------------ + +Auto Wrapping +^^^^^^^^^^^^^ + +Model layers should be wrapped in FSDP in a nested way to save peak memory and enable communication and computation overlapping. The +simplest way to do it is auto wrapping, which can serve as a drop-in replacement for DDP without changing the rest of the code. You don't +have to ``wrap`` layers manually as in the case of manual wrapping. + +.. note:: + While initializing the optimizers inside ``configure_optimizers`` hook, make sure to use ``self.trainer.model.parameters()``, else + PyTorch will raise an error. This is required because when you use auto-wrap, the model layers are sharded and your + ``lightning_module.parameters()`` will return a generator with no params. This inconvenience will be addressed in the future. + +.. code-block:: python + + class MyModel(BoringModel): + def configure_optimizers(self): + return torch.optim.AdamW(self.trainer.model.parameters(), lr=1e-2) -To activate parameter sharding, you must wrap your model using the ``wrap`` or ``auto_wrap`` functions. Internally in Lightning, we enable a context manager around the ``configure_sharded_model`` function to make sure the ``wrap`` and ``auto_wrap`` parameters are passed correctly. + + model = MyModel() + trainer = Trainer(accelerator="gpu", devices=4, strategy="fsdp", precision=16) + trainer.fit(model) + + +Manual Wrapping +^^^^^^^^^^^^^^^ + +Manual wrapping can be useful to explore complex sharding strategies by applying ``wrap`` selectively to some parts of the model. To activate +parameter sharding with manual wrapping, you can wrap your model using the ``wrap`` function. Internally in Lightning, we enable a context manager around the ``configure_sharded_model`` function to make sure the ``wrap`` parameters are passed correctly. When not using Fully Sharded Training these wrap functions are a no-op. 
That means once the changes have been made, there is no need to remove the changes for other strategies. @@ -164,7 +196,7 @@ Here's an example using both ``wrap`` and ``auto_wrap`` to create your model: self.model = nn.Sequential(linear_layer, nn.ReLU(), block, final_block) def configure_optimizers(self): - return torch.optim.AdamW(self.model.parameters()) + return torch.optim.AdamW(self.model.parameters(), lr=1e-2) model = MyModel() @@ -178,8 +210,8 @@ Here's an example using both ``wrap`` and ``auto_wrap`` to create your model: .. _fairscale-activation-checkpointing: -FairScale Activation Checkpointing -"""""""""""""""""""""""""""""""""" +Activation Checkpointing +------------------------ Activation checkpointing frees activations from memory as soon as they are not needed during the forward pass. They are then re-computed for the backwards pass as needed. Activation checkpointing is very useful when you have intermediate layers that produce large activations. @@ -208,8 +240,9 @@ This saves memory when training larger models, however it requires wrapping modu .. _fully-sharded-native-training: +****************************** PyTorch Fully Sharded Training -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +****************************** PyTorch has it's own version of `FSDP `_ which is upstreamed from their `fairscale `__ project. It was introduced in their `v1.11.0 release `_ but it is recommended to use it with PyTorch v1.12 or more and that's what @@ -217,7 +250,8 @@ Lightning supports. The API is pretty similar to that of FairScale. Auto Wrapping -""""""""""""" +============= + Model layers should be wrapped in FSDP in a nested way to save peak memory and enable communication and computation overlapping. The simplest way to do it is auto wrapping, which can serve as a drop-in replacement for DDP without changing the rest of the code. You don't have to ``wrap`` layers manually as in the case of manual wrapping. @@ -233,7 +267,7 @@ Read more `here `_ if you run into any issues. @@ -343,7 +378,7 @@ If you run into an issue with the install or later in training, ensure that the .. _deepspeed-zero-stage-1: DeepSpeed ZeRO Stage 1 -"""""""""""""""""""""" +====================== `DeepSpeed ZeRO Stage 1 `_ partitions your optimizer states (Stage 1) across your GPUs to reduce memory. @@ -361,7 +396,7 @@ It is recommended to skip Stage 1 and use Stage 2, which comes with larger memor .. _deepspeed-zero-stage-2: DeepSpeed ZeRO Stage 2 -"""""""""""""""""""""" +====================== `DeepSpeed ZeRO Stage 2 `_ partitions your optimizer states (Stage 1) and your gradients (Stage 2) across your GPUs to reduce memory. In most cases, this is more efficient or at parity with DDP, primarily due to the optimized custom communications written by the DeepSpeed team. As a result, benefits can also be seen on a single GPU. Do note that the default bucket sizes allocate around ``3.6GB`` of VRAM to use during distributed communications, which can be tweaked when instantiating the strategy described in a few sections below. @@ -382,7 +417,7 @@ As a result, benefits can also be seen on a single GPU. Do note that the default .. _deepspeed-zero-stage-2-offload: DeepSpeed ZeRO Stage 2 Offload -"""""""""""""""""""""""""""""" +------------------------------ Below we show an example of running `ZeRO-Offload `_. ZeRO-Offload leverages the host CPU to offload optimizer memory/computation, reducing the overall memory consumption. @@ -452,7 +487,7 @@ For even more speed benefit, DeepSpeed offers an optimized CPU version of ADAM c .. 
_deepspeed-zero-stage-3: DeepSpeed ZeRO Stage 3 -"""""""""""""""""""""" +====================== DeepSpeed ZeRO Stage 3 shards the optimizer states, gradients and the model parameters (also optionally activations). Sharding model parameters and activations comes with an increase in distributed communication, however allows you to scale your models massively from one GPU to multiple GPUs. **The DeepSpeed team report the ability to fine-tune models with over 40B parameters on a single GPU and over 2 Trillion parameters on 512 GPUs.** For more information we suggest checking the `DeepSpeed ZeRO-3 Offload documentation `__. @@ -511,7 +546,7 @@ You can also use the Lightning Trainer to run predict or evaluate with DeepSpeed Shard Model Instantly to Reduce Initialization Time/Memory -"""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +---------------------------------------------------------- When instantiating really large models, it is sometimes necessary to shard the model layers instantly. @@ -550,7 +585,7 @@ This reduces the time taken to initialize very large models, as well as ensure w .. _deepspeed-zero-stage-3-offload: DeepSpeed ZeRO Stage 3 Offload -"""""""""""""""""""""""""""""" +------------------------------ DeepSpeed ZeRO Stage 3 Offloads optimizer state, gradients to the host CPU to reduce memory usage as ZeRO Stage 2 does, however additionally allows you to offload the parameters as well for even more memory saving. @@ -584,7 +619,7 @@ DeepSpeed ZeRO Stage 3 Offloads optimizer state, gradients to the host CPU to re DeepSpeed Infinity (NVMe Offloading) -"""""""""""""""""""""""""""""""""""" +------------------------------------ Additionally, DeepSpeed supports offloading to NVMe drives for even larger models, utilizing the large memory space found in NVMes. DeepSpeed `reports `__ the ability to fine-tune 1 Trillion+ parameters using NVMe Offloading on one 8 GPU machine. Below shows how to enable this, assuming the NVMe drive is mounted in a directory called ``/local_nvme``. @@ -621,7 +656,7 @@ When offloading to NVMe you may notice that the speed is slow. There are paramet .. _deepspeed-activation-checkpointing: DeepSpeed Activation Checkpointing -"""""""""""""""""""""""""""""""""" +---------------------------------- Activation checkpointing frees activations from memory as soon as they are not needed during the forward pass. They are then re-computed for the backwards pass as needed. @@ -697,7 +732,7 @@ This saves memory when training larger models, however requires using a checkpoi .. _deepspeed-zero-stage-3-tips: DeepSpeed ZeRO Stage 3 Tips -""""""""""""""""""""""""""" +--------------------------- Here is some helpful information when setting up DeepSpeed ZeRO Stage 3 with Lightning. @@ -709,7 +744,7 @@ Here is some helpful information when setting up DeepSpeed ZeRO Stage 3 with Lig .. _deepspeed-zero-stage-3-single-file: Collating Single File Checkpoint for DeepSpeed ZeRO Stage 3 -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +----------------------------------------------------------- After training using ZeRO Stage 3, you'll notice that your checkpoints are a directory of sharded model and optimizer states. If you'd like to collate a single file from the checkpoint directory please use the below command, which handles all the Lightning states additionally when collating the file. 
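For readers who want to see what the collation step described above looks like in practice, here is a minimal sketch. It assumes the `convert_zero_checkpoint_to_fp32_state_dict` helper from `pytorch_lightning.utilities.deepspeed` (available in recent releases); the paths are illustrative:

.. code-block:: python

    # Sketch: collate a DeepSpeed ZeRO Stage 3 sharded checkpoint directory into a single file.
    # The checkpoint path below is illustrative; point it at your own run's checkpoint directory.
    from pytorch_lightning.utilities.deepspeed import convert_zero_checkpoint_to_fp32_state_dict

    save_path = "lightning_logs/version_0/checkpoints/epoch=0-step=100.ckpt"  # directory of shards
    output_path = "lightning_model.pt"  # single consolidated checkpoint file
    convert_zero_checkpoint_to_fp32_state_dict(save_path, output_path)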
@@ -728,7 +763,7 @@ After training using ZeRO Stage 3, you'll notice that your checkpoints are a dir This single file checkpoint does not include the optimizer/lr-scheduler states. This means we cannot restore training via the ``trainer.fit(ckpt_path=)`` call. Ensure to keep the sharded checkpoint directory if this is required. Custom DeepSpeed Config -""""""""""""""""""""""" +======================= In some cases you may want to define your own DeepSpeed Config, to access all parameters defined. We've exposed most of the important parameters, however, there may be debugging parameters to enable. Also, DeepSpeed allows the use of custom DeepSpeed optimizers and schedulers defined within a config file that is supported. @@ -801,12 +836,13 @@ You can use also use an environment variable via your PyTorch Lightning script: .. _ddp-optimizations: +***************** DDP Optimizations -^^^^^^^^^^^^^^^^^ +***************** When Using DDP Strategies, Set find_unused_parameters=False -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +=========================================================== By default, we have set ``find_unused_parameters=True`` for compatibility reasons that have been observed in the past (refer to the `discussion `_ for more details). When enabled, it can result in a performance hit and can be disabled in most cases. Read more about it `here `_. @@ -836,7 +872,7 @@ When enabled, it can result in a performance hit and can be disabled in most cas DDP Static Graph -"""""""""""""""" +================ `DDP static graph `__ assumes that your model employs the same set of used/unused parameters in every iteration, so that it can deterministically know the flow of @@ -854,7 +890,7 @@ training and apply special optimizations during runtime. When Using DDP on a Multi-node Cluster, Set NCCL Parameters -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +=========================================================== `NCCL `__ is the NVIDIA Collective Communications Library that is used by PyTorch to handle communication across nodes and GPUs. There are reported benefits in terms of speedups when adjusting NCCL parameters as seen in this `issue `__. In the issue, we see a 30% speed improvement when training the Transformer XLM-RoBERTa and a 15% improvement in training with Detectron2. @@ -875,7 +911,7 @@ NCCL parameters can be adjusted via environment variables. Gradients as Bucket View -"""""""""""""""""""""""" +======================== Enabling ``gradient_as_bucket_view=True`` in the ``DDPStrategy`` will make gradients views point to different offsets of the ``allreduce`` communication buckets. See :class:`~torch.nn.parallel.DistributedDataParallel` for more information. @@ -894,8 +930,9 @@ This can reduce peak memory usage and throughput as saved memory will be equal t trainer = Trainer(accelerator="gpu", devices=4, strategy=DDPStrategy(gradient_as_bucket_view=True)) trainer.fit(model) + DDP Communication Hooks -""""""""""""""""""""""" +======================= DDP Communication hooks is an interface to control how gradients are communicated across workers, overriding the standard allreduce in DistributedDataParallel. This allows you to enable performance improving communication hooks when using multiple nodes. 
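To make the communication-hook mechanism mentioned above concrete, here is a rough sketch that registers PyTorch's built-in fp16 gradient-compression hook through ``DDPStrategy``; it assumes the ``ddp_comm_hook`` argument and the hook utilities shipped with recent PyTorch versions:

.. code-block:: python

    # Sketch: compress gradients to fp16 during allreduce via a DDP communication hook.
    from torch.distributed.algorithms.ddp_comm_hooks import default_hooks as default

    from pytorch_lightning import Trainer
    from pytorch_lightning.strategies import DDPStrategy

    trainer = Trainer(
        accelerator="gpu",
        devices=4,
        strategy=DDPStrategy(ddp_comm_hook=default.fp16_compress_hook),
    )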
diff --git a/docs/source-pytorch/extensions/strategy.rst b/docs/source-pytorch/extensions/strategy.rst index ed39f68d45e23..21a6e8a8814b2 100644 --- a/docs/source-pytorch/extensions/strategy.rst +++ b/docs/source-pytorch/extensions/strategy.rst @@ -83,10 +83,10 @@ The below table lists all relevant strategies available in Lightning with their - Strategy for Fully Sharded Data Parallel provided by FairScale. :ref:`Learn more. ` * - ddp_sharded - :class:`~pytorch_lightning.strategies.DDPShardedStrategy` - - Optimizer and gradient sharded training provided by FairScale. :ref:`Learn more. ` + - Optimizer and gradient sharded training provided by FairScale. :ref:`Learn more. ` * - ddp_sharded_spawn - :class:`~pytorch_lightning.strategies.DDPSpawnShardedStrategy` - - Optimizer sharded training provided by FairScale. :ref:`Learn more. ` + - Optimizer sharded training provided by FairScale. :ref:`Learn more. ` * - ddp_spawn - :class:`~pytorch_lightning.strategies.DDPSpawnStrategy` - Spawns processes using the :func:`torch.multiprocessing.spawn` method and joins processes after training finishes. :ref:`Learn more. ` diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index f517d56fb0465..0d5d55d3324ec 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -21,6 +21,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added support for saving sharded optimizer state dict outside of `DDPShardedStrategy` ([#14208](https://github.com/PyTorchLightning/pytorch-lightning/pull/14208)) +- Added support for auto wrapping for `DDPFullyShardedStrategy` ([#14383](https://github.com/Lightning-AI/lightning/issues/14383)) + + ### Changed diff --git a/src/pytorch_lightning/callbacks/stochastic_weight_avg.py b/src/pytorch_lightning/callbacks/stochastic_weight_avg.py index 90e2c62a7962d..51cbceb7f9fb6 100644 --- a/src/pytorch_lightning/callbacks/stochastic_weight_avg.py +++ b/src/pytorch_lightning/callbacks/stochastic_weight_avg.py @@ -25,6 +25,7 @@ import pytorch_lightning as pl from pytorch_lightning.callbacks.callback import Callback from pytorch_lightning.strategies import DDPFullyShardedStrategy, DeepSpeedStrategy +from pytorch_lightning.strategies.fully_sharded_native import DDPFullyShardedNativeStrategy from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.rank_zero import rank_zero_info, rank_zero_warn from pytorch_lightning.utilities.types import _LRScheduler, LRSchedulerConfig @@ -144,6 +145,9 @@ def pl_module_contains_batch_norm(pl_module: "pl.LightningModule") -> bool: return any(isinstance(module, nn.modules.batchnorm._BatchNorm) for module in pl_module.modules()) def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: str) -> None: + if isinstance(trainer.strategy, (DDPFullyShardedStrategy, DDPFullyShardedNativeStrategy, DeepSpeedStrategy)): + raise MisconfigurationException("SWA does not currently support sharded models.") + # copy the model before moving it to accelerator device. 
with pl_module._prevent_trainer_and_dataloaders_deepcopy(): self._average_model = deepcopy(pl_module) @@ -155,9 +159,6 @@ def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") - if len(trainer.lr_scheduler_configs) > 1: raise MisconfigurationException("SWA currently not supported for more than 1 `lr_scheduler`.") - if isinstance(trainer.strategy, (DDPFullyShardedStrategy, DeepSpeedStrategy)): - raise MisconfigurationException("SWA does not currently support sharded models.") - if isinstance(self._swa_epoch_start, float): self._swa_epoch_start = int(trainer.max_epochs * self._swa_epoch_start) diff --git a/src/pytorch_lightning/strategies/fully_sharded.py b/src/pytorch_lightning/strategies/fully_sharded.py index 6f7ca3b34b03d..a364d7d19a679 100644 --- a/src/pytorch_lightning/strategies/fully_sharded.py +++ b/src/pytorch_lightning/strategies/fully_sharded.py @@ -18,6 +18,7 @@ import torch import pytorch_lightning as pl +from pytorch_lightning.overrides.base import _LightningModuleWrapperBase from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO @@ -26,16 +27,28 @@ from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.optimizer import optimizers_to_device -from pytorch_lightning.utilities.types import PredictStep, STEP_OUTPUT, TestStep, TrainingStep, ValidationStep +from pytorch_lightning.utilities.rank_zero import rank_zero_info +from pytorch_lightning.utilities.types import STEP_OUTPUT if _FAIRSCALE_AVAILABLE: from fairscale.nn import default_auto_wrap_policy, enable_wrap from fairscale.nn.data_parallel import FullyShardedDataParallel +else: + FullyShardedDataParallel = None log = logging.getLogger(__name__) +class _DDPFullyShardedStrategyModuleWrapper(_LightningModuleWrapperBase): + def state_dict(self, *args: Any, **kwargs: Any) -> Dict[str, Any]: # type: ignore[override] + # this is required because with FSDP the lightning_module is empty, as its weights are sharded. + # So we need to call self.trainer.model.state_dict (wrapped version) and use this wrapper to + # avoid extra keys `_forward_module.layer.weight.` since we want `layer.weight.` in state_dict. 
+ return self._forward_module.state_dict(*args, **kwargs) + + class DDPFullyShardedStrategy(DDPStrategy): strategy_name = "ddp_fully_sharded" @@ -132,6 +145,25 @@ def process_group(self) -> Any: self._process_group = torch.distributed.new_group() return self._process_group + def lightning_module_state_dict(self) -> Dict[str, Any]: + """Returns model state.""" + assert self.model is not None + return self.model.state_dict() + + def connect(self, model: "pl.LightningModule") -> None: + """Called by the accelerator to connect the accelerator and the model with this plugin.""" + # TODO: Wait for this issue to resolve and remove this blocker + # https://github.com/facebookresearch/fairscale/issues/648 + # Also make sure to update the tests + if not is_overridden("configure_sharded_model", self.lightning_module) and len(list(model.parameters())) == 0: + assert self.lightning_module is not None + raise MisconfigurationException( + f"Using the same instance of model with `trainer.{self.lightning_module.trainer.state.fn}()` is not" + " supported with Fairscale FSDP auto-wrap. Please reinitialize your `LightningModule` and pass that." + ) + + super().connect(model) + def setup_distributed(self) -> None: if not self.root_device.type == "cuda": raise MisconfigurationException( @@ -144,17 +176,46 @@ def setup(self, trainer: "pl.Trainer") -> None: self.accelerator.setup(trainer) if trainer.state.fn == TrainerFn.FITTING: - self.setup_optimizers(trainer) - optimizers_to_device(self.optimizers, self.root_device) - if self._layer_sync: assert self.model self.model = self._layer_sync.apply(self.model) - self.setup_precision_plugin() self.configure_ddp() + assert isinstance(self.model, pl.LightningModule) + self.model = _DDPFullyShardedStrategyModuleWrapper(self.model) + assert self.lightning_module is not None + if not is_overridden("configure_sharded_model", self.lightning_module): + self.model = self._setup_model(self.model) + self.setup_optimizers(self.lightning_module.trainer) + optimizers_to_device(self.optimizers, self.root_device) self.barrier() + self.setup_precision_plugin() + + def _setup_model(self, model: torch.nn.Module) -> FullyShardedDataParallel: + """Wraps the model into a + :class:`~fairscale.nn.data_parallel.fully_sharded_data_parallel.FullyShardedDataParallel` module.""" + log.detail(f"setting up `Fairscale FSDP` model with device id: {self.root_device.index}.") + + rank_zero_info( + "When using FairScale FSDP auto-wrap, make sure to initalize your model using trainer else" + " you will get an error.\ntorch.optim.Optimizer(self.trainer.model.parameters(), ...)" + ) + + return FullyShardedDataParallel( + module=model, + process_group=self.process_group, + cpu_offload=self.cpu_offload, + move_grads_to_cpu=self.move_grads_to_cpu, + flatten_parameters=self.flatten_parameters, + mixed_precision=(self.precision_plugin.precision in (PrecisionType.MIXED, PrecisionType.HALF)), + reshard_after_forward=self.reshard_after_forward, + fp32_reduce_scatter=self.fp32_reduce_scatter, + compute_dtype=self.compute_dtype, + bucket_cap_mb=self.bucket_cap_mb, + state_dict_device=self.state_dict_device, + ) + @contextlib.contextmanager def model_sharded_context(self) -> Generator: log.detail(f"{self.__class__.__name__}: entered model_sharded_context.") @@ -190,10 +251,6 @@ def configure_ddp(self) -> None: # (TODO: need to figure out solution) self.model_to_device() - # setup optimizers after fully sharded has wrapped the lightning module - assert self.lightning_module - 
self.setup_optimizers(self.lightning_module.trainer) - def model_to_device(self) -> None: log.detail(f"{self.__class__.__name__}: moving model to device [{self.root_device}]...") # ensure we update the device type in the lightning module @@ -201,24 +258,22 @@ def model_to_device(self) -> None: self.lightning_module.to(self.root_device) def training_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: - with self.precision_plugin.train_step_context(): - assert isinstance(self.model, TrainingStep) - return self.model.training_step(*args, **kwargs) + # we don't need precision context since casting is done by FSDP + # read `mixed_precision` docstring here: https://pytorch.org/docs/stable/fsdp.html + assert self.model is not None + return self.model(*args, **kwargs) def validation_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: - with self.precision_plugin.val_step_context(): - assert isinstance(self.model, ValidationStep) - return self.model.validation_step(*args, **kwargs) + assert self.model is not None + return self.model(*args, **kwargs) def test_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: - with self.precision_plugin.test_step_context(): - assert isinstance(self.model, TestStep) - return self.model.test_step(*args, **kwargs) + assert self.model is not None + return self.model(*args, **kwargs) def predict_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: - with self.precision_plugin.predict_step_context(): - assert isinstance(self.model, PredictStep) - return self.model.predict_step(*args, **kwargs) + assert self.model is not None + return self.model(*args, **kwargs) def post_training_step(self) -> None: pass diff --git a/src/pytorch_lightning/strategies/strategy.py b/src/pytorch_lightning/strategies/strategy.py index 0d89529a8d115..0a10722166f8d 100644 --- a/src/pytorch_lightning/strategies/strategy.py +++ b/src/pytorch_lightning/strategies/strategy.py @@ -443,7 +443,7 @@ def handles_gradient_accumulation(self) -> bool: """Whether the plugin handles gradient accumulation internally.""" return False - def lightning_module_state_dict(self) -> Dict[str, Union[Any, Tensor]]: + def lightning_module_state_dict(self) -> Dict[str, Any]: """Returns model state.""" assert self.lightning_module is not None return self.lightning_module.state_dict() diff --git a/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py b/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py index a39a7a2145225..f18fce183f4cd 100644 --- a/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py +++ b/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py @@ -348,6 +348,7 @@ def test_swa_resume_training_from_checkpoint_ddp(tmpdir): [ pytest.param("fsdp", marks=RunIf(fairscale=True, min_cuda_gpus=1)), pytest.param("deepspeed", marks=RunIf(deepspeed=True, min_cuda_gpus=1)), + pytest.param("fsdp_native", marks=RunIf(min_cuda_gpus=1, skip_windows=True, min_torch="1.12")), ], ) def test_misconfiguration_error_with_sharded_model(tmpdir, strategy: str): diff --git a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py index fe587877e84fb..88a07a78efecf 100644 --- a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py +++ b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py @@ -41,7 +41,7 @@ def test_fsdp_with_sharded_amp(device_count_mock, mock_cuda_available, tmpdir): assert 
isinstance(trainer.strategy.precision_plugin, FullyShardedNativeMixedPrecisionPlugin) -class TestFSDPModel(BoringModel): +class TestFSDPModelManualWrapped(BoringModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.layer: Optional[torch.nn.Module] = None @@ -69,16 +69,16 @@ def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None: def configure_optimizers(self): return torch.optim.SGD(self.layer.parameters(), lr=0.1) - def on_train_start(self) -> None: + def on_train_batch_end(self, *_, **__) -> None: self._assert_layer_fsdp_instance() - def on_test_start(self) -> None: + def on_test_batch_end(self, *_, **__) -> None: self._assert_layer_fsdp_instance() - def on_validation_start(self) -> None: + def on_validation_batch_end(self, *_, **__) -> None: self._assert_layer_fsdp_instance() - def on_prediction_start(self) -> None: + def on_prediction_batch_end(self, *_, **__) -> None: self._assert_layer_fsdp_instance() def _assert_layer_fsdp_instance(self) -> None: @@ -87,8 +87,8 @@ def _assert_layer_fsdp_instance(self) -> None: assert isinstance(self.layer.module[2], FullyShardedDataParallel) # Assert that the nested layers are set reshard_after_forward to True - assert self.layer.module[0].reshard_after_forward is True - assert self.layer.module[2].reshard_after_forward is True + assert self.layer.module[0].reshard_after_forward + assert self.layer.module[2].reshard_after_forward if isinstance(self.trainer.precision_plugin, FullyShardedNativeMixedPrecisionPlugin): assert self.layer.mixed_precision @@ -96,11 +96,40 @@ def _assert_layer_fsdp_instance(self) -> None: assert self.layer.module[2].mixed_precision +class TestFSDPModelAutoWrapped(BoringModel): + def __init__(self): + super().__init__() + self.layer = torch.nn.Sequential(torch.nn.Linear(32, 32), torch.nn.ReLU(), torch.nn.Linear(32, 2)) + + def configure_optimizers(self): + return torch.optim.SGD(self.trainer.model.parameters(), lr=0.1) + + def on_train_batch_end(self, *_, **__) -> None: + self._assert_layer_fsdp_instance() + + def on_test_batch_end(self, *_, **__) -> None: + self._assert_layer_fsdp_instance() + + def on_validation_batch_end(self, *_, **__) -> None: + self._assert_layer_fsdp_instance() + + def on_prediction_batch_end(self, *_, **__) -> None: + self._assert_layer_fsdp_instance() + + def _assert_layer_fsdp_instance(self) -> None: + assert isinstance(self.trainer.model, FullyShardedDataParallel) + # `disable_reshard_on_root=True` (default) in FSDP which turns-off resharding + assert not self.trainer.model.reshard_after_forward + + if isinstance(self.trainer.precision_plugin, FullyShardedNativeMixedPrecisionPlugin): + assert self.trainer.model.mixed_precision + + @RunIf(min_cuda_gpus=1, standalone=True, fairscale=True) def test_fully_sharded_strategy_checkpoint(tmpdir): """Test to ensure that checkpoint is saved correctly when using a single GPU, and all stages can be run.""" - model = TestFSDPModel() + model = TestFSDPModelManualWrapped() trainer = Trainer( default_root_dir=tmpdir, accelerator="gpu", @@ -115,18 +144,28 @@ def test_fully_sharded_strategy_checkpoint(tmpdir): @RunIf(min_cuda_gpus=2, standalone=True, fairscale=True) -def test_fully_sharded_strategy_checkpoint_multi_gpus(tmpdir): +@pytest.mark.parametrize( + "model, strategy", + [ + (TestFSDPModelManualWrapped(), DDPFullyShardedStrategy(min_num_params=2)), + (TestFSDPModelAutoWrapped(), "fsdp"), + ], +) +def test_fully_sharded_strategy_checkpoint_multi_gpus(tmpdir, model, strategy): """Test to ensure that checkpoint is saved 
correctly when using multiple GPUs, and all stages can be run.""" - model = TestFSDPModel() ck = ModelCheckpoint(save_last=True) trainer = Trainer( default_root_dir=tmpdir, accelerator="gpu", devices=2, - strategy="fsdp", + strategy=strategy, precision=16, max_epochs=1, + limit_train_batches=2, + limit_val_batches=2, + limit_test_batches=2, + limit_predict_batches=2, callbacks=[ck], enable_progress_bar=False, enable_model_summary=False, @@ -134,7 +173,7 @@ def test_fully_sharded_strategy_checkpoint_multi_gpus(tmpdir): _run_multiple_stages(trainer, model) -def _assert_save_equality(trainer, ckpt_path, cls=TestFSDPModel): +def _assert_save_equality(trainer, ckpt_path, cls=TestFSDPModelManualWrapped): # Use FullySharded to get the state dict for the sake of comparison model_state_dict = trainer.strategy.lightning_module_state_dict() @@ -153,19 +192,36 @@ def _run_multiple_stages(trainer, model, model_path: Optional[str] = None): trainer.save_checkpoint(model_path, weights_only=True) - _assert_save_equality(trainer, model_path, cls=TestFSDPModel) + _assert_save_equality(trainer, model_path, cls=model.__class__) # Test entry point + if model.__class__ is TestFSDPModelAutoWrapped: + model = TestFSDPModelAutoWrapped() trainer.test(model) # model is wrapped, will not call configure_shared_model - # provide model path, will create a new unwrapped model and load and then call configure_shared_model to wrap - trainer.test(ckpt_path=model_path) + # provide model path, will create a new unwrapped model and load and then call `configure_shared_model` to wrap + if model.__class__ is TestFSDPModelAutoWrapped: + model = TestFSDPModelAutoWrapped() + trainer.test(model, ckpt_path=model_path) + + # Predict entry point + if model.__class__ is TestFSDPModelAutoWrapped: + model = TestFSDPModelAutoWrapped() + + if model.__class__ is TestFSDPModelAutoWrapped: + model = TestFSDPModelAutoWrapped() + trainer.predict(model) # model is wrapped, will not call `configure_sharded_model` + + # provide model path, will create a new unwrapped model and load and then call `configure_shared_model` to wrap + if model.__class__ is TestFSDPModelAutoWrapped: + model = TestFSDPModelAutoWrapped() + trainer.predict(model, ckpt_path=model_path) @RunIf(min_cuda_gpus=1, standalone=True, fairscale=True) def test_fsdp_gradient_clipping_raises(tmpdir): """Test to ensure that an exception is raised when clipping gradients by value with FSDP.""" - model = BoringModel() + model = TestFSDPModelManualWrapped() trainer = Trainer( default_root_dir=tmpdir, strategy="fsdp", @@ -182,3 +238,21 @@ def test_fsdp_gradient_clipping_raises(tmpdir): MisconfigurationException, match="gradient_clip_algorithm='norm'` is currently not supported for `FullySharded" ): trainer.fit(model) + + +@RunIf(min_cuda_gpus=1, skip_windows=True, standalone=True, fairscale_fully_sharded=True) +def test_fsdp_rewrap_limitation(tmpdir): + trainer = Trainer( + default_root_dir=tmpdir, + accelerator="gpu", + devices=1, + max_steps=1, + limit_val_batches=0, + limit_test_batches=1, + strategy="fsdp", + ) + model = TestFSDPModelAutoWrapped() + trainer.fit(model) + + with pytest.raises(MisconfigurationException, match="Using the same instance of model .* not supported"): + trainer.test(model) From 8a4a3b67661d6c2163280dc4a1184f9094e040f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Tue, 6 Sep 2022 14:17:15 +0200 Subject: [PATCH 066/193] Mark the lite `DeviceDtypeModuleMixin` as protected (#14548) --- docs/source-pytorch/api_references.rst | 1 - 
src/lightning_lite/utilities/device_dtype_mixin.py | 8 ++++---- src/pytorch_lightning/CHANGELOG.md | 2 +- src/pytorch_lightning/core/mixins/__init__.py | 2 +- src/pytorch_lightning/core/mixins/device_dtype_mixin.py | 7 +++---- src/pytorch_lightning/core/module.py | 6 +++--- src/pytorch_lightning/lite/wrappers.py | 4 ++-- src/pytorch_lightning/overrides/base.py | 6 +++--- .../trainer/connectors/logger_connector/result.py | 4 ++-- tests/tests_lite/utilities/test_device_dtype_mixin.py | 8 ++++---- tests/tests_pytorch/lite/test_wrappers.py | 4 ++-- tests/tests_pytorch/utilities/test_dtype_device_mixin.py | 4 ++-- 12 files changed, 27 insertions(+), 29 deletions(-) diff --git a/docs/source-pytorch/api_references.rst b/docs/source-pytorch/api_references.rst index 9203f60ef3c02..2d9cc4572f4d7 100644 --- a/docs/source-pytorch/api_references.rst +++ b/docs/source-pytorch/api_references.rst @@ -76,7 +76,6 @@ core ~hooks.ModelHooks LightningDataModule LightningModule - ~mixins.DeviceDtypeModuleMixin ~mixins.HyperparametersMixin ~optimizer.LightningOptimizer ~saving.ModelIO diff --git a/src/lightning_lite/utilities/device_dtype_mixin.py b/src/lightning_lite/utilities/device_dtype_mixin.py index 5086583d8e26f..b889288ea5e0a 100644 --- a/src/lightning_lite/utilities/device_dtype_mixin.py +++ b/src/lightning_lite/utilities/device_dtype_mixin.py @@ -19,7 +19,7 @@ from typing_extensions import Self -class DeviceDtypeModuleMixin(Module): +class _DeviceDtypeModuleMixin(Module): __jit_unused_properties__ = ["device", "dtype"] def __init__(self) -> None: @@ -79,7 +79,7 @@ def to(self, *args: Any, **kwargs: Any) -> Self: # type: ignore[valid-type] Example:: >>> from torch import Tensor - >>> class ExampleModule(DeviceDtypeModuleMixin): + >>> class ExampleModule(_DeviceDtypeModuleMixin): ... def __init__(self, weight: Tensor): ... super().__init__() ... self.register_buffer('weight', weight) @@ -180,8 +180,8 @@ def half(self) -> Self: # type: ignore[valid-type] def __update_properties( self, device: Optional[torch.device] = None, dtype: Optional[Union[str, torch.dtype]] = None ) -> None: - def apply_fn(module: Union[DeviceDtypeModuleMixin, Module]) -> None: - if not isinstance(module, DeviceDtypeModuleMixin): + def apply_fn(module: Union[_DeviceDtypeModuleMixin, Module]) -> None: + if not isinstance(module, _DeviceDtypeModuleMixin): return if device is not None: module._device = device diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 0d5d55d3324ec..0bcef15dbcb30 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -75,7 +75,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Deprecated the `on_colab_kaggle` function ([#14247](https://github.com/Lightning-AI/lightning/pull/14247)) -- Deprecated the `pl.core.mixins.DeviceDtypeModuleMixin` in favor of `lightning_lite.utilities.DeviceDtypeModuleMixin` ([#14511](https://github.com/Lightning-AI/lightning/pull/14511)) +- Deprecated the internal `pl.core.mixins.DeviceDtypeModuleMixin` class ([#14511](https://github.com/Lightning-AI/lightning/pull/14511), [#14548](https://github.com/Lightning-AI/lightning/pull/14548)) - Deprecated all functions in `pytorch_lightning.utilities.xla_device` in favor of `lightning_lite.utilities.xla_device` ([#14514](https://github.com/Lightning-AI/lightning/pull/14514)) diff --git a/src/pytorch_lightning/core/mixins/__init__.py b/src/pytorch_lightning/core/mixins/__init__.py index 3671ced07aa93..42ee5fc7748c6 100644 --- a/src/pytorch_lightning/core/mixins/__init__.py +++ b/src/pytorch_lightning/core/mixins/__init__.py @@ -12,5 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. -from lightning_lite.utilities.device_dtype_mixin import DeviceDtypeModuleMixin # noqa: F401 +from lightning_lite.utilities.device_dtype_mixin import _DeviceDtypeModuleMixin # noqa: F401 from pytorch_lightning.core.mixins.hparams_mixin import HyperparametersMixin # noqa: F401 diff --git a/src/pytorch_lightning/core/mixins/device_dtype_mixin.py b/src/pytorch_lightning/core/mixins/device_dtype_mixin.py index 9e85fb7742b19..c1b2a3294e65b 100644 --- a/src/pytorch_lightning/core/mixins/device_dtype_mixin.py +++ b/src/pytorch_lightning/core/mixins/device_dtype_mixin.py @@ -12,15 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -from lightning_lite.utilities.device_dtype_mixin import DeviceDtypeModuleMixin as NewDeviceDtypeModuleMixin +from lightning_lite.utilities.device_dtype_mixin import _DeviceDtypeModuleMixin from pytorch_lightning.utilities import rank_zero_deprecation -class DeviceDtypeModuleMixin(NewDeviceDtypeModuleMixin): +class DeviceDtypeModuleMixin(_DeviceDtypeModuleMixin): def __init__(self) -> None: rank_zero_deprecation( "`pytorch_lightning.core.mixins.DeviceDtypeModuleMixin` has been deprecated in v1.8.0 and will be" - " removed in v1.10.0. Please use `lightning_lite.utilities.device_dtype_mixin.DeviceDtypeModuleMixin`" - " instead." + " removed in v1.10.0. This class is internal but you can copy over its implementation." 
) super().__init__() diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py index 5ee5b36d8b9f6..a47a06a538c98 100644 --- a/src/pytorch_lightning/core/module.py +++ b/src/pytorch_lightning/core/module.py @@ -34,7 +34,7 @@ import pytorch_lightning as pl from lightning_lite.utilities.apply_func import apply_to_collection, convert_to_tensors from lightning_lite.utilities.cloud_io import get_filesystem -from lightning_lite.utilities.device_dtype_mixin import DeviceDtypeModuleMixin +from lightning_lite.utilities.device_dtype_mixin import _DeviceDtypeModuleMixin from pytorch_lightning.callbacks.callback import Callback from pytorch_lightning.core.hooks import CheckpointHooks, DataHooks, ModelHooks from pytorch_lightning.core.mixins import HyperparametersMixin @@ -64,7 +64,7 @@ class LightningModule( - DeviceDtypeModuleMixin, + _DeviceDtypeModuleMixin, HyperparametersMixin, ModelIO, ModelHooks, @@ -89,7 +89,7 @@ class LightningModule( "use_amp", "trainer", ] - + DeviceDtypeModuleMixin.__jit_unused_properties__ + + _DeviceDtypeModuleMixin.__jit_unused_properties__ + HyperparametersMixin.__jit_unused_properties__ ) diff --git a/src/pytorch_lightning/lite/wrappers.py b/src/pytorch_lightning/lite/wrappers.py index 29a2cffe931a0..477534cedf90a 100644 --- a/src/pytorch_lightning/lite/wrappers.py +++ b/src/pytorch_lightning/lite/wrappers.py @@ -20,7 +20,7 @@ from torch.utils.data import DataLoader from lightning_lite.utilities.apply_func import apply_to_collection, move_data_to_device -from lightning_lite.utilities.device_dtype_mixin import DeviceDtypeModuleMixin +from lightning_lite.utilities.device_dtype_mixin import _DeviceDtypeModuleMixin from pytorch_lightning.plugins import PrecisionPlugin from pytorch_lightning.strategies import Strategy @@ -64,7 +64,7 @@ def step(self, closure: Optional[Callable] = None) -> Any: ) -class _LiteModule(DeviceDtypeModuleMixin): +class _LiteModule(_DeviceDtypeModuleMixin): def __init__( self, forward_module: nn.Module, precision_plugin: PrecisionPlugin, original_module: Optional[nn.Module] = None ) -> None: diff --git a/src/pytorch_lightning/overrides/base.py b/src/pytorch_lightning/overrides/base.py index c169431a2d2fd..bd2a904de686f 100644 --- a/src/pytorch_lightning/overrides/base.py +++ b/src/pytorch_lightning/overrides/base.py @@ -19,11 +19,11 @@ from torch.nn.parallel import DistributedDataParallel import pytorch_lightning as pl -from lightning_lite.utilities.device_dtype_mixin import DeviceDtypeModuleMixin +from lightning_lite.utilities.device_dtype_mixin import _DeviceDtypeModuleMixin from pytorch_lightning.utilities import rank_zero_deprecation -class _LightningPrecisionModuleWrapperBase(DeviceDtypeModuleMixin, torch.nn.Module): +class _LightningPrecisionModuleWrapperBase(_DeviceDtypeModuleMixin, torch.nn.Module): def __init__(self, pl_module: "pl.LightningModule") -> None: """Wraps the user's LightningModule. Requires overriding all ``*_step`` methods and ``forward`` so that it can safely be wrapped by a ``_LightningModuleWrapperBase`` and a ``*DataParallel``. 
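For context, the behaviour this now-internal mixin provides is unchanged: it keeps ``device``/``dtype`` attributes in sync with ``.to()``/``.cuda()``/``.half()`` calls and propagates them to nested mixin instances. A small sketch, shown only as illustration since user code is expected to copy the implementation rather than import the protected class:

.. code-block:: python

    import torch

    from lightning_lite.utilities.device_dtype_mixin import _DeviceDtypeModuleMixin

    class ExampleModule(_DeviceDtypeModuleMixin):
        def __init__(self) -> None:
            super().__init__()
            self.register_buffer("weight", torch.zeros(2, 2))

    module = ExampleModule().to(torch.double)
    print(module.device, module.dtype)  # cpu torch.float64, tracked by the mixin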
@@ -54,7 +54,7 @@ def forward(self, *args: Any, **kwargs: Any) -> Any: raise NotImplementedError -class _LightningModuleWrapperBase(DeviceDtypeModuleMixin, torch.nn.Module): +class _LightningModuleWrapperBase(_DeviceDtypeModuleMixin, torch.nn.Module): def __init__( self, forward_module: Optional[Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]] ) -> None: diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py index 1909ade8a9cd9..9408ce826e140 100644 --- a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -21,7 +21,7 @@ from typing_extensions import TypedDict from lightning_lite.utilities.apply_func import apply_to_collection, apply_to_collections, move_data_to_device -from lightning_lite.utilities.device_dtype_mixin import DeviceDtypeModuleMixin +from lightning_lite.utilities.device_dtype_mixin import _DeviceDtypeModuleMixin from pytorch_lightning.utilities.data import extract_batch_size from pytorch_lightning.utilities.distributed import distributed_available from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -200,7 +200,7 @@ def _reconstruct(cls, state: dict, sync_fn: Optional[Callable] = None) -> "_Meta return meta -class _ResultMetric(Metric, DeviceDtypeModuleMixin): +class _ResultMetric(Metric, _DeviceDtypeModuleMixin): """Wraps the value provided to `:meth:`~pytorch_lightning.core.module.LightningModule.log`""" def __init__(self, metadata: _Metadata, is_tensor: bool) -> None: diff --git a/tests/tests_lite/utilities/test_device_dtype_mixin.py b/tests/tests_lite/utilities/test_device_dtype_mixin.py index 28e7021a9eac4..35caf08148f94 100644 --- a/tests/tests_lite/utilities/test_device_dtype_mixin.py +++ b/tests/tests_lite/utilities/test_device_dtype_mixin.py @@ -3,10 +3,10 @@ from tests_lite.helpers.runif import RunIf from torch import nn as nn -from lightning_lite.utilities.device_dtype_mixin import DeviceDtypeModuleMixin +from lightning_lite.utilities.device_dtype_mixin import _DeviceDtypeModuleMixin -class SubSubModule(DeviceDtypeModuleMixin): +class SubSubModule(_DeviceDtypeModuleMixin): pass @@ -16,7 +16,7 @@ def __init__(self): self.module = SubSubModule() -class TopModule(DeviceDtypeModuleMixin): +class TopModule(_DeviceDtypeModuleMixin): def __init__(self) -> None: super().__init__() self.module = SubModule() @@ -76,7 +76,7 @@ def test_cuda_device(device): def test_cuda_current_device(): """Test that calling .cuda() moves the model to the correct device and respects current cuda device setting.""" - class CudaModule(DeviceDtypeModuleMixin): + class CudaModule(_DeviceDtypeModuleMixin): def __init__(self): super().__init__() self.layer = nn.Linear(1, 1) diff --git a/tests/tests_pytorch/lite/test_wrappers.py b/tests/tests_pytorch/lite/test_wrappers.py index 03799babf94c6..957c321dd1b40 100644 --- a/tests/tests_pytorch/lite/test_wrappers.py +++ b/tests/tests_pytorch/lite/test_wrappers.py @@ -17,7 +17,7 @@ import torch from torch.utils.data.dataloader import DataLoader -from lightning_lite.utilities.device_dtype_mixin import DeviceDtypeModuleMixin +from lightning_lite.utilities.device_dtype_mixin import _DeviceDtypeModuleMixin from pytorch_lightning.lite import LightningLite from pytorch_lightning.lite.wrappers import _LiteDataLoader, _LiteModule, _LiteOptimizer from tests_pytorch.helpers.runif import RunIf @@ -136,7 +136,7 @@ def 
test_lite_module_device_dtype_propagation(device_str, dtype): device = torch.device(device_str) - class DeviceModule(DeviceDtypeModuleMixin): + class DeviceModule(_DeviceDtypeModuleMixin): pass device_module = DeviceModule() diff --git a/tests/tests_pytorch/utilities/test_dtype_device_mixin.py b/tests/tests_pytorch/utilities/test_dtype_device_mixin.py index 68aad3257beb5..2afc42c8878bc 100644 --- a/tests/tests_pytorch/utilities/test_dtype_device_mixin.py +++ b/tests/tests_pytorch/utilities/test_dtype_device_mixin.py @@ -13,13 +13,13 @@ # limitations under the License. import torch.nn as nn -from lightning_lite.utilities.device_dtype_mixin import DeviceDtypeModuleMixin +from lightning_lite.utilities.device_dtype_mixin import _DeviceDtypeModuleMixin from pytorch_lightning import Callback, Trainer from pytorch_lightning.demos.boring_classes import BoringModel from tests_pytorch.helpers.runif import RunIf -class SubSubModule(DeviceDtypeModuleMixin): +class SubSubModule(_DeviceDtypeModuleMixin): pass From 44216fdd691bbf219a9afe4eec5bfd2d568b222f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Tue, 6 Sep 2022 14:56:20 +0200 Subject: [PATCH 067/193] Integrate `lightning_utilities.core.imports` (#14475) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- docs/source-lit/conf.py | 2 - docs/source-pytorch/conf.py | 3 +- requirements/app/base.txt | 1 + requirements/lite/base.txt | 1 + requirements/pytorch/base.txt | 1 + src/lightning_app/__init__.py | 5 +- src/lightning_app/utilities/imports.py | 90 +++-------- src/pytorch_lightning/CHANGELOG.md | 3 + .../callbacks/progress/rich_progress.py | 6 +- src/pytorch_lightning/cli.py | 4 +- src/pytorch_lightning/loggers/comet.py | 4 +- src/pytorch_lightning/loggers/mlflow.py | 5 +- src/pytorch_lightning/loggers/neptune.py | 6 +- src/pytorch_lightning/loggers/wandb.py | 8 +- src/pytorch_lightning/overrides/fairscale.py | 5 +- .../plugins/precision/deepspeed.py | 5 +- .../serve/servable_module_validator.py | 6 +- src/pytorch_lightning/strategies/bagua.py | 4 +- src/pytorch_lightning/strategies/deepspeed.py | 4 +- .../strategies/launchers/subprocess_script.py | 4 +- src/pytorch_lightning/trainer/trainer.py | 5 +- src/pytorch_lightning/tuner/lr_finder.py | 4 +- src/pytorch_lightning/utilities/__init__.py | 1 - src/pytorch_lightning/utilities/imports.py | 140 +++--------------- src/pytorch_lightning/utilities/meta.py | 4 +- .../model_summary/model_summary_deepspeed.py | 4 +- tests/tests_app/utilities/test_imports.py | 11 +- tests/tests_pytorch/helpers/datamodules.py | 13 +- .../tests_pytorch/loggers/test_tensorboard.py | 17 +-- tests/tests_pytorch/models/test_gpu.py | 3 - tests/tests_pytorch/models/test_hparams.py | 4 +- tests/tests_pytorch/utilities/test_imports.py | 44 +----- .../utilities/test_torchdistx.py | 4 +- 33 files changed, 103 insertions(+), 318 deletions(-) diff --git a/docs/source-lit/conf.py b/docs/source-lit/conf.py index d1d48b28163e7..5a7cdd25b59dc 100644 --- a/docs/source-lit/conf.py +++ b/docs/source-lit/conf.py @@ -410,8 +410,6 @@ def find_source(): _TPU_AVAILABLE, _TORCHVISION_AVAILABLE, _TORCH_GREATER_EQUAL_1_10, - _module_available, ) -_JSONARGPARSE_AVAILABLE = _module_available("jsonargparse") """ coverage_skip_undoc_in_source = True diff --git a/docs/source-pytorch/conf.py b/docs/source-pytorch/conf.py index fefe6df85104a..c732a7c181acd 100644 --- a/docs/source-pytorch/conf.py +++ b/docs/source-pytorch/conf.py @@ -391,14 +391,13 @@ def 
package_list_from_file(file): from torch.utils.data import IterableDataset, DataLoader, Dataset from pytorch_lightning import LightningDataModule, LightningModule, Trainer, seed_everything from pytorch_lightning.callbacks import Callback +from pytorch_lightning.cli import _JSONARGPARSE_SIGNATURES_AVAILABLE as _JSONARGPARSE_AVAILABLE from pytorch_lightning.utilities import ( _APEX_AVAILABLE, _XLA_AVAILABLE, _TPU_AVAILABLE, _TORCHVISION_AVAILABLE, _TORCH_GREATER_EQUAL_1_10, - _module_available, ) -_JSONARGPARSE_AVAILABLE = _module_available("jsonargparse") """ coverage_skip_undoc_in_source = True diff --git a/requirements/app/base.txt b/requirements/app/base.txt index 50a6d6c1d6e24..928f720139df6 100644 --- a/requirements/app/base.txt +++ b/requirements/app/base.txt @@ -7,3 +7,4 @@ s3fs>=2022.5.0, <=2022.7.1 croniter>=1.3.0, <1.4.0 # strict; TODO: for now until we find something more robust. traitlets<5.2.0 # Traitlets 5.2.X fails: https://github.com/ipython/traitlets/issues/741 arrow>=1.2.0, <=1.2.2 +lightning-utilities>=0.2.0 diff --git a/requirements/lite/base.txt b/requirements/lite/base.txt index 89061745d428d..333f8791748a7 100644 --- a/requirements/lite/base.txt +++ b/requirements/lite/base.txt @@ -5,3 +5,4 @@ torch>=1.9.*, <1.13.0 fsspec[http]>=2021.05.0, !=2021.06.0, <2022.6.0 packaging>=17.0, <=21.3 typing-extensions>=4.0.0, <4.3.1 +lightning-utilities>=0.2.0 diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index 698d0ec4ad858..f5192f24be1c9 100644 --- a/requirements/pytorch/base.txt +++ b/requirements/pytorch/base.txt @@ -10,3 +10,4 @@ tensorboard>=2.9.1, <2.11.0 torchmetrics>=0.7.0, <0.9.3 # needed for using fixed compare_version packaging>=17.0, <=21.3 typing-extensions>=4.0.0, <4.3.1 +lightning-utilities>=0.2.0 diff --git a/src/lightning_app/__init__.py b/src/lightning_app/__init__.py index 07926c203662b..e64e2269aa39e 100644 --- a/src/lightning_app/__init__.py +++ b/src/lightning_app/__init__.py @@ -2,6 +2,8 @@ import logging import os +from lightning_utilities.core.imports import module_available + _root_logger = logging.getLogger() _logger = logging.getLogger(__name__) _logger.setLevel(logging.INFO) @@ -29,11 +31,10 @@ from lightning_app.core.app import LightningApp # noqa: E402 from lightning_app.core.flow import LightningFlow # noqa: E402 from lightning_app.core.work import LightningWork # noqa: E402 -from lightning_app.utilities.imports import _module_available # noqa: E402 from lightning_app.utilities.packaging.build_config import BuildConfig # noqa: E402 from lightning_app.utilities.packaging.cloud_compute import CloudCompute # noqa: E402 -if _module_available("lightning_app.components.demo"): +if module_available("lightning_app.components.demo"): from lightning_app.components import demo # noqa: F401 _PACKAGE_ROOT = os.path.dirname(__file__) diff --git a/src/lightning_app/utilities/imports.py b/src/lightning_app/utilities/imports.py index 090af1b879340..06f780edb6380 100644 --- a/src/lightning_app/utilities/imports.py +++ b/src/lightning_app/utilities/imports.py @@ -13,45 +13,10 @@ # limitations under the License. """General utilities.""" import functools -import importlib import os -from importlib.util import find_spec from typing import List, Union - -def _package_available(package_name: str) -> bool: - """Check if a package is available in your environment. 
- - >>> _package_available('os') - True - >>> _package_available('bla') - False - """ - try: - return find_spec(package_name) is not None - except ModuleNotFoundError: - return False - - -@functools.lru_cache() -def _module_available(module_path: str) -> bool: - """Check if a module path is available in your environment. - - >>> _module_available('os') - True - >>> _module_available('os.bla') - False - >>> _module_available('bla.bla') - False - """ - module_names = module_path.split(".") - if not _package_available(module_names[0]): - return False - try: - importlib.import_module(module_path) - except ModuleNotFoundError: - return False - return True +from lightning_utilities.core.imports import module_available def requires(module_paths: Union[str, List]): @@ -62,7 +27,7 @@ def requires(module_paths: Union[str, List]): def decorator(func): @functools.wraps(func) def wrapper(*args, **kwargs): - unavailable_modules = [f"'{module}'" for module in module_paths if not _module_available(module)] + unavailable_modules = [f"'{module}'" for module in module_paths if not module_available(module)] if any(unavailable_modules) and not bool(int(os.getenv("LIGHTING_TESTING", "0"))): raise ModuleNotFoundError( f"Required dependencies not available. Please run: pip install {' '.join(unavailable_modules)}" @@ -75,89 +40,72 @@ def wrapper(*args, **kwargs): # TODO: Automatically detect dependencies -@functools.lru_cache() def _is_redis_available() -> bool: - return _module_available("redis") + return module_available("redis") -@functools.lru_cache() def _is_torch_available() -> bool: - return _module_available("torch") + return module_available("torch") -@functools.lru_cache() def _is_pytorch_lightning_available() -> bool: - return _module_available("pytorch_lightning") + return module_available("pytorch_lightning") -@functools.lru_cache() def _is_torchvision_available() -> bool: - return _module_available("torchvision") + return module_available("torchvision") -@functools.lru_cache() def _is_json_argparse_available() -> bool: - return _module_available("jsonargparse") + return module_available("jsonargparse") -@functools.lru_cache() def _is_streamlit_available() -> bool: - return _module_available("streamlit") + return module_available("streamlit") -@functools.lru_cache() def _is_param_available() -> bool: - return _module_available("param") + return module_available("param") -@functools.lru_cache() def _is_streamlit_tensorboard_available() -> bool: - return _module_available("streamlit_tensorboard") + return module_available("streamlit_tensorboard") -@functools.lru_cache() def _is_starsessions_available() -> bool: - return _module_available("starsessions") + return module_available("starsessions") -@functools.lru_cache() def _is_gradio_available() -> bool: - return _module_available("gradio") + return module_available("gradio") -@functools.lru_cache() def _is_lightning_flash_available() -> bool: - return _module_available("flash") + return module_available("flash") -@functools.lru_cache() def _is_pil_available() -> bool: - return _module_available("PIL") + return module_available("PIL") -@functools.lru_cache() def _is_numpy_available() -> bool: - return _module_available("numpy") + return module_available("numpy") -@functools.lru_cache() def _is_docker_available() -> bool: - return _module_available("docker") + return module_available("docker") -@functools.lru_cache() def _is_jinja2_available() -> bool: - return _module_available("jinja2") + return module_available("jinja2") -@functools.lru_cache() def 
_is_playwright_available() -> bool: - return _module_available("playwright") + return module_available("playwright") -@functools.lru_cache() def _is_s3fs_available() -> bool: - return _module_available("s3fs") + return module_available("s3fs") _CLOUD_TEST_RUN = bool(os.getenv("CLOUD", False)) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 0bcef15dbcb30..ef0a8b051e094 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -24,6 +24,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added support for auto wrapping for `DDPFullyShardedStrategy` ([#14383](https://github.com/Lightning-AI/lightning/issues/14383)) +- Integrate the `lightning_utilities` package ([#14475](https://github.com/Lightning-AI/lightning/issues/14475)) + + ### Changed diff --git a/src/pytorch_lightning/callbacks/progress/rich_progress.py b/src/pytorch_lightning/callbacks/progress/rich_progress.py index e0d8fca2e753e..cb0ce0c52e72d 100644 --- a/src/pytorch_lightning/callbacks/progress/rich_progress.py +++ b/src/pytorch_lightning/callbacks/progress/rich_progress.py @@ -12,18 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. import math -import operator from dataclasses import dataclass from datetime import timedelta from typing import Any, Dict, Optional, Union -from torchmetrics.utilities.imports import _compare_version +from lightning_utilities.core.imports import RequirementCache import pytorch_lightning as pl from pytorch_lightning.callbacks.progress.base import ProgressBarBase -from pytorch_lightning.utilities.imports import _package_available -_RICH_AVAILABLE: bool = _package_available("rich") and _compare_version("rich", operator.ge, "10.2.2") +_RICH_AVAILABLE: bool = RequirementCache("rich>=10.2.2") Task, Style = None, None if _RICH_AVAILABLE: diff --git a/src/pytorch_lightning/cli.py b/src/pytorch_lightning/cli.py index 27b9c0487ce6b..ac03263f63613 100644 --- a/src/pytorch_lightning/cli.py +++ b/src/pytorch_lightning/cli.py @@ -17,17 +17,17 @@ from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union import torch +from lightning_utilities.core.imports import RequirementCache from torch.optim import Optimizer import pytorch_lightning as pl from lightning_lite.utilities.cloud_io import get_filesystem from pytorch_lightning import Callback, LightningDataModule, LightningModule, seed_everything, Trainer from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _RequirementAvailable from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.rank_zero import _warn, rank_zero_deprecation, rank_zero_warn -_JSONARGPARSE_SIGNATURES_AVAILABLE = _RequirementAvailable("jsonargparse[signatures]>=4.12.0") +_JSONARGPARSE_SIGNATURES_AVAILABLE = RequirementCache("jsonargparse[signatures]>=4.12.0") if _JSONARGPARSE_SIGNATURES_AVAILABLE: import docstring_parser diff --git a/src/pytorch_lightning/loggers/comet.py b/src/pytorch_lightning/loggers/comet.py index ed4fb2f8f2c1a..dfbb236422839 100644 --- a/src/pytorch_lightning/loggers/comet.py +++ b/src/pytorch_lightning/loggers/comet.py @@ -21,17 +21,17 @@ from argparse import Namespace from typing import Any, Callable, Dict, Mapping, Optional, Sequence, Union +from lightning_utilities.core.imports import module_available from torch import Tensor import pytorch_lightning as pl from 
pytorch_lightning.loggers.logger import Logger, rank_zero_experiment from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _module_available from pytorch_lightning.utilities.logger import _add_prefix, _convert_params, _flatten_dict from pytorch_lightning.utilities.rank_zero import rank_zero_only log = logging.getLogger(__name__) -_COMET_AVAILABLE = _module_available("comet_ml") +_COMET_AVAILABLE = module_available("comet_ml") if _COMET_AVAILABLE: import comet_ml diff --git a/src/pytorch_lightning/loggers/mlflow.py b/src/pytorch_lightning/loggers/mlflow.py index 5675a3bd9fc67..c715595674eac 100644 --- a/src/pytorch_lightning/loggers/mlflow.py +++ b/src/pytorch_lightning/loggers/mlflow.py @@ -22,14 +22,15 @@ from time import time from typing import Any, Dict, Mapping, Optional, Union +from lightning_utilities.core.imports import module_available + from pytorch_lightning.loggers.logger import Logger, rank_zero_experiment -from pytorch_lightning.utilities.imports import _module_available from pytorch_lightning.utilities.logger import _add_prefix, _convert_params, _flatten_dict from pytorch_lightning.utilities.rank_zero import rank_zero_only, rank_zero_warn log = logging.getLogger(__name__) LOCAL_FILE_URI_PREFIX = "file:" -_MLFLOW_AVAILABLE = _module_available("mlflow") +_MLFLOW_AVAILABLE = module_available("mlflow") try: import mlflow from mlflow.tracking import context, MlflowClient diff --git a/src/pytorch_lightning/loggers/neptune.py b/src/pytorch_lightning/loggers/neptune.py index 7c4038cd81abb..0c1ab35cf58ee 100644 --- a/src/pytorch_lightning/loggers/neptune.py +++ b/src/pytorch_lightning/loggers/neptune.py @@ -26,18 +26,18 @@ from typing import Any, Callable, Dict, Generator, List, Mapping, Optional, Sequence, Set, Union from weakref import ReferenceType +from lightning_utilities.core.imports import RequirementCache from torch import Tensor import pytorch_lightning as pl from pytorch_lightning.callbacks import Checkpoint from pytorch_lightning.loggers.logger import Logger, rank_zero_experiment -from pytorch_lightning.utilities.imports import _RequirementAvailable from pytorch_lightning.utilities.logger import _add_prefix, _convert_params, _sanitize_callable_params from pytorch_lightning.utilities.model_summary import ModelSummary from pytorch_lightning.utilities.rank_zero import rank_zero_only -_NEPTUNE_AVAILABLE = _RequirementAvailable("neptune-client") -_NEPTUNE_GREATER_EQUAL_0_9 = _RequirementAvailable("neptune-client>=0.9.0") +_NEPTUNE_AVAILABLE = RequirementCache("neptune-client") +_NEPTUNE_GREATER_EQUAL_0_9 = RequirementCache("neptune-client>=0.9.0") if _NEPTUNE_AVAILABLE and _NEPTUNE_GREATER_EQUAL_0_9: diff --git a/src/pytorch_lightning/loggers/wandb.py b/src/pytorch_lightning/loggers/wandb.py index 3198e46b1a586..396bc49ea0e70 100644 --- a/src/pytorch_lightning/loggers/wandb.py +++ b/src/pytorch_lightning/loggers/wandb.py @@ -22,11 +22,11 @@ from weakref import ReferenceType import torch.nn as nn +from lightning_utilities.core.imports import RequirementCache from pytorch_lightning.callbacks import Checkpoint from pytorch_lightning.loggers.logger import Logger, rank_zero_experiment from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _RequirementAvailable from pytorch_lightning.utilities.logger import _add_prefix, _convert_params, _flatten_dict, _sanitize_callable_params from pytorch_lightning.utilities.rank_zero import rank_zero_only, 
rank_zero_warn @@ -38,9 +38,9 @@ # needed for test mocks, these tests shall be updated wandb, Run, RunDisabled = None, None, None # type: ignore -_WANDB_AVAILABLE = _RequirementAvailable("wandb") -_WANDB_GREATER_EQUAL_0_10_22 = _RequirementAvailable("wandb>=0.10.22") -_WANDB_GREATER_EQUAL_0_12_10 = _RequirementAvailable("wandb>=0.12.10") +_WANDB_AVAILABLE = RequirementCache("wandb") +_WANDB_GREATER_EQUAL_0_10_22 = RequirementCache("wandb>=0.10.22") +_WANDB_GREATER_EQUAL_0_12_10 = RequirementCache("wandb>=0.12.10") class WandbLogger(Logger): diff --git a/src/pytorch_lightning/overrides/fairscale.py b/src/pytorch_lightning/overrides/fairscale.py index d9fd2e60aff61..572efd277d316 100644 --- a/src/pytorch_lightning/overrides/fairscale.py +++ b/src/pytorch_lightning/overrides/fairscale.py @@ -14,6 +14,7 @@ from typing import Optional, Union import torch.nn as nn +from lightning_utilities.core.imports import module_available import pytorch_lightning as pl from pytorch_lightning.overrides.base import ( @@ -22,9 +23,9 @@ unwrap_lightning_module, ) from pytorch_lightning.utilities import rank_zero_deprecation -from pytorch_lightning.utilities.imports import _IS_WINDOWS, _module_available +from pytorch_lightning.utilities.imports import _IS_WINDOWS -_FAIRSCALE_AVAILABLE = not _IS_WINDOWS and _module_available("fairscale.nn") +_FAIRSCALE_AVAILABLE = not _IS_WINDOWS and module_available("fairscale.nn") if _FAIRSCALE_AVAILABLE: # pragma: no-cover diff --git a/src/pytorch_lightning/plugins/precision/deepspeed.py b/src/pytorch_lightning/plugins/precision/deepspeed.py index 456bba1e77823..92ed82774e55e 100644 --- a/src/pytorch_lightning/plugins/precision/deepspeed.py +++ b/src/pytorch_lightning/plugins/precision/deepspeed.py @@ -13,6 +13,7 @@ # limitations under the License. 
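The idiom these modules now share comes from ``lightning_utilities.core.imports``: ``module_available`` is a plain importability check, while ``RequirementCache`` evaluates a full requirement string and can explain a failure. A short sketch (the package names are only examples):

.. code-block:: python

    from lightning_utilities.core.imports import RequirementCache, module_available

    _COMET_AVAILABLE = module_available("comet_ml")  # True if the module can be imported
    _WANDB_GREATER_EQUAL_0_10_22 = RequirementCache("wandb>=0.10.22")

    if not _WANDB_GREATER_EQUAL_0_10_22:
        # str() of the cache describes which requirement was not met
        raise ModuleNotFoundError(str(_WANDB_GREATER_EQUAL_0_10_22))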
from typing import Any, Callable, Optional, TYPE_CHECKING, Union +from lightning_utilities.core.imports import RequirementCache from torch import Tensor from torch.nn import Module from torch.optim import LBFGS, Optimizer @@ -22,11 +23,11 @@ from pytorch_lightning.utilities import GradClipAlgorithmType from pytorch_lightning.utilities.enums import AMPType, PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _APEX_AVAILABLE, _RequirementAvailable +from pytorch_lightning.utilities.imports import _APEX_AVAILABLE from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.warnings import WarningCache -_DEEPSPEED_AVAILABLE = _RequirementAvailable("deepspeed") +_DEEPSPEED_AVAILABLE = RequirementCache("deepspeed") if TYPE_CHECKING and _DEEPSPEED_AVAILABLE: import deepspeed diff --git a/src/pytorch_lightning/serve/servable_module_validator.py b/src/pytorch_lightning/serve/servable_module_validator.py index 305e520e422b3..c3aed93daa570 100644 --- a/src/pytorch_lightning/serve/servable_module_validator.py +++ b/src/pytorch_lightning/serve/servable_module_validator.py @@ -5,6 +5,7 @@ import requests import torch +from lightning_utilities.core.imports import RequirementCache from typing_extensions import Literal import pytorch_lightning as pl @@ -12,7 +13,6 @@ from pytorch_lightning.serve.servable_module import ServableModule from pytorch_lightning.strategies import DDPFullyShardedNativeStrategy, DDPFullyShardedStrategy, DeepSpeedStrategy from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _RequirementAvailable from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.rank_zero import rank_zero_only @@ -51,10 +51,10 @@ def __init__( exit_on_failure: bool = True, ): super().__init__() - fastapi_installed = _RequirementAvailable("fastapi") + fastapi_installed = RequirementCache("fastapi") if not fastapi_installed: raise ModuleNotFoundError(fastapi_installed.message) - uvicorn_installed = _RequirementAvailable("uvicorn") + uvicorn_installed = RequirementCache("uvicorn") if not uvicorn_installed: raise ModuleNotFoundError(uvicorn_installed.message) diff --git a/src/pytorch_lightning/strategies/bagua.py b/src/pytorch_lightning/strategies/bagua.py index f08d1aebf1b7c..fd5f7b13195e6 100644 --- a/src/pytorch_lightning/strategies/bagua.py +++ b/src/pytorch_lightning/strategies/bagua.py @@ -3,6 +3,7 @@ from typing import Any, Dict, List, Optional, Union import torch +from lightning_utilities.core.imports import package_available from torch import Tensor from torch.nn import Module @@ -16,11 +17,10 @@ from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.distributed import ReduceOp from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _package_available from pytorch_lightning.utilities.optimizer import optimizers_to_device from pytorch_lightning.utilities.seed import reset_seed -_BAGUA_AVAILABLE = _package_available("bagua") +_BAGUA_AVAILABLE = package_available("bagua") if _BAGUA_AVAILABLE: import bagua.torch_api as bagua diff --git a/src/pytorch_lightning/strategies/deepspeed.py b/src/pytorch_lightning/strategies/deepspeed.py index 2fb4deeb76dd9..a24da43445635 100644 --- a/src/pytorch_lightning/strategies/deepspeed.py +++ 
b/src/pytorch_lightning/strategies/deepspeed.py @@ -22,6 +22,7 @@ from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple, Union import torch +from lightning_utilities.core.imports import RequirementCache from torch import Tensor from torch.nn import Module from torch.optim import Optimizer @@ -44,7 +45,6 @@ ) from pytorch_lightning.utilities.enums import AMPType, PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _RequirementAvailable from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.optimizer import optimizers_to_device from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_info, rank_zero_warn @@ -54,7 +54,7 @@ warning_cache = WarningCache() -_DEEPSPEED_AVAILABLE = _RequirementAvailable("deepspeed") +_DEEPSPEED_AVAILABLE = RequirementCache("deepspeed") if _DEEPSPEED_AVAILABLE: import deepspeed diff --git a/src/pytorch_lightning/strategies/launchers/subprocess_script.py b/src/pytorch_lightning/strategies/launchers/subprocess_script.py index a51a109917f6e..fd28fd3dcb20c 100644 --- a/src/pytorch_lightning/strategies/launchers/subprocess_script.py +++ b/src/pytorch_lightning/strategies/launchers/subprocess_script.py @@ -19,13 +19,13 @@ import __main__ import numpy as np +from lightning_utilities.core.imports import RequirementCache import pytorch_lightning as pl from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.strategies.launchers.base import _Launcher -from pytorch_lightning.utilities.imports import _RequirementAvailable -_HYDRA_AVAILABLE = _RequirementAvailable("hydra") +_HYDRA_AVAILABLE = RequirementCache("hydra") class _SubprocessScriptLauncher(_Launcher): diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index 494d56f6c7a1f..ebc2f62024664 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -30,6 +30,7 @@ import torch import torch.distributed as dist +from lightning_utilities.core.imports import module_available from packaging.version import Version from torch import Tensor from torch.optim import Optimizer @@ -104,7 +105,7 @@ from pytorch_lightning.utilities.data import _auto_add_worker_init_fn, has_len_all_ranks from pytorch_lightning.utilities.distributed import distributed_available from pytorch_lightning.utilities.exceptions import ExitGracefullyException, MisconfigurationException -from pytorch_lightning.utilities.imports import _fault_tolerant_training, _module_available +from pytorch_lightning.utilities.imports import _fault_tolerant_training from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_info, rank_zero_warn from pytorch_lightning.utilities.seed import isolate_rng @@ -1463,7 +1464,7 @@ def _call_setup_hook(self) -> None: def _call_configure_sharded_model(self) -> None: with self.strategy.model_sharded_context(): # experimental support for torchdistx - if _module_available("torchdistx.deferred_init"): + if module_available("torchdistx.deferred_init"): from torchdistx.deferred_init import materialize_module materialize_module(self.lightning_module) diff --git a/src/pytorch_lightning/tuner/lr_finder.py b/src/pytorch_lightning/tuner/lr_finder.py index 45286612231c2..6b6d5771a4751 100644 --- a/src/pytorch_lightning/tuner/lr_finder.py +++ 
b/src/pytorch_lightning/tuner/lr_finder.py @@ -20,6 +20,7 @@ import numpy as np import torch +from lightning_utilities.core.imports import RequirementCache from torch.optim.lr_scheduler import _LRScheduler import pytorch_lightning as pl @@ -27,7 +28,6 @@ from pytorch_lightning.core.optimizer import _init_optimizers_and_lr_schedulers, _set_scheduler_opt_idx from pytorch_lightning.loggers.logger import DummyLogger from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _RequirementAvailable from pytorch_lightning.utilities.parsing import lightning_hasattr, lightning_setattr from pytorch_lightning.utilities.rank_zero import rank_zero_warn from pytorch_lightning.utilities.types import LRSchedulerConfig, STEP_OUTPUT @@ -39,7 +39,7 @@ else: from tqdm import tqdm -_MATPLOTLIB_AVAILABLE = _RequirementAvailable("matplotlib") +_MATPLOTLIB_AVAILABLE = RequirementCache("matplotlib") if _MATPLOTLIB_AVAILABLE and TYPE_CHECKING: import matplotlib.pyplot as plt diff --git a/src/pytorch_lightning/utilities/__init__.py b/src/pytorch_lightning/utilities/__init__.py index 127794f35fdba..a0baa3a85f7b0 100644 --- a/src/pytorch_lightning/utilities/__init__.py +++ b/src/pytorch_lightning/utilities/__init__.py @@ -33,7 +33,6 @@ _IPU_AVAILABLE, _IS_INTERACTIVE, _IS_WINDOWS, - _module_available, _OMEGACONF_AVAILABLE, _POPTORCH_AVAILABLE, _TORCH_GREATER_EQUAL_1_10, diff --git a/src/pytorch_lightning/utilities/imports.py b/src/pytorch_lightning/utilities/imports.py index fafc693228b0b..cbbfcc21ddaf4 100644 --- a/src/pytorch_lightning/utilities/imports.py +++ b/src/pytorch_lightning/utilities/imports.py @@ -12,139 +12,37 @@ # See the License for the specific language governing permissions and # limitations under the License. """General utilities.""" -import importlib import operator import platform import sys -from importlib.util import find_spec -from typing import Callable -import pkg_resources import torch -from packaging.version import Version -from pkg_resources import DistributionNotFound - - -def _package_available(package_name: str) -> bool: - """Check if a package is available in your environment. - - >>> _package_available('os') - True - >>> _package_available('bla') - False - """ - try: - return find_spec(package_name) is not None - except ModuleNotFoundError: - return False - - -def _module_available(module_path: str) -> bool: - """Check if a module path is available in your environment. - - >>> _module_available('os') - True - >>> _module_available('os.bla') - False - >>> _module_available('bla.bla') - False - """ - module_names = module_path.split(".") - if not _package_available(module_names[0]): - return False - try: - importlib.import_module(module_path) - except ImportError: - return False - return True - - -def _compare_version(package: str, op: Callable, version: str, use_base_version: bool = False) -> bool: - """Compare package version with some requirements. 
- - >>> _compare_version("torch", operator.ge, "0.1") - True - >>> _compare_version("does_not_exist", operator.ge, "0.0") - False - """ - try: - pkg = importlib.import_module(package) - except (ImportError, DistributionNotFound): - return False - try: - if hasattr(pkg, "__version__"): - pkg_version = Version(pkg.__version__) - else: - # try pkg_resources to infer version - pkg_version = Version(pkg_resources.get_distribution(package).version) - except TypeError: - # this is mocked by Sphinx, so it should return True to generate all summaries - return True - if use_base_version: - pkg_version = Version(pkg_version.base_version) - return op(pkg_version, Version(version)) - - -class _RequirementAvailable: - """Boolean-like class for check of requirement with extras and version specifiers. - - >>> _RequirementAvailable("torch>=0.1") - Requirement 'torch>=0.1' met - >>> bool(_RequirementAvailable("torch>=0.1")) - True - >>> bool(_RequirementAvailable("torch>100.0")) - False - """ - - def __init__(self, requirement: str) -> None: - self.requirement = requirement - - def _check_requirement(self) -> None: - if not hasattr(self, "available"): - try: - pkg_resources.require(self.requirement) - self.available = True - self.message = f"Requirement {self.requirement!r} met" - except Exception as ex: - self.available = False - self.message = f"Requirement {self.requirement!r} not met, {ex.__class__.__name__}: {ex}" - - def __bool__(self) -> bool: - self._check_requirement() - return self.available - - def __str__(self) -> str: - self._check_requirement() - return self.message - - def __repr__(self) -> str: - return self.__str__() - +from lightning_utilities.core.imports import compare_version, module_available, package_available _IS_WINDOWS = platform.system() == "Windows" _IS_INTERACTIVE = hasattr(sys, "ps1") # https://stackoverflow.com/a/64523765 _PYTHON_GREATER_EQUAL_3_8_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 8) _PYTHON_GREATER_EQUAL_3_10_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 10) -_TORCH_GREATER_EQUAL_1_9_1 = _compare_version("torch", operator.ge, "1.9.1") -_TORCH_GREATER_EQUAL_1_10 = _compare_version("torch", operator.ge, "1.10.0") -_TORCH_LESSER_EQUAL_1_10_2 = _compare_version("torch", operator.le, "1.10.2") -_TORCH_GREATER_EQUAL_1_11 = _compare_version("torch", operator.ge, "1.11.0") -_TORCH_GREATER_EQUAL_1_12 = _compare_version("torch", operator.ge, "1.12.0") -_TORCH_GREATER_EQUAL_1_13 = _compare_version("torch", operator.ge, "1.13.0", use_base_version=True) - -_APEX_AVAILABLE = _module_available("apex.amp") -_DALI_AVAILABLE = _module_available("nvidia.dali") -_HABANA_FRAMEWORK_AVAILABLE = _package_available("habana_frameworks") -_HIVEMIND_AVAILABLE = _package_available("hivemind") -_HOROVOD_AVAILABLE = _module_available("horovod.torch") +_TORCH_GREATER_EQUAL_1_9_1 = compare_version("torch", operator.ge, "1.9.1") +_TORCH_GREATER_EQUAL_1_10 = compare_version("torch", operator.ge, "1.10.0") +_TORCH_LESSER_EQUAL_1_10_2 = compare_version("torch", operator.le, "1.10.2") +_TORCH_GREATER_EQUAL_1_11 = compare_version("torch", operator.ge, "1.11.0") +_TORCH_GREATER_EQUAL_1_12 = compare_version("torch", operator.ge, "1.12.0") +_TORCH_GREATER_EQUAL_1_13 = compare_version("torch", operator.ge, "1.13.0", use_base_version=True) + +_APEX_AVAILABLE = module_available("apex.amp") +_DALI_AVAILABLE = module_available("nvidia.dali") +_HABANA_FRAMEWORK_AVAILABLE = package_available("habana_frameworks") +_HIVEMIND_AVAILABLE = package_available("hivemind") +_HOROVOD_AVAILABLE 
= module_available("horovod.torch") _KINETO_AVAILABLE = torch.profiler.kineto_available() -_OMEGACONF_AVAILABLE = _package_available("omegaconf") -_POPTORCH_AVAILABLE = _package_available("poptorch") -_PSUTIL_AVAILABLE = _package_available("psutil") -_RICH_AVAILABLE = _package_available("rich") and _compare_version("rich", operator.ge, "10.2.2") +_OMEGACONF_AVAILABLE = package_available("omegaconf") +_POPTORCH_AVAILABLE = package_available("poptorch") +_PSUTIL_AVAILABLE = package_available("psutil") +_RICH_AVAILABLE = package_available("rich") and compare_version("rich", operator.ge, "10.2.2") _TORCH_QUANTIZE_AVAILABLE = bool([eg for eg in torch.backends.quantized.supported_engines if eg != "none"]) -_TORCHVISION_AVAILABLE = _package_available("torchvision") -_XLA_AVAILABLE: bool = _package_available("torch_xla") +_TORCHVISION_AVAILABLE = package_available("torchvision") +_XLA_AVAILABLE: bool = package_available("torch_xla") from lightning_lite.utilities.xla_device import XLADeviceUtils # noqa: E402 diff --git a/src/pytorch_lightning/utilities/meta.py b/src/pytorch_lightning/utilities/meta.py index 9f4cd72bfe65d..c028e8b446951 100644 --- a/src/pytorch_lightning/utilities/meta.py +++ b/src/pytorch_lightning/utilities/meta.py @@ -14,11 +14,11 @@ from contextlib import contextmanager from typing import Any, Callable, Generator, Mapping, Optional, Set, Type, Union +from lightning_utilities.core.imports import module_available from torch import Tensor from torch.nn import Module, Parameter from pytorch_lightning.utilities import rank_zero_deprecation -from pytorch_lightning.utilities.imports import _module_available def is_meta_init() -> bool: @@ -107,7 +107,7 @@ def is_on_meta_device(module: Module) -> bool: def _is_deferred(module: Optional[Module]) -> bool: - if module is None or not _module_available("torchdistx.fake"): + if module is None or not module_available("torchdistx.fake"): return False from torchdistx.fake import is_fake diff --git a/src/pytorch_lightning/utilities/model_summary/model_summary_deepspeed.py b/src/pytorch_lightning/utilities/model_summary/model_summary_deepspeed.py index 5fc189ccf99b2..d5d14d6bd77de 100644 --- a/src/pytorch_lightning/utilities/model_summary/model_summary_deepspeed.py +++ b/src/pytorch_lightning/utilities/model_summary/model_summary_deepspeed.py @@ -17,9 +17,9 @@ from typing import Dict, List, Tuple import torch +from lightning_utilities.core.imports import RequirementCache from torch.nn import Parameter -from pytorch_lightning.utilities.imports import _RequirementAvailable from pytorch_lightning.utilities.model_summary.model_summary import ( _is_lazy_weight_tensor, get_human_readable_count, @@ -44,7 +44,7 @@ def average_shard_parameters(self) -> int: """Returns the number of parameters in this module.""" def partitioned_size(p: Parameter) -> int: - return p.partitioned_size() if _RequirementAvailable("deepspeed<0.6.6") else p.partition_numel() + return p.partitioned_size() if RequirementCache("deepspeed<0.6.6") else p.partition_numel() return sum(partitioned_size(p) if not _is_lazy_weight_tensor(p) else 0 for p in self._module.parameters()) diff --git a/tests/tests_app/utilities/test_imports.py b/tests/tests_app/utilities/test_imports.py index 265181bab1f2a..00a24d41a09f0 100644 --- a/tests/tests_app/utilities/test_imports.py +++ b/tests/tests_app/utilities/test_imports.py @@ -3,16 +3,7 @@ import pytest -from lightning_app.utilities.imports import _module_available, requires - - -def test_module_available(): - """Test if the 3rd party libs are 
available.""" - assert _module_available("deepdiff") - assert _module_available("deepdiff.deephash") - assert not _module_available("torch.nn.asdf") - assert not _module_available("asdf") - assert not _module_available("asdf.bla.asdf") +from lightning_app.utilities.imports import requires @mock.patch.dict(os.environ, {"LIGHTING_TESTING": "0"}) diff --git a/tests/tests_pytorch/helpers/datamodules.py b/tests/tests_pytorch/helpers/datamodules.py index 4278422593c00..4984914c275dd 100644 --- a/tests/tests_pytorch/helpers/datamodules.py +++ b/tests/tests_pytorch/helpers/datamodules.py @@ -14,16 +14,13 @@ import pytest import torch +from lightning_utilities.core.imports import RequirementCache from torch.utils.data import DataLoader from pytorch_lightning.core.datamodule import LightningDataModule -from pytorch_lightning.utilities import _module_available from tests_pytorch.helpers.datasets import MNIST, SklearnDataset, TrialMNIST -_SKLEARN_AVAILABLE = _module_available("sklearn") -if _SKLEARN_AVAILABLE: - from sklearn.datasets import make_classification, make_regression - from sklearn.model_selection import train_test_split +_SKLEARN_AVAILABLE = RequirementCache("sklearn") class MNISTDataModule(LightningDataModule): @@ -66,6 +63,8 @@ def __init__(self, sklearn_dataset, x_type, y_type, batch_size: int = 10): self._y_type = y_type def _split_data(self): + from sklearn.model_selection import train_test_split + self.x_train, self.x_test, self.y_train, self.y_test = train_test_split( self._x, self._y, test_size=0.20, random_state=42 ) @@ -102,6 +101,8 @@ class ClassifDataModule(SklearnDataModule): def __init__(self, num_features=32, length=800, num_classes=3, batch_size=10): if not _SKLEARN_AVAILABLE: pytest.skip("`sklearn` is not available.") + from sklearn.datasets import make_classification + data = make_classification( n_samples=length, n_features=num_features, n_classes=num_classes, n_clusters_per_class=1, random_state=42 ) @@ -112,6 +113,8 @@ class RegressDataModule(SklearnDataModule): def __init__(self, num_features=16, length=800, batch_size=10): if not _SKLEARN_AVAILABLE: pytest.skip("`sklearn` is not available.") + from sklearn.datasets import make_regression + x, y = make_regression(n_samples=length, n_features=num_features, random_state=42) y = [[v] for v in y] super().__init__((x, y), x_type=torch.float32, y_type=torch.float32, batch_size=batch_size) diff --git a/tests/tests_pytorch/loggers/test_tensorboard.py b/tests/tests_pytorch/loggers/test_tensorboard.py index 86070d886296b..3793b5c58b5b6 100644 --- a/tests/tests_pytorch/loggers/test_tensorboard.py +++ b/tests/tests_pytorch/loggers/test_tensorboard.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import logging -import operator import os from argparse import Namespace from unittest import mock @@ -25,19 +24,14 @@ from pytorch_lightning import Trainer from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.loggers import TensorBoardLogger -from pytorch_lightning.utilities.imports import _compare_version, _OMEGACONF_AVAILABLE +from pytorch_lightning.utilities.imports import _OMEGACONF_AVAILABLE from tests_pytorch.helpers.runif import RunIf if _OMEGACONF_AVAILABLE: from omegaconf import OmegaConf -@pytest.mark.skipif( - _compare_version("tensorboard", operator.ge, "2.6.0"), reason="cannot import EventAccumulator in >= 2.6.0" -) def test_tensorboard_hparams_reload(tmpdir): - from tensorboard.backend.event_processing.event_accumulator import EventAccumulator - class CustomModel(BoringModel): def __init__(self, b1=0.5, b2=0.999): super().__init__() @@ -63,15 +57,6 @@ def __init__(self, b1=0.5, b2=0.999): # verify artifacts assert len(os.listdir(os.path.join(folder_path, "checkpoints"))) == 1 - # verify tb logs - event_acc = EventAccumulator(folder_path) - event_acc.Reload() - - hparams_data = b'\x12\x1f"\x06\n\x02b1 \x03"\x06\n\x02b2 \x03*\r\n\x0b\x12\thp_metric' - - assert event_acc.summary_metadata["_hparams_/experiment"].plugin_data.plugin_name == "hparams" - assert event_acc.summary_metadata["_hparams_/experiment"].plugin_data.content == hparams_data - def test_tensorboard_automatic_versioning(tmpdir): """Verify that automatic versioning works.""" diff --git a/tests/tests_pytorch/models/test_gpu.py b/tests/tests_pytorch/models/test_gpu.py index 0a030e74273b1..5eded60d2084f 100644 --- a/tests/tests_pytorch/models/test_gpu.py +++ b/tests/tests_pytorch/models/test_gpu.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import operator import os from collections import namedtuple from unittest import mock @@ -28,12 +27,10 @@ from pytorch_lightning.plugins.environments import TorchElasticEnvironment from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _compare_version from tests_pytorch.helpers.datamodules import ClassifDataModule from tests_pytorch.helpers.runif import RunIf from tests_pytorch.helpers.simple_models import ClassificationModel -PL_VERSION_LT_1_5 = _compare_version("pytorch_lightning", operator.lt, "1.5") PRETEND_N_OF_GPUS = 16 diff --git a/tests/tests_pytorch/models/test_hparams.py b/tests/tests_pytorch/models/test_hparams.py index 628eb28403486..80ef49e87fcf2 100644 --- a/tests/tests_pytorch/models/test_hparams.py +++ b/tests/tests_pytorch/models/test_hparams.py @@ -24,6 +24,7 @@ import pytest import torch from fsspec.implementations.local import LocalFileSystem +from lightning_utilities.core.imports import RequirementCache from torch.utils.data import DataLoader from pytorch_lightning import LightningModule, Trainer @@ -34,7 +35,6 @@ from pytorch_lightning.demos.boring_classes import BoringDataModule, BoringModel, RandomDataset from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE, AttributeDict, is_picklable from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _RequirementAvailable from tests_pytorch.helpers.runif import RunIf from tests_pytorch.helpers.utils import no_warning_call @@ -666,7 +666,7 @@ def test_model_with_fsspec_as_parameter(tmpdir): trainer.test() -@pytest.mark.skipif(_RequirementAvailable("hydra-core<1.1"), reason="Requires Hydra's Compose API") +@pytest.mark.skipif(RequirementCache("hydra-core<1.1"), reason="Requires Hydra's Compose API") def test_model_save_hyper_parameters_interpolation_with_hydra(tmpdir): """This test relies on configuration saved under tests/models/conf/config.yaml.""" from hydra import compose, initialize diff --git a/tests/tests_pytorch/utilities/test_imports.py b/tests/tests_pytorch/utilities/test_imports.py index 25995bb029f3a..05845e0b15172 100644 --- a/tests/tests_pytorch/utilities/test_imports.py +++ b/tests/tests_pytorch/utilities/test_imports.py @@ -11,53 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import operator from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.strategies.bagua import _BAGUA_AVAILABLE from pytorch_lightning.strategies.deepspeed import _DEEPSPEED_AVAILABLE -from pytorch_lightning.utilities import ( - _APEX_AVAILABLE, - _HOROVOD_AVAILABLE, - _module_available, - _OMEGACONF_AVAILABLE, - _POPTORCH_AVAILABLE, -) -from pytorch_lightning.utilities.imports import _compare_version, _RequirementAvailable, torch - - -def test_module_exists(): - """Test if the some 3rd party libs are available.""" - assert _module_available("torch") - assert _module_available("torch.nn.parallel") - assert not _module_available("torch.nn.asdf") - assert not _module_available("asdf") - assert not _module_available("asdf.bla.asdf") - - -def test_compare_version(monkeypatch): - monkeypatch.setattr(torch, "__version__", "1.8.9") - assert not _compare_version("torch", operator.ge, "1.10.0") - assert _compare_version("torch", operator.lt, "1.10.0") - - monkeypatch.setattr(torch, "__version__", "1.10.0.dev123") - assert _compare_version("torch", operator.ge, "1.10.0.dev123") - assert not _compare_version("torch", operator.ge, "1.10.0.dev124") - - assert _compare_version("torch", operator.ge, "1.10.0.dev123", use_base_version=True) - assert _compare_version("torch", operator.ge, "1.10.0.dev124", use_base_version=True) - - monkeypatch.setattr(torch, "__version__", "1.10.0a0+0aef44c") # dev version before rc - assert _compare_version("torch", operator.ge, "1.10.0.rc0", use_base_version=True) - assert not _compare_version("torch", operator.ge, "1.10.0.rc0") - assert _compare_version("torch", operator.ge, "1.10.0", use_base_version=True) - assert not _compare_version("torch", operator.ge, "1.10.0") - - -def test_requirement_avaliable(): - assert _RequirementAvailable(f"torch>={torch.__version__}") - assert not _RequirementAvailable(f"torch<{torch.__version__}") - assert "Requirement '-' not met" in str(_RequirementAvailable("-")) +from pytorch_lightning.utilities import _APEX_AVAILABLE, _HOROVOD_AVAILABLE, _OMEGACONF_AVAILABLE, _POPTORCH_AVAILABLE def test_imports(): diff --git a/tests/tests_pytorch/utilities/test_torchdistx.py b/tests/tests_pytorch/utilities/test_torchdistx.py index aa3f8e34bfaac..92c03dc36031f 100644 --- a/tests/tests_pytorch/utilities/test_torchdistx.py +++ b/tests/tests_pytorch/utilities/test_torchdistx.py @@ -12,16 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
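For readers tracking the migration, the `lightning_utilities.core.imports` helpers adopted throughout this commit behave like the private helpers whose doctests are removed above. A short sketch, assuming the packages referenced are installed locally (results depend on the environment):

import operator

from lightning_utilities.core.imports import RequirementCache, compare_version, module_available, package_available

assert package_available("torch")              # top-level package/distribution is importable
assert module_available("torch.nn.parallel")   # full dotted module path can be imported
assert not module_available("torch.nn.asdf")

# version comparison against the installed package, as the removed _compare_version did
assert compare_version("torch", operator.ge, "1.9.0")

# boolean-like requirement check with a human-readable message, as _RequirementAvailable did
_DEEPSPEED_AVAILABLE = RequirementCache("deepspeed")
if not _DEEPSPEED_AVAILABLE:
    print(str(_DEEPSPEED_AVAILABLE))  # explains which requirement was not met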
import pytest +from lightning_utilities.core.imports import RequirementCache from torch import nn from pytorch_lightning import Trainer from pytorch_lightning.core.module import LightningModule from pytorch_lightning.demos.boring_classes import BoringModel -from pytorch_lightning.utilities.imports import _RequirementAvailable from pytorch_lightning.utilities.meta import _is_deferred from tests_pytorch.helpers.runif import RunIf -_TORCHDISTX_AVAILABLE = _RequirementAvailable("torchdistx") +_TORCHDISTX_AVAILABLE = RequirementCache("torchdistx") class SimpleBoringModel(LightningModule): From 273a9ed8c14650c59b60c546fe53d93a4183d439 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Tue, 6 Sep 2022 15:52:54 +0200 Subject: [PATCH 068/193] Integrate `lightning_utilities.core.apply_func` (#14537) --- requirements/app/base.txt | 2 +- requirements/lite/base.txt | 2 +- requirements/pytorch/base.txt | 2 +- setup.cfg | 3 - src/lightning_app/utilities/apply_func.py | 124 ------- src/lightning_app/utilities/component.py | 2 +- src/lightning_app/utilities/proxies.py | 2 +- src/lightning_lite/utilities/apply_func.py | 227 +----------- src/pytorch_lightning/CHANGELOG.md | 2 +- src/pytorch_lightning/callbacks/pruning.py | 2 +- src/pytorch_lightning/core/module.py | 3 +- src/pytorch_lightning/core/saving.py | 2 +- src/pytorch_lightning/lite/lite.py | 3 +- src/pytorch_lightning/lite/wrappers.py | 3 +- .../loops/dataloader/evaluation_loop.py | 2 +- .../loops/epoch/training_epoch_loop.py | 2 +- .../overrides/data_parallel.py | 2 +- .../plugins/io/xla_plugin.py | 3 +- .../plugins/precision/double.py | 2 +- src/pytorch_lightning/strategies/deepspeed.py | 2 +- src/pytorch_lightning/strategies/dp.py | 2 +- src/pytorch_lightning/strategies/ipu.py | 2 +- .../strategies/launchers/multiprocessing.py | 3 +- src/pytorch_lightning/strategies/tpu_spawn.py | 2 +- .../trainer/connectors/data_connector.py | 2 +- .../logger_connector/logger_connector.py | 3 +- .../connectors/logger_connector/result.py | 3 +- src/pytorch_lightning/trainer/supporters.py | 2 +- src/pytorch_lightning/trainer/trainer.py | 2 +- src/pytorch_lightning/utilities/apply_func.py | 22 +- .../utilities/auto_restart.py | 2 +- src/pytorch_lightning/utilities/data.py | 4 +- src/pytorch_lightning/utilities/fetching.py | 2 +- src/pytorch_lightning/utilities/memory.py | 3 +- src/pytorch_lightning/utilities/metrics.py | 2 +- src/pytorch_lightning/utilities/optimizer.py | 3 +- tests/tests_app/utilities/test_apply_func.py | 264 -------------- tests/tests_lite/utilities/test_apply_func.py | 334 +----------------- tests/tests_pytorch/lite/test_parity.py | 3 +- .../tests_pytorch/trainer/test_supporters.py | 2 +- 40 files changed, 63 insertions(+), 991 deletions(-) delete mode 100644 src/lightning_app/utilities/apply_func.py delete mode 100644 tests/tests_app/utilities/test_apply_func.py diff --git a/requirements/app/base.txt b/requirements/app/base.txt index 928f720139df6..3824e10263aff 100644 --- a/requirements/app/base.txt +++ b/requirements/app/base.txt @@ -7,4 +7,4 @@ s3fs>=2022.5.0, <=2022.7.1 croniter>=1.3.0, <1.4.0 # strict; TODO: for now until we find something more robust. 
traitlets<5.2.0 # Traitlets 5.2.X fails: https://github.com/ipython/traitlets/issues/741 arrow>=1.2.0, <=1.2.2 -lightning-utilities>=0.2.0 +lightning-utilities==0.2.* diff --git a/requirements/lite/base.txt b/requirements/lite/base.txt index 333f8791748a7..0ab00d4df2b77 100644 --- a/requirements/lite/base.txt +++ b/requirements/lite/base.txt @@ -5,4 +5,4 @@ torch>=1.9.*, <1.13.0 fsspec[http]>=2021.05.0, !=2021.06.0, <2022.6.0 packaging>=17.0, <=21.3 typing-extensions>=4.0.0, <4.3.1 -lightning-utilities>=0.2.0 +lightning-utilities==0.2.* diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index f5192f24be1c9..13105d73fe756 100644 --- a/requirements/pytorch/base.txt +++ b/requirements/pytorch/base.txt @@ -10,4 +10,4 @@ tensorboard>=2.9.1, <2.11.0 torchmetrics>=0.7.0, <0.9.3 # needed for using fixed compare_version packaging>=17.0, <=21.3 typing-extensions>=4.0.0, <4.3.1 -lightning-utilities>=0.2.0 +lightning-utilities==0.2.* diff --git a/setup.cfg b/setup.cfg index 1f2e17557310e..6007aec280e4c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -36,9 +36,6 @@ filterwarnings = # error out on our deprecation warnings - ensures the code and tests are kept up-to-date error::pytorch_lightning.utilities.rank_zero.LightningDeprecationWarning error::FutureWarning - # warnings from deprecated modules on import - # TODO: remove in 1.7 - ignore::pytorch_lightning.utilities.rank_zero.LightningDeprecationWarning:pytorch_lightning.core.decorators xfail_strict = true junit_duration_report = call diff --git a/src/lightning_app/utilities/apply_func.py b/src/lightning_app/utilities/apply_func.py deleted file mode 100644 index e8ec148e6c517..0000000000000 --- a/src/lightning_app/utilities/apply_func.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Utilities used for collections.""" - -import dataclasses -from collections import defaultdict, OrderedDict -from copy import deepcopy -from typing import Any, Callable, Mapping, Optional, Sequence, Tuple, Union - -from lightning_app.utilities.exceptions import MisconfigurationException - - -def _is_namedtuple(obj: object) -> bool: - # https://github.com/pytorch/pytorch/blob/v1.8.1/torch/nn/parallel/scatter_gather.py#L4-L8 - return isinstance(obj, tuple) and hasattr(obj, "_asdict") and hasattr(obj, "_fields") - - -def _is_dataclass_instance(obj: object) -> bool: - # https://docs.python.org/3/library/dataclasses.html#module-level-decorators-classes-and-functions - return dataclasses.is_dataclass(obj) and not isinstance(obj, type) - - -def apply_to_collection( - data: Any, - dtype: Union[type, Any, Tuple[Union[type, Any]]], - function: Callable, - *args: Any, - wrong_dtype: Optional[Union[type, Tuple[type]]] = None, - include_none: bool = True, - **kwargs: Any, -) -> Any: - """Recursively applies a function to all elements of a certain dtype. 
- - Args: - data: the collection to apply the function to - dtype: the given function will be applied to all elements of this dtype - function: the function to apply - *args: positional arguments (will be forwarded to calls of ``function``) - wrong_dtype: the given function won't be applied if this type is specified and the given collections - is of the ``wrong_dtype`` even if it is of type ``dtype`` - include_none: Whether to include an element if the output of ``function`` is ``None``. - **kwargs: keyword arguments (will be forwarded to calls of ``function``) - - Returns: - The resulting collection - """ - # Breaking condition - if isinstance(data, dtype) and (wrong_dtype is None or not isinstance(data, wrong_dtype)): - return function(data, *args, **kwargs) - - elem_type = type(data) - - # Recursively apply to collection items - if isinstance(data, Mapping): - out = [] - for k, v in data.items(): - v = apply_to_collection( - v, dtype, function, *args, wrong_dtype=wrong_dtype, include_none=include_none, **kwargs - ) - if include_none or v is not None: - out.append((k, v)) - if isinstance(data, defaultdict): - return elem_type(data.default_factory, OrderedDict(out)) - return elem_type(OrderedDict(out)) - - is_namedtuple = _is_namedtuple(data) - is_sequence = isinstance(data, Sequence) and not isinstance(data, str) - if is_namedtuple or is_sequence: - out = [] - for d in data: - v = apply_to_collection( - d, dtype, function, *args, wrong_dtype=wrong_dtype, include_none=include_none, **kwargs - ) - if include_none or v is not None: - out.append(v) - return elem_type(*out) if is_namedtuple else elem_type(out) - - if _is_dataclass_instance(data): - # make a deepcopy of the data, - # but do not deepcopy mapped fields since the computation would - # be wasted on values that likely get immediately overwritten - fields = {} - memo = {} - for field in dataclasses.fields(data): - field_value = getattr(data, field.name) - fields[field.name] = (field_value, field.init) - memo[id(field_value)] = field_value - result = deepcopy(data, memo=memo) - # apply function to each field - for field_name, (field_value, field_init) in fields.items(): - if field_init: - v = apply_to_collection( - field_value, - dtype, - function, - *args, - wrong_dtype=wrong_dtype, - include_none=include_none, - **kwargs, - ) - if not field_init or (not include_none and v is None): # retain old value - v = getattr(data, field_name) - try: - setattr(result, field_name, v) - except dataclasses.FrozenInstanceError as e: - raise MisconfigurationException( - "A frozen dataclass was passed to `apply_to_collection` but this is not allowed." - " HINT: is your batch a frozen dataclass?" 
- ) from e - return result - - # data is neither of dtype, nor a collection - return data diff --git a/src/lightning_app/utilities/component.py b/src/lightning_app/utilities/component.py index 8220b0614c320..1ca2b72d85aad 100644 --- a/src/lightning_app/utilities/component.py +++ b/src/lightning_app/utilities/component.py @@ -3,8 +3,8 @@ from typing import Any, Dict, Generator, Optional, TYPE_CHECKING from deepdiff.helper import NotPresent +from lightning_utilities.core.apply_func import apply_to_collection -from lightning_app.utilities.apply_func import apply_to_collection from lightning_app.utilities.enum import ComponentContext from lightning_app.utilities.tree import breadth_first diff --git a/src/lightning_app/utilities/proxies.py b/src/lightning_app/utilities/proxies.py index 99ad6e2aad0cf..a03fe45caa752 100644 --- a/src/lightning_app/utilities/proxies.py +++ b/src/lightning_app/utilities/proxies.py @@ -14,6 +14,7 @@ from typing import Any, Callable, Dict, Optional, Set, Tuple, TYPE_CHECKING, Union from deepdiff import DeepDiff, Delta +from lightning_utilities.core.apply_func import apply_to_collection from lightning_app.storage import Path from lightning_app.storage.copier import Copier, copy_files @@ -21,7 +22,6 @@ from lightning_app.storage.path import path_to_work_artifact from lightning_app.storage.payload import Payload from lightning_app.utilities.app_helpers import affiliation -from lightning_app.utilities.apply_func import apply_to_collection from lightning_app.utilities.component import _set_work_context from lightning_app.utilities.enum import ( CacheCallsKeys, diff --git a/src/lightning_lite/utilities/apply_func.py b/src/lightning_lite/utilities/apply_func.py index ae3d81eab46a5..c76fe01985ff6 100644 --- a/src/lightning_lite/utilities/apply_func.py +++ b/src/lightning_lite/utilities/apply_func.py @@ -12,20 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. """Utilities used for collections.""" - -import dataclasses from abc import ABC -from collections import defaultdict, OrderedDict -from copy import deepcopy from functools import partial -from typing import Any, Callable, List, Mapping, Optional, Sequence, Tuple, Union +from typing import Any, Callable, List, Tuple, Union import numpy as np import torch +from lightning_utilities.core.apply_func import apply_to_collection from torch import Tensor -from pytorch_lightning.utilities.exceptions import MisconfigurationException - _BLOCKING_DEVICE_TYPES = ("cpu", "mps") @@ -48,224 +43,6 @@ def from_numpy(value: np.ndarray, device: Union[str, torch.device]) -> Tensor: ] -def _is_namedtuple(obj: object) -> bool: - # https://github.com/pytorch/pytorch/blob/v1.8.1/torch/nn/parallel/scatter_gather.py#L4-L8 - return isinstance(obj, tuple) and hasattr(obj, "_asdict") and hasattr(obj, "_fields") - - -def _is_dataclass_instance(obj: object) -> bool: - # https://docs.python.org/3/library/dataclasses.html#module-level-decorators-classes-and-functions - return dataclasses.is_dataclass(obj) and not isinstance(obj, type) - - -def apply_to_collection( - data: Any, - dtype: Union[type, Any, Tuple[Union[type, Any]]], - function: Callable, - *args: Any, - wrong_dtype: Optional[Union[type, Tuple[type, ...]]] = None, - include_none: bool = True, - **kwargs: Any, -) -> Any: - """Recursively applies a function to all elements of a certain dtype. 
- - Args: - data: the collection to apply the function to - dtype: the given function will be applied to all elements of this dtype - function: the function to apply - *args: positional arguments (will be forwarded to calls of ``function``) - wrong_dtype: the given function won't be applied if this type is specified and the given collections - is of the ``wrong_dtype`` even if it is of type ``dtype`` - include_none: Whether to include an element if the output of ``function`` is ``None``. - **kwargs: keyword arguments (will be forwarded to calls of ``function``) - - Returns: - The resulting collection - """ - # Breaking condition - if isinstance(data, dtype) and (wrong_dtype is None or not isinstance(data, wrong_dtype)): - return function(data, *args, **kwargs) - - elem_type = type(data) - - # Recursively apply to collection items - if isinstance(data, Mapping): - out = [] - for k, v in data.items(): - v = apply_to_collection( - v, dtype, function, *args, wrong_dtype=wrong_dtype, include_none=include_none, **kwargs - ) - if include_none or v is not None: - out.append((k, v)) - if isinstance(data, defaultdict): - return elem_type(data.default_factory, OrderedDict(out)) - return elem_type(OrderedDict(out)) - - is_namedtuple = _is_namedtuple(data) - is_sequence = isinstance(data, Sequence) and not isinstance(data, str) - if is_namedtuple or is_sequence: - out = [] - for d in data: - v = apply_to_collection( - d, dtype, function, *args, wrong_dtype=wrong_dtype, include_none=include_none, **kwargs - ) - if include_none or v is not None: - out.append(v) - return elem_type(*out) if is_namedtuple else elem_type(out) - - if _is_dataclass_instance(data): - # make a deepcopy of the data, - # but do not deepcopy mapped fields since the computation would - # be wasted on values that likely get immediately overwritten - fields = {} - memo = {} - for field in dataclasses.fields(data): - field_value = getattr(data, field.name) - fields[field.name] = (field_value, field.init) - memo[id(field_value)] = field_value - result = deepcopy(data, memo=memo) - # apply function to each field - for field_name, (field_value, field_init) in fields.items(): - v = None - if field_init: - v = apply_to_collection( - field_value, - dtype, - function, - *args, - wrong_dtype=wrong_dtype, - include_none=include_none, - **kwargs, - ) - if not field_init or (not include_none and v is None): # retain old value - v = getattr(data, field_name) - try: - setattr(result, field_name, v) - except dataclasses.FrozenInstanceError as e: - raise MisconfigurationException( - "A frozen dataclass was passed to `apply_to_collection` but this is not allowed." - " HINT: is your batch a frozen dataclass?" - ) from e - return result - - # data is neither of dtype, nor a collection - return data - - -def apply_to_collections( - data1: Optional[Any], - data2: Optional[Any], - dtype: Union[type, Any, Tuple[Union[type, Any]]], - function: Callable, - *args: Any, - wrong_dtype: Optional[Union[type, Tuple[type]]] = None, - **kwargs: Any, -) -> Any: - """Zips two collections and applies a function to their items of a certain dtype. 
- - Args: - data1: The first collection - data2: The second collection - dtype: the given function will be applied to all elements of this dtype - function: the function to apply - *args: positional arguments (will be forwarded to calls of ``function``) - wrong_dtype: the given function won't be applied if this type is specified and the given collections - is of the ``wrong_dtype`` even if it is of type ``dtype`` - **kwargs: keyword arguments (will be forwarded to calls of ``function``) - - Returns: - The resulting collection - - Raises: - AssertionError: - If sequence collections have different data sizes. - """ - if data1 is None: - if data2 is None: - return - # in case they were passed reversed - data1, data2 = data2, None - - elem_type = type(data1) - - if isinstance(data1, dtype) and data2 is not None and (wrong_dtype is None or not isinstance(data1, wrong_dtype)): - return function(data1, data2, *args, **kwargs) - - if isinstance(data1, Mapping) and data2 is not None: - # use union because we want to fail if a key does not exist in both - zipped = {k: (data1[k], data2[k]) for k in data1.keys() | data2.keys()} - return elem_type( - { - k: apply_to_collections(*v, dtype, function, *args, wrong_dtype=wrong_dtype, **kwargs) - for k, v in zipped.items() - } - ) - - is_namedtuple = _is_namedtuple(data1) - is_sequence = isinstance(data1, Sequence) and not isinstance(data1, str) - if (is_namedtuple or is_sequence) and data2 is not None: - assert len(data1) == len(data2), "Sequence collections have different sizes." - out = [ - apply_to_collections(v1, v2, dtype, function, *args, wrong_dtype=wrong_dtype, **kwargs) - for v1, v2 in zip(data1, data2) - ] - return elem_type(*out) if is_namedtuple else elem_type(out) - - if _is_dataclass_instance(data1) and data2 is not None: - if not _is_dataclass_instance(data2): - raise TypeError( - "Expected inputs to be dataclasses of the same type or to have identical fields" - f" but got input 1 of type {type(data1)} and input 2 of type {type(data2)}." - ) - if not ( - len(dataclasses.fields(data1)) == len(dataclasses.fields(data2)) - and all(map(lambda f1, f2: isinstance(f1, type(f2)), dataclasses.fields(data1), dataclasses.fields(data2))) - ): - raise TypeError("Dataclasses fields do not match.") - # make a deepcopy of the data, - # but do not deepcopy mapped fields since the computation would - # be wasted on values that likely get immediately overwritten - data = [data1, data2] - fields: List[dict] = [{}, {}] - memo: dict = {} - for i in range(len(data)): - for field in dataclasses.fields(data[i]): - field_value = getattr(data[i], field.name) - fields[i][field.name] = (field_value, field.init) - if i == 0: - memo[id(field_value)] = field_value - - result = deepcopy(data1, memo=memo) - - # apply function to each field - for ((field_name, (field_value1, field_init1)), (_, (field_value2, field_init2))) in zip( - fields[0].items(), fields[1].items() - ): - v = None - if field_init1 and field_init2: - v = apply_to_collections( - field_value1, - field_value2, - dtype, - function, - *args, - wrong_dtype=wrong_dtype, - **kwargs, - ) - if not field_init1 or not field_init2 or v is None: # retain old value - return apply_to_collection(data1, dtype, function, *args, wrong_dtype=wrong_dtype, **kwargs) - try: - setattr(result, field_name, v) - except dataclasses.FrozenInstanceError as e: - raise MisconfigurationException( - "A frozen dataclass was passed to `apply_to_collections` but this is not allowed." - " HINT: is your batch a frozen dataclass?" 
- ) from e - return result - - return apply_to_collection(data1, dtype, function, *args, wrong_dtype=wrong_dtype, **kwargs) - - class TransferableDataType(ABC): """A custom type for data that can be moved to a torch device via ``.to(...)``. diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index ef0a8b051e094..c3534aa3d6a73 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -89,7 +89,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). -- Deprecated the functions in `pytorch_lightning.utilities.apply_func` in favor of `lightning_lite.utilities.apply_func` ([#14516](https://github.com/Lightning-AI/lightning/pull/14516)) +- Deprecated the functions in `pytorch_lightning.utilities.apply_func` in favor of `lightning_utilities.core.apply_func` ([#14516](https://github.com/Lightning-AI/lightning/pull/14516), [#14537](https://github.com/Lightning-AI/lightning/pull/14537)) diff --git a/src/pytorch_lightning/callbacks/pruning.py b/src/pytorch_lightning/callbacks/pruning.py index 14fc1acd424e3..ee90964bb8493 100644 --- a/src/pytorch_lightning/callbacks/pruning.py +++ b/src/pytorch_lightning/callbacks/pruning.py @@ -22,11 +22,11 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union import torch.nn.utils.prune as pytorch_prune +from lightning_utilities.core.apply_func import apply_to_collection from torch import nn, Tensor from typing_extensions import TypedDict import pytorch_lightning as pl -from lightning_lite.utilities.apply_func import apply_to_collection from pytorch_lightning.callbacks.callback import Callback from pytorch_lightning.core.module import LightningModule from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py index a47a06a538c98..a0512a719d276 100644 --- a/src/pytorch_lightning/core/module.py +++ b/src/pytorch_lightning/core/module.py @@ -25,6 +25,7 @@ from typing import Any, Callable, Dict, Generator, List, Mapping, Optional, overload, Sequence, Tuple, Union import torch +from lightning_utilities.core.apply_func import apply_to_collection from torch import ScriptModule, Tensor from torch.nn import Module from torch.optim.optimizer import Optimizer @@ -32,7 +33,7 @@ from typing_extensions import Literal import pytorch_lightning as pl -from lightning_lite.utilities.apply_func import apply_to_collection, convert_to_tensors +from lightning_lite.utilities.apply_func import convert_to_tensors from lightning_lite.utilities.cloud_io import get_filesystem from lightning_lite.utilities.device_dtype_mixin import _DeviceDtypeModuleMixin from pytorch_lightning.callbacks.callback import Callback diff --git a/src/pytorch_lightning/core/saving.py b/src/pytorch_lightning/core/saving.py index 5b2f54114e404..7d999eebb4828 100644 --- a/src/pytorch_lightning/core/saving.py +++ b/src/pytorch_lightning/core/saving.py @@ -24,9 +24,9 @@ from warnings import warn import yaml +from lightning_utilities.core.apply_func import apply_to_collection import pytorch_lightning as pl -from lightning_lite.utilities.apply_func import apply_to_collection from lightning_lite.utilities.cloud_io import get_filesystem from lightning_lite.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE, AttributeDict diff --git a/src/pytorch_lightning/lite/lite.py b/src/pytorch_lightning/lite/lite.py index b87e690ec8862..0ec9cf5c2daa4 100644 --- 
a/src/pytorch_lightning/lite/lite.py +++ b/src/pytorch_lightning/lite/lite.py @@ -20,11 +20,12 @@ import torch import torch.nn as nn +from lightning_utilities.core.apply_func import apply_to_collection from torch import Tensor from torch.optim import Optimizer from torch.utils.data import BatchSampler, DataLoader, DistributedSampler -from lightning_lite.utilities.apply_func import apply_to_collection, convert_to_tensors +from lightning_lite.utilities.apply_func import convert_to_tensors from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.lite.wrappers import _LiteDataLoader, _LiteModule, _LiteOptimizer from pytorch_lightning.overrides.distributed import DistributedSamplerWrapper diff --git a/src/pytorch_lightning/lite/wrappers.py b/src/pytorch_lightning/lite/wrappers.py index 477534cedf90a..f6d75941aab45 100644 --- a/src/pytorch_lightning/lite/wrappers.py +++ b/src/pytorch_lightning/lite/wrappers.py @@ -14,12 +14,13 @@ from typing import Any, Callable, Dict, Generator, Iterator, Optional, Union import torch +from lightning_utilities.core.apply_func import apply_to_collection from torch import nn as nn from torch import Tensor from torch.optim import Optimizer from torch.utils.data import DataLoader -from lightning_lite.utilities.apply_func import apply_to_collection, move_data_to_device +from lightning_lite.utilities.apply_func import move_data_to_device from lightning_lite.utilities.device_dtype_mixin import _DeviceDtypeModuleMixin from pytorch_lightning.plugins import PrecisionPlugin from pytorch_lightning.strategies import Strategy diff --git a/src/pytorch_lightning/loops/dataloader/evaluation_loop.py b/src/pytorch_lightning/loops/dataloader/evaluation_loop.py index d041d371ddfaa..bbd269b7045fd 100644 --- a/src/pytorch_lightning/loops/dataloader/evaluation_loop.py +++ b/src/pytorch_lightning/loops/dataloader/evaluation_loop.py @@ -17,11 +17,11 @@ from collections import ChainMap, OrderedDict from typing import Any, Iterable, List, Optional, Sequence, Tuple, Type, Union +from lightning_utilities.core.apply_func import apply_to_collection from torch import Tensor from torch.utils.data.dataloader import DataLoader import pytorch_lightning as pl -from lightning_lite.utilities.apply_func import apply_to_collection from pytorch_lightning.accelerators import CUDAAccelerator from pytorch_lightning.callbacks.progress.rich_progress import _RICH_AVAILABLE from pytorch_lightning.loops.dataloader import DataLoaderLoop diff --git a/src/pytorch_lightning/loops/epoch/training_epoch_loop.py b/src/pytorch_lightning/loops/epoch/training_epoch_loop.py index edc020cd72c3d..c03ab4ec68c10 100644 --- a/src/pytorch_lightning/loops/epoch/training_epoch_loop.py +++ b/src/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -17,9 +17,9 @@ import numpy as np import torch +from lightning_utilities.core.apply_func import apply_to_collection import pytorch_lightning as pl -from lightning_lite.utilities.apply_func import apply_to_collection from pytorch_lightning import loops # import as loops to avoid circular imports from pytorch_lightning.loops.batch import TrainingBatchLoop from pytorch_lightning.loops.batch.training_batch_loop import _OUTPUTS_TYPE as _BATCH_OUTPUTS_TYPE diff --git a/src/pytorch_lightning/overrides/data_parallel.py b/src/pytorch_lightning/overrides/data_parallel.py index f3feb95f5eea8..a972ad37bc58e 100644 --- a/src/pytorch_lightning/overrides/data_parallel.py +++ b/src/pytorch_lightning/overrides/data_parallel.py @@ -16,10 +16,10 @@ from typing import Any, 
Optional, Union import torch +from lightning_utilities.core.apply_func import apply_to_collection from torch import Tensor import pytorch_lightning as pl -from lightning_lite.utilities.apply_func import apply_to_collection from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase from pytorch_lightning.utilities.rank_zero import rank_zero_warn diff --git a/src/pytorch_lightning/plugins/io/xla_plugin.py b/src/pytorch_lightning/plugins/io/xla_plugin.py index 9430ee5a8d176..791e1e068384a 100644 --- a/src/pytorch_lightning/plugins/io/xla_plugin.py +++ b/src/pytorch_lightning/plugins/io/xla_plugin.py @@ -14,7 +14,8 @@ import os from typing import Any, Dict, Optional -from lightning_lite.utilities.apply_func import apply_to_collection +from lightning_utilities.core.apply_func import apply_to_collection + from lightning_lite.utilities.cloud_io import get_filesystem from pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE, _TPU_AVAILABLE diff --git a/src/pytorch_lightning/plugins/precision/double.py b/src/pytorch_lightning/plugins/precision/double.py index 5f124e8f997fb..cff3b2619fa56 100644 --- a/src/pytorch_lightning/plugins/precision/double.py +++ b/src/pytorch_lightning/plugins/precision/double.py @@ -16,11 +16,11 @@ import torch import torch.nn as nn +from lightning_utilities.core.apply_func import apply_to_collection from torch import FloatTensor, Tensor from torch.optim import Optimizer import pytorch_lightning as pl -from lightning_lite.utilities.apply_func import apply_to_collection from pytorch_lightning.overrides.base import _LightningPrecisionModuleWrapperBase from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin diff --git a/src/pytorch_lightning/strategies/deepspeed.py b/src/pytorch_lightning/strategies/deepspeed.py index a24da43445635..695fa6f8fc19b 100644 --- a/src/pytorch_lightning/strategies/deepspeed.py +++ b/src/pytorch_lightning/strategies/deepspeed.py @@ -22,13 +22,13 @@ from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple, Union import torch +from lightning_utilities.core.apply_func import apply_to_collection from lightning_utilities.core.imports import RequirementCache from torch import Tensor from torch.nn import Module from torch.optim import Optimizer import pytorch_lightning as pl -from lightning_lite.utilities.apply_func import apply_to_collection from pytorch_lightning.accelerators.cuda import CUDAAccelerator from pytorch_lightning.core.optimizer import _init_optimizers_and_lr_schedulers from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase diff --git a/src/pytorch_lightning/strategies/dp.py b/src/pytorch_lightning/strategies/dp.py index a144630fb0fd1..a377171982f28 100644 --- a/src/pytorch_lightning/strategies/dp.py +++ b/src/pytorch_lightning/strategies/dp.py @@ -14,11 +14,11 @@ from typing import Any, Dict, List, Optional, Union import torch +from lightning_utilities.core.apply_func import apply_to_collection from torch import Tensor from torch.nn import DataParallel, Module import pytorch_lightning as pl -from lightning_lite.utilities.apply_func import apply_to_collection from pytorch_lightning.overrides.base import _LightningPrecisionModuleWrapperBase from pytorch_lightning.overrides.data_parallel import LightningParallelModule from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO diff --git 
a/src/pytorch_lightning/strategies/ipu.py b/src/pytorch_lightning/strategies/ipu.py index de3b2877528df..2d976e545deef 100644 --- a/src/pytorch_lightning/strategies/ipu.py +++ b/src/pytorch_lightning/strategies/ipu.py @@ -16,11 +16,11 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union import torch +from lightning_utilities.core.apply_func import apply_to_collection from torch import FloatTensor, Tensor from torch.utils.data import DataLoader, Sampler import pytorch_lightning as pl -from lightning_lite.utilities.apply_func import apply_to_collection from lightning_lite.utilities.cloud_io import get_filesystem from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment diff --git a/src/pytorch_lightning/strategies/launchers/multiprocessing.py b/src/pytorch_lightning/strategies/launchers/multiprocessing.py index 90a32d7a5a5e5..6bf81eb72d69c 100644 --- a/src/pytorch_lightning/strategies/launchers/multiprocessing.py +++ b/src/pytorch_lightning/strategies/launchers/multiprocessing.py @@ -21,11 +21,12 @@ import torch import torch.backends.cudnn import torch.multiprocessing as mp +from lightning_utilities.core.apply_func import apply_to_collection from torch import Tensor from typing_extensions import Literal import pytorch_lightning as pl -from lightning_lite.utilities.apply_func import apply_to_collection, move_data_to_device +from lightning_lite.utilities.apply_func import move_data_to_device from pytorch_lightning.strategies.launchers.base import _Launcher from pytorch_lightning.strategies.strategy import Strategy from pytorch_lightning.trainer.states import TrainerFn, TrainerState diff --git a/src/pytorch_lightning/strategies/tpu_spawn.py b/src/pytorch_lightning/strategies/tpu_spawn.py index 3c46fe10964ad..748406479bf51 100644 --- a/src/pytorch_lightning/strategies/tpu_spawn.py +++ b/src/pytorch_lightning/strategies/tpu_spawn.py @@ -16,12 +16,12 @@ from typing import Any, Dict, List, Mapping, Optional, Sequence, Union import torch +from lightning_utilities.core.apply_func import apply_to_collection from torch import Tensor from torch.nn import Module from torch.utils.data import DataLoader import pytorch_lightning as pl -from lightning_lite.utilities.apply_func import apply_to_collection from pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.plugins.environments import XLAEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO diff --git a/src/pytorch_lightning/trainer/connectors/data_connector.py b/src/pytorch_lightning/trainer/connectors/data_connector.py index f308d772033c1..c6733562d0289 100644 --- a/src/pytorch_lightning/trainer/connectors/data_connector.py +++ b/src/pytorch_lightning/trainer/connectors/data_connector.py @@ -17,11 +17,11 @@ from typing import Any, Iterable, List, Optional, Tuple, Union from weakref import proxy +from lightning_utilities.core.apply_func import apply_to_collection from torch.utils.data import BatchSampler, DataLoader, Sampler, SequentialSampler from torch.utils.data.distributed import DistributedSampler import pytorch_lightning as pl -from lightning_lite.utilities.apply_func import apply_to_collection from pytorch_lightning.accelerators.ipu import IPUAccelerator from pytorch_lightning.overrides.distributed import DistributedSamplerWrapper, UnrepeatedDistributedSamplerWrapper from pytorch_lightning.strategies import 
DDPSpawnStrategy diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/src/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index 897fe8a988605..6c251c1c13277 100644 --- a/src/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -13,10 +13,11 @@ # limitations under the License. from typing import Any, Iterable, Optional, Union +from lightning_utilities.core.apply_func import apply_to_collection from torch import Tensor import pytorch_lightning as pl -from lightning_lite.utilities.apply_func import apply_to_collection, move_data_to_device +from lightning_lite.utilities.apply_func import move_data_to_device from pytorch_lightning.loggers import Logger, TensorBoardLogger from pytorch_lightning.plugins.environments.slurm_environment import SLURMEnvironment from pytorch_lightning.trainer.connectors.logger_connector.result import _METRICS, _OUT_DICT, _PBAR_DICT diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py index 9408ce826e140..4736db1d12d27 100644 --- a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -16,11 +16,12 @@ from typing import Any, Callable, cast, Dict, Generator, List, Optional, Tuple, Union import torch +from lightning_utilities.core.apply_func import apply_to_collection, apply_to_collections from torch import Tensor from torchmetrics import Metric from typing_extensions import TypedDict -from lightning_lite.utilities.apply_func import apply_to_collection, apply_to_collections, move_data_to_device +from lightning_lite.utilities.apply_func import move_data_to_device from lightning_lite.utilities.device_dtype_mixin import _DeviceDtypeModuleMixin from pytorch_lightning.utilities.data import extract_batch_size from pytorch_lightning.utilities.distributed import distributed_available diff --git a/src/pytorch_lightning/trainer/supporters.py b/src/pytorch_lightning/trainer/supporters.py index 84b12bd5b9fb4..3be43e37fe338 100644 --- a/src/pytorch_lightning/trainer/supporters.py +++ b/src/pytorch_lightning/trainer/supporters.py @@ -16,11 +16,11 @@ from typing import Any, Callable, Dict, Iterable, Iterator, List, Mapping, Optional, Sequence, Union import torch +from lightning_utilities.core.apply_func import apply_to_collection, apply_to_collections from torch.utils.data import Dataset from torch.utils.data.dataloader import _BaseDataLoaderIter, _MultiProcessingDataLoaderIter, DataLoader from torch.utils.data.dataset import IterableDataset -from lightning_lite.utilities.apply_func import apply_to_collection, apply_to_collections from pytorch_lightning.utilities.auto_restart import ( _reload_dataloader_state_dict, MergedIteratorState, diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index ebc2f62024664..72caec41179ed 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -30,6 +30,7 @@ import torch import torch.distributed as dist +from lightning_utilities.core.apply_func import apply_to_collection from lightning_utilities.core.imports import module_available from packaging.version import Version from torch import Tensor @@ -37,7 +38,6 @@ from torch.utils.data import DataLoader import pytorch_lightning as pl -from 
lightning_lite.utilities.apply_func import apply_to_collection from lightning_lite.utilities.cloud_io import get_filesystem from pytorch_lightning.accelerators import ( Accelerator, diff --git a/src/pytorch_lightning/utilities/apply_func.py b/src/pytorch_lightning/utilities/apply_func.py index bc516d3fb3d6f..f4b1bfedef4b5 100644 --- a/src/pytorch_lightning/utilities/apply_func.py +++ b/src/pytorch_lightning/utilities/apply_func.py @@ -15,30 +15,40 @@ from typing import Any -from lightning_lite.utilities.apply_func import apply_to_collection as new_apply_to_collection -from lightning_lite.utilities.apply_func import apply_to_collections as new_apply_to_collections +from lightning_utilities.core.apply_func import apply_to_collection as new_apply_to_collection +from lightning_utilities.core.apply_func import apply_to_collections as new_apply_to_collections + from lightning_lite.utilities.apply_func import convert_to_tensors as new_convert_to_tensors from lightning_lite.utilities.apply_func import from_numpy as new_from_numpy from lightning_lite.utilities.apply_func import move_data_to_device as new_move_data_to_device from lightning_lite.utilities.apply_func import to_dtype_tensor as new_to_dtype_tensor from lightning_lite.utilities.apply_func import TransferableDataType as NewTransferableDataType from pytorch_lightning.utilities import rank_zero_deprecation +from pytorch_lightning.utilities.exceptions import MisconfigurationException def apply_to_collection(*args: Any, **kwargs: Any) -> Any: rank_zero_deprecation( "`pytorch_lightning.utilities.apply_func.apply_to_collection` has been deprecated in v1.8.0 and will be" - " removed in v1.10.0. Please use `lightning_lite.utilities.apply_func.apply_to_collection` instead." + " removed in v1.10.0. Please use `lightning_utilities.core.apply_func.apply_to_collection` instead." ) - return new_apply_to_collection(*args, **kwargs) + try: + return new_apply_to_collection(*args, **kwargs) + except ValueError as e: + # upstream had to change the exception type + raise MisconfigurationException from e def apply_to_collections(*args: Any, **kwargs: Any) -> Any: rank_zero_deprecation( "`pytorch_lightning.utilities.apply_func.apply_to_collections` has been deprecated in v1.8.0 and will be" - " removed in v1.10.0. Please use `lightning_lite.utilities.apply_func.apply_to_collections` instead." + " removed in v1.10.0. Please use `lightning_utilities.core.apply_func.apply_to_collections` instead." 
) - return new_apply_to_collections(*args, **kwargs) + try: + return new_apply_to_collections(*args, **kwargs) + except ValueError as e: + # upstream had to change the exception type + raise MisconfigurationException from e def convert_to_tensors(*args: Any, **kwargs: Any) -> Any: diff --git a/src/pytorch_lightning/utilities/auto_restart.py b/src/pytorch_lightning/utilities/auto_restart.py index df6f0508281ce..0f6fadfb2657f 100644 --- a/src/pytorch_lightning/utilities/auto_restart.py +++ b/src/pytorch_lightning/utilities/auto_restart.py @@ -17,6 +17,7 @@ from functools import partial, wraps from typing import Any, Callable, Dict, Generator, Iterable, Iterator, List, Optional, Tuple, Union +from lightning_utilities.core.apply_func import apply_to_collection from torch.utils.data import Dataset, DistributedSampler, get_worker_info, RandomSampler, Sampler, SequentialSampler from torch.utils.data.dataloader import ( _BaseDataLoaderIter, @@ -28,7 +29,6 @@ from typing_extensions import TypedDict import pytorch_lightning as pl -from lightning_lite.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.distributed import _collect_states_on_rank_zero from pytorch_lightning.utilities.enums import _FaultTolerantMode, AutoRestartBatchKeys from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/src/pytorch_lightning/utilities/data.py b/src/pytorch_lightning/utilities/data.py index 6c5c32a6bf811..f65ef47d694b5 100644 --- a/src/pytorch_lightning/utilities/data.py +++ b/src/pytorch_lightning/utilities/data.py @@ -21,6 +21,7 @@ from typing import Any, Callable, Dict, Generator, Iterable, Mapping, Optional, Tuple, Type, Union import torch +from lightning_utilities.core.apply_func import is_dataclass_instance from torch import Tensor from torch.utils.data import ( BatchSampler, @@ -33,7 +34,6 @@ ) import pytorch_lightning as pl -from lightning_lite.utilities.apply_func import _is_dataclass_instance from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities.auto_restart import CaptureIterableDataset, CaptureMapDataset, FastForwardSampler @@ -73,7 +73,7 @@ def _extract_batch_size(batch: BType) -> Generator[int, None, None]: for sample in batch: yield from _extract_batch_size(sample) - elif _is_dataclass_instance(batch): + elif is_dataclass_instance(batch): for field in fields(batch): yield from _extract_batch_size(getattr(batch, field.name)) else: diff --git a/src/pytorch_lightning/utilities/fetching.py b/src/pytorch_lightning/utilities/fetching.py index f0b6e98b8872b..ba44e2132a0e0 100644 --- a/src/pytorch_lightning/utilities/fetching.py +++ b/src/pytorch_lightning/utilities/fetching.py @@ -17,9 +17,9 @@ from typing import Any, Callable, Iterable, Iterator, List, Optional, Sized, Tuple import torch +from lightning_utilities.core.apply_func import apply_to_collection, apply_to_collections from torch.utils.data.dataloader import DataLoader -from lightning_lite.utilities.apply_func import apply_to_collection, apply_to_collections from pytorch_lightning.trainer.supporters import CombinedLoader, CycleIterator from pytorch_lightning.utilities.auto_restart import ( _add_capture_metadata_collate, diff --git a/src/pytorch_lightning/utilities/memory.py b/src/pytorch_lightning/utilities/memory.py index 3480f2e2da50b..f796d6d30ac91 100644 --- a/src/pytorch_lightning/utilities/memory.py +++ b/src/pytorch_lightning/utilities/memory.py @@ -21,11 +21,10 @@ 
from typing import Any, Dict import torch +from lightning_utilities.core.apply_func import apply_to_collection from torch import Tensor from torch.nn import Module -from lightning_lite.utilities.apply_func import apply_to_collection - def recursive_detach(in_dict: Any, to_cpu: bool = False) -> Any: """Detach all tensors in `in_dict`. diff --git a/src/pytorch_lightning/utilities/metrics.py b/src/pytorch_lightning/utilities/metrics.py index d0752029edcc3..bbc27e4e672a9 100644 --- a/src/pytorch_lightning/utilities/metrics.py +++ b/src/pytorch_lightning/utilities/metrics.py @@ -16,9 +16,9 @@ from typing import Any +from lightning_utilities.core.apply_func import apply_to_collection from torch import Tensor -from lightning_lite.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/src/pytorch_lightning/utilities/optimizer.py b/src/pytorch_lightning/utilities/optimizer.py index f3f4734b1c3e9..b13baf25522c1 100644 --- a/src/pytorch_lightning/utilities/optimizer.py +++ b/src/pytorch_lightning/utilities/optimizer.py @@ -14,10 +14,11 @@ from typing import Iterable +from lightning_utilities.core.apply_func import apply_to_collection from torch import Tensor from torch.optim import Optimizer -from lightning_lite.utilities.apply_func import apply_to_collection, move_data_to_device +from lightning_lite.utilities.apply_func import move_data_to_device from pytorch_lightning.utilities.types import _DEVICE diff --git a/tests/tests_app/utilities/test_apply_func.py b/tests/tests_app/utilities/test_apply_func.py deleted file mode 100644 index 509a55b1ae7d2..0000000000000 --- a/tests/tests_app/utilities/test_apply_func.py +++ /dev/null @@ -1,264 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import dataclasses -import numbers -from collections import defaultdict, namedtuple, OrderedDict -from dataclasses import InitVar -from typing import Any, ClassVar, List, Optional - -import pytest - -from lightning_app.utilities.apply_func import apply_to_collection -from lightning_app.utilities.exceptions import MisconfigurationException -from lightning_app.utilities.imports import _is_numpy_available, _is_torch_available - -if _is_torch_available(): - import torch - -if _is_numpy_available(): - import numpy as np - - -@pytest.mark.skipif(not (_is_torch_available() and _is_numpy_available()), reason="Requires torch and numpy") -def test_recursive_application_to_collection(): - ntc = namedtuple("Foo", ["bar"]) - - @dataclasses.dataclass - class Feature: - input_ids: torch.Tensor - segment_ids: np.ndarray - - def __eq__(self, o: object) -> bool: - if not isinstance(o, Feature): - return NotImplemented - else: - return torch.equal(self.input_ids, o.input_ids) and np.equal(self.segment_ids, o.segment_ids).all() - - @dataclasses.dataclass - class ModelExample: - example_ids: List[str] - feature: Feature - label: torch.Tensor - some_constant: int = dataclasses.field(init=False) - - def __post_init__(self): - self.some_constant = 7 - - def __eq__(self, o: object) -> bool: - if not isinstance(o, ModelExample): - return NotImplemented - else: - return ( - self.example_ids == o.example_ids - and self.feature == o.feature - and torch.equal(self.label, o.label) - and self.some_constant == o.some_constant - ) - - @dataclasses.dataclass - class WithClassVar: - class_var: ClassVar[int] = 0 - dummy: Any - - def __eq__(self, o: object) -> bool: - if not isinstance(o, WithClassVar): - return NotImplemented - elif isinstance(self.dummy, torch.Tensor): - return torch.equal(self.dummy, o.dummy) - else: - return self.dummy == o.dummy - - @dataclasses.dataclass - class WithInitVar: - dummy: Any - override: InitVar[Optional[Any]] = None - - def __post_init__(self, override: Optional[Any]): - if override is not None: - self.dummy = override - - def __eq__(self, o: object) -> bool: - if not isinstance(o, WithInitVar): - return NotImplemented - elif isinstance(self.dummy, torch.Tensor): - return torch.equal(self.dummy, o.dummy) - else: - return self.dummy == o.dummy - - @dataclasses.dataclass - class WithClassAndInitVar: - class_var: ClassVar[torch.Tensor] = torch.tensor(0) - dummy: Any - override: InitVar[Optional[Any]] = torch.tensor(1) - - def __post_init__(self, override: Optional[Any]): - if override is not None: - self.dummy = override - - def __eq__(self, o: object) -> bool: - if not isinstance(o, WithClassAndInitVar): - return NotImplemented - elif isinstance(self.dummy, torch.Tensor): - return torch.equal(self.dummy, o.dummy) - else: - return self.dummy == o.dummy - - model_example = ModelExample( - example_ids=["i-1", "i-2", "i-3"], - feature=Feature(input_ids=torch.tensor([1.0, 2.0, 3.0]), segment_ids=np.array([4.0, 5.0, 6.0])), - label=torch.tensor([7.0, 8.0, 9.0]), - ) - - to_reduce = { - "a": torch.tensor([1.0]), # Tensor - "b": [torch.tensor([2.0])], # list - "c": (torch.tensor([100.0]),), # tuple - "d": ntc(bar=5.0), # named tuple - "e": np.array([10.0]), # numpy array - "f": "this_is_a_dummy_str", # string - "g": 12.0, # number - "h": Feature(input_ids=torch.tensor([1.0, 2.0, 3.0]), segment_ids=np.array([4.0, 5.0, 6.0])), # dataclass - "i": model_example, # nested dataclass - "j": WithClassVar(torch.arange(3)), # dataclass with class variable - "k": WithInitVar("this_gets_overridden", 
torch.tensor([2.0])), # dataclass with init-only variable - "l": WithClassAndInitVar(model_example, None), # nested dataclass with class and init-only variables - } - - model_example_result = ModelExample( - example_ids=["i-1", "i-2", "i-3"], - feature=Feature(input_ids=torch.tensor([2.0, 4.0, 6.0]), segment_ids=np.array([8.0, 10.0, 12.0])), - label=torch.tensor([14.0, 16.0, 18.0]), - ) - - expected_result = { - "a": torch.tensor([2.0]), - "b": [torch.tensor([4.0])], - "c": (torch.tensor([200.0]),), - "d": ntc(bar=torch.tensor([10.0])), - "e": np.array([20.0]), - "f": "this_is_a_dummy_str", - "g": 24.0, - "h": Feature(input_ids=torch.tensor([2.0, 4.0, 6.0]), segment_ids=np.array([8.0, 10.0, 12.0])), - "i": model_example_result, - "j": WithClassVar(torch.arange(0, 6, 2)), - "k": WithInitVar(torch.tensor([4.0])), - "l": WithClassAndInitVar(model_example_result, None), - } - - reduced = apply_to_collection(to_reduce, (torch.Tensor, numbers.Number, np.ndarray), lambda x: x * 2) - - assert isinstance(reduced, dict), "Type Consistency of dict not preserved" - assert all(x in reduced for x in to_reduce), "Not all entries of the dict were preserved" - assert all( - isinstance(reduced[k], type(expected_result[k])) for k in to_reduce - ), "At least one type was not correctly preserved" - - assert isinstance(reduced["a"], torch.Tensor), "Reduction Result of a Tensor should be a Tensor" - assert torch.equal(expected_result["a"], reduced["a"]), "Reduction of a tensor does not yield the expected value" - - assert isinstance(reduced["b"], list), "Reduction Result of a list should be a list" - assert all( - torch.equal(x, y) for x, y in zip(reduced["b"], expected_result["b"]) - ), "At least one value of list reduction did not come out as expected" - - assert isinstance(reduced["c"], tuple), "Reduction Result of a tuple should be a tuple" - assert all( - torch.equal(x, y) for x, y in zip(reduced["c"], expected_result["c"]) - ), "At least one value of tuple reduction did not come out as expected" - - assert isinstance(reduced["d"], ntc), "Type Consistency for named tuple not given" - assert isinstance( - reduced["d"].bar, numbers.Number - ), "Failure in type promotion while reducing fields of named tuples" - assert reduced["d"].bar == expected_result["d"].bar - - assert isinstance(reduced["e"], np.ndarray), "Type Promotion in reduction of numpy arrays failed" - assert reduced["e"] == expected_result["e"], "Reduction of numpy array did not yield the expected result" - - assert isinstance(reduced["f"], str), "A string should not be reduced" - assert reduced["f"] == expected_result["f"], "String not preserved during reduction" - - assert isinstance(reduced["g"], numbers.Number), "Reduction of a number should result in a number" - assert reduced["g"] == expected_result["g"], "Reduction of a number did not yield the desired result" - - def _assert_dataclass_reduction(actual, expected, dataclass_type: str = ""): - assert dataclasses.is_dataclass(actual) and not isinstance( - actual, type - ), f"Reduction of a {dataclass_type} dataclass should result in a dataclass" - for field in dataclasses.fields(actual): - if dataclasses.is_dataclass(field.type): - _assert_dataclass_reduction(getattr(actual, field.name), getattr(expected, field.name), "nested") - assert actual == expected, f"Reduction of a {dataclass_type} dataclass did not yield the desired result" - - _assert_dataclass_reduction(reduced["h"], expected_result["h"]) - - _assert_dataclass_reduction(reduced["i"], expected_result["i"]) - - dataclass_type = 
"ClassVar-containing" - _assert_dataclass_reduction(reduced["j"], expected_result["j"], dataclass_type) - assert WithClassVar.class_var == 0, f"Reduction of a {dataclass_type} dataclass should not change the class var" - - _assert_dataclass_reduction(reduced["k"], expected_result["k"], "InitVar-containing") - - dataclass_type = "Class-and-InitVar-containing" - _assert_dataclass_reduction(reduced["l"], expected_result["l"], dataclass_type) - assert torch.equal( - WithClassAndInitVar.class_var, torch.tensor(0) - ), f"Reduction of a {dataclass_type} dataclass should not change the class var" - - # mapping support - reduced = apply_to_collection({"a": 1, "b": 2}, int, lambda x: str(x)) - assert reduced == {"a": "1", "b": "2"} - reduced = apply_to_collection(OrderedDict([("b", 2), ("a", 1)]), int, lambda x: str(x)) - assert reduced == OrderedDict([("b", "2"), ("a", "1")]) - - # custom mappings - class _CustomCollection(dict): - def __init__(self, initial_dict): - super().__init__(initial_dict) - - to_reduce = _CustomCollection({"a": 1, "b": 2, "c": 3}) - reduced = apply_to_collection(to_reduce, int, lambda x: str(x)) - assert reduced == _CustomCollection({"a": "1", "b": "2", "c": "3"}) - - # defaultdict - to_reduce = defaultdict(int, {"a": 1, "b": 2, "c": 3}) - reduced = apply_to_collection(to_reduce, int, lambda x: str(x)) - assert reduced == defaultdict(int, {"a": "1", "b": "2", "c": "3"}) - - -def test_apply_to_collection_include_none(): - to_reduce = [1, 2, 3.4, 5.6, 7, (8, 9.1, {10: 10})] - - def fn(x): - if isinstance(x, float): - return x - - reduced = apply_to_collection(to_reduce, (int, float), fn) - assert reduced == [None, None, 3.4, 5.6, None, (None, 9.1, {10: None})] - - reduced = apply_to_collection(to_reduce, (int, float), fn, include_none=False) - assert reduced == [3.4, 5.6, (9.1, {})] - - -@pytest.mark.skipif(not _is_torch_available(), reason="Requires torch and numpy") -def test_apply_to_collection_frozen_dataclass(): - @dataclasses.dataclass(frozen=True) - class Foo: - input: torch.Tensor - - foo = Foo(torch.tensor(0)) - - with pytest.raises(MisconfigurationException, match="frozen dataclass was passed"): - apply_to_collection(foo, torch.Tensor, lambda t: t.to(torch.int)) diff --git a/tests/tests_lite/utilities/test_apply_func.py b/tests/tests_lite/utilities/test_apply_func.py index bbc9b57e0b622..b299783ae85d7 100644 --- a/tests/tests_lite/utilities/test_apply_func.py +++ b/tests/tests_lite/utilities/test_apply_func.py @@ -11,342 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import dataclasses -import numbers -from collections import defaultdict, namedtuple, OrderedDict -from dataclasses import InitVar -from typing import Any, ClassVar, List, Optional - -import numpy as np import pytest import torch -from lightning_lite.utilities.apply_func import apply_to_collection, apply_to_collections, move_data_to_device -from pytorch_lightning.utilities.exceptions import MisconfigurationException - - -@dataclasses.dataclass -class Feature: - input_ids: torch.Tensor - segment_ids: np.ndarray - - def __eq__(self, o: object) -> bool: - if not isinstance(o, Feature): - return NotImplemented - - return torch.equal(self.input_ids, o.input_ids) and np.equal(self.segment_ids, o.segment_ids).all() - - -@dataclasses.dataclass -class ModelExample: - example_ids: List[str] - feature: Feature - label: torch.Tensor - some_constant: int = dataclasses.field(init=False) - - def __post_init__(self): - self.some_constant = 7 - - def __eq__(self, o: object) -> bool: - if not isinstance(o, ModelExample): - return NotImplemented - - return ( - self.example_ids == o.example_ids - and self.feature == o.feature - and torch.equal(self.label, o.label) - and self.some_constant == o.some_constant - ) - - -@dataclasses.dataclass -class WithClassVar: - class_var: ClassVar[int] = 0 - dummy: Any - - def __eq__(self, o: object) -> bool: - if not isinstance(o, WithClassVar): - return NotImplemented - elif isinstance(self.dummy, torch.Tensor): - return torch.equal(self.dummy, o.dummy) - - return self.dummy == o.dummy - - -@dataclasses.dataclass -class WithInitVar: - dummy: Any - override: InitVar[Optional[Any]] = None - - def __post_init__(self, override: Optional[Any]): - if override is not None: - self.dummy = override - - def __eq__(self, o: object) -> bool: - if not isinstance(o, WithInitVar): - return NotImplemented - elif isinstance(self.dummy, torch.Tensor): - return torch.equal(self.dummy, o.dummy) - - return self.dummy == o.dummy - - -@dataclasses.dataclass -class WithClassAndInitVar: - class_var: ClassVar[torch.Tensor] = torch.tensor(0) - dummy: Any - override: InitVar[Optional[Any]] = torch.tensor(1) - - def __post_init__(self, override: Optional[Any]): - if override is not None: - self.dummy = override - - def __eq__(self, o: object) -> bool: - if not isinstance(o, WithClassAndInitVar): - return NotImplemented - elif isinstance(self.dummy, torch.Tensor): - return torch.equal(self.dummy, o.dummy) - - return self.dummy == o.dummy - - -def test_recursive_application_to_collection(): - ntc = namedtuple("Foo", ["bar"]) - - model_example = ModelExample( - example_ids=["i-1", "i-2", "i-3"], - feature=Feature(input_ids=torch.tensor([1.0, 2.0, 3.0]), segment_ids=np.array([4.0, 5.0, 6.0])), - label=torch.tensor([7.0, 8.0, 9.0]), - ) - - to_reduce = { - "a": torch.tensor([1.0]), # Tensor - "b": [torch.tensor([2.0])], # list - "c": (torch.tensor([100.0]),), # tuple - "d": ntc(bar=5.0), # named tuple - "e": np.array([10.0]), # numpy array - "f": "this_is_a_dummy_str", # string - "g": 12.0, # number - "h": Feature(input_ids=torch.tensor([1.0, 2.0, 3.0]), segment_ids=np.array([4.0, 5.0, 6.0])), # dataclass - "i": model_example, # nested dataclass - "j": WithClassVar(torch.arange(3)), # dataclass with class variable - "k": WithInitVar("this_gets_overridden", torch.tensor([2.0])), # dataclass with init-only variable - "l": WithClassAndInitVar(model_example, None), # nested dataclass with class and init-only variables - } - - model_example_result = ModelExample( - example_ids=["i-1", "i-2", "i-3"], - 
feature=Feature(input_ids=torch.tensor([2.0, 4.0, 6.0]), segment_ids=np.array([8.0, 10.0, 12.0])), - label=torch.tensor([14.0, 16.0, 18.0]), - ) - - expected_result = { - "a": torch.tensor([2.0]), - "b": [torch.tensor([4.0])], - "c": (torch.tensor([200.0]),), - "d": ntc(bar=torch.tensor([10.0])), - "e": np.array([20.0]), - "f": "this_is_a_dummy_str", - "g": 24.0, - "h": Feature(input_ids=torch.tensor([2.0, 4.0, 6.0]), segment_ids=np.array([8.0, 10.0, 12.0])), - "i": model_example_result, - "j": WithClassVar(torch.arange(0, 6, 2)), - "k": WithInitVar(torch.tensor([4.0])), - "l": WithClassAndInitVar(model_example_result, None), - } - - reduced = apply_to_collection(to_reduce, (torch.Tensor, numbers.Number, np.ndarray), lambda x: x * 2) - - assert isinstance(reduced, dict), "Type Consistency of dict not preserved" - assert all(x in reduced for x in to_reduce), "Not all entries of the dict were preserved" - assert all( - isinstance(reduced[k], type(expected_result[k])) for k in to_reduce - ), "At least one type was not correctly preserved" - - assert isinstance(reduced["a"], torch.Tensor), "Reduction Result of a Tensor should be a Tensor" - assert torch.equal(expected_result["a"], reduced["a"]), "Reduction of a tensor does not yield the expected value" - - assert isinstance(reduced["b"], list), "Reduction Result of a list should be a list" - assert all( - torch.equal(x, y) for x, y in zip(reduced["b"], expected_result["b"]) - ), "At least one value of list reduction did not come out as expected" - - assert isinstance(reduced["c"], tuple), "Reduction Result of a tuple should be a tuple" - assert all( - torch.equal(x, y) for x, y in zip(reduced["c"], expected_result["c"]) - ), "At least one value of tuple reduction did not come out as expected" - - assert isinstance(reduced["d"], ntc), "Type Consistency for named tuple not given" - assert isinstance( - reduced["d"].bar, numbers.Number - ), "Failure in type promotion while reducing fields of named tuples" - assert reduced["d"].bar == expected_result["d"].bar - - assert isinstance(reduced["e"], np.ndarray), "Type Promotion in reduction of numpy arrays failed" - assert reduced["e"] == expected_result["e"], "Reduction of numpy array did not yield the expected result" - - assert isinstance(reduced["f"], str), "A string should not be reduced" - assert reduced["f"] == expected_result["f"], "String not preserved during reduction" - - assert isinstance(reduced["g"], numbers.Number), "Reduction of a number should result in a number" - assert reduced["g"] == expected_result["g"], "Reduction of a number did not yield the desired result" - - def _assert_dataclass_reduction(actual, expected, dataclass_type: str = ""): - assert dataclasses.is_dataclass(actual) and not isinstance( - actual, type - ), f"Reduction of a {dataclass_type} dataclass should result in a dataclass" - for field in dataclasses.fields(actual): - if dataclasses.is_dataclass(field.type): - _assert_dataclass_reduction(getattr(actual, field.name), getattr(expected, field.name), "nested") - assert actual == expected, f"Reduction of a {dataclass_type} dataclass did not yield the desired result" - - _assert_dataclass_reduction(reduced["h"], expected_result["h"]) - - _assert_dataclass_reduction(reduced["i"], expected_result["i"]) - - dataclass_type = "ClassVar-containing" - _assert_dataclass_reduction(reduced["j"], expected_result["j"], dataclass_type) - assert WithClassVar.class_var == 0, f"Reduction of a {dataclass_type} dataclass should not change the class var" - - 
_assert_dataclass_reduction(reduced["k"], expected_result["k"], "InitVar-containing") - - dataclass_type = "Class-and-InitVar-containing" - _assert_dataclass_reduction(reduced["l"], expected_result["l"], dataclass_type) - assert torch.equal( - WithClassAndInitVar.class_var, torch.tensor(0) - ), f"Reduction of a {dataclass_type} dataclass should not change the class var" - - # mapping support - reduced = apply_to_collection({"a": 1, "b": 2}, int, lambda x: str(x)) - assert reduced == {"a": "1", "b": "2"} - reduced = apply_to_collection(OrderedDict([("b", 2), ("a", 1)]), int, lambda x: str(x)) - assert reduced == OrderedDict([("b", "2"), ("a", "1")]) - - # custom mappings - class _CustomCollection(dict): - def __init__(self, initial_dict): - super().__init__(initial_dict) - - to_reduce = _CustomCollection({"a": 1, "b": 2, "c": 3}) - reduced = apply_to_collection(to_reduce, int, lambda x: str(x)) - assert reduced == _CustomCollection({"a": "1", "b": "2", "c": "3"}) - - # defaultdict - to_reduce = defaultdict(int, {"a": 1, "b": 2, "c": 3}) - reduced = apply_to_collection(to_reduce, int, lambda x: str(x)) - assert reduced == defaultdict(int, {"a": "1", "b": "2", "c": "3"}) - - -def test_apply_to_collection_include_none(): - to_reduce = [1, 2, 3.4, 5.6, 7, (8, 9.1, {10: 10})] - - def fn(x): - if isinstance(x, float): - return x - - reduced = apply_to_collection(to_reduce, (int, float), fn) - assert reduced == [None, None, 3.4, 5.6, None, (None, 9.1, {10: None})] - - reduced = apply_to_collection(to_reduce, (int, float), fn, include_none=False) - assert reduced == [3.4, 5.6, (9.1, {})] - - -def test_apply_to_collections(): - to_reduce_1 = {"a": {"b": [1, 2]}, "c": 5} - to_reduce_2 = {"a": {"b": [3, 4]}, "c": 6} - - def fn(a, b): - return a + b - - # basic test - reduced = apply_to_collections(to_reduce_1, to_reduce_2, int, fn) - assert reduced == {"a": {"b": [4, 6]}, "c": 11} - - with pytest.raises(KeyError): - # strict mode - if a key does not exist in both we fail - apply_to_collections({**to_reduce_2, "d": "foo"}, to_reduce_1, float, fn) - - # multiple dtypes - reduced = apply_to_collections(to_reduce_1, to_reduce_2, (list, int), fn) - assert reduced == {"a": {"b": [1, 2, 3, 4]}, "c": 11} - - # wrong dtype - reduced = apply_to_collections(to_reduce_1, to_reduce_2, (list, int), fn, wrong_dtype=int) - assert reduced == {"a": {"b": [1, 2, 3, 4]}, "c": 5} - - # list takes precedence because it is the type of data1 - reduced = apply_to_collections([1, 2, 3], [4], (int, list), fn) - assert reduced == [1, 2, 3, 4] - - # different sizes - with pytest.raises(AssertionError, match="Sequence collections have different sizes"): - apply_to_collections([[1, 2], [3]], [4], int, fn) - - def fn(a, b): - return a.keys() | b.keys() - - # base case - reduced = apply_to_collections(to_reduce_1, to_reduce_2, dict, fn) - assert reduced == {"a", "c"} - - # type conversion - to_reduce = [(1, 2), (3, 4)] - reduced = apply_to_collections(to_reduce, to_reduce, int, lambda *x: sum(x)) - assert reduced == [(2, 4), (6, 8)] - - # named tuple - foo = namedtuple("Foo", ["bar"]) - to_reduce = [foo(1), foo(2), foo(3)] - reduced = apply_to_collections(to_reduce, to_reduce, int, lambda *x: sum(x)) - assert reduced == [foo(2), foo(4), foo(6)] - - # passing none - reduced1 = apply_to_collections([1, 2, 3], None, int, lambda x: x * x) - reduced2 = apply_to_collections(None, [1, 2, 3], int, lambda x: x * x) - assert reduced1 == reduced2 == [1, 4, 9] - reduced = apply_to_collections(None, None, int, lambda x: x * x) - assert reduced is 
None - - -def test_apply_to_collections_dataclass(): - to_reduce_1 = Feature(input_ids=torch.tensor([1.0, 2.0, 3.0]), segment_ids=np.array([4.0, 5.0, 6.0])) - to_reduce_2 = Feature(input_ids=torch.tensor([1.0, 2.0, 3.0]), segment_ids=np.array([4.0, 5.0, 6.0])) - - def fn(a, b): - return a + b - - reduced = apply_to_collections(to_reduce_1, to_reduce_2, (torch.Tensor, numbers.Number, np.ndarray), fn) - - assert reduced == Feature(input_ids=torch.tensor([2.0, 4.0, 6.0]), segment_ids=np.array([8.0, 10.0, 12.0])) - - model_example = ModelExample( - example_ids=["i-1", "i-2", "i-3"], - feature=to_reduce_1, - label=torch.tensor([7.0, 8.0, 9.0]), - ) - - # different types - with pytest.raises(TypeError, match="Expected inputs to be dataclasses of the same type"): - apply_to_collections(to_reduce_1, [1, 2], (torch.Tensor, numbers.Number, np.ndarray), fn) - - # unmatched fields - with pytest.raises(TypeError, match="Dataclasses fields do not match"): - apply_to_collections(to_reduce_1, model_example, (torch.Tensor, numbers.Number, np.ndarray), fn) - - classvar = WithClassVar(torch.arange(3)) # dataclass with same number but different type of fields - with pytest.raises(TypeError, match="Dataclasses fields do not match"): - apply_to_collections(to_reduce_1, classvar, (torch.Tensor, numbers.Number, np.ndarray), fn) - - -def test_apply_to_collection_frozen_dataclass(): - @dataclasses.dataclass(frozen=True) - class Foo: - input: torch.Tensor - - foo = Foo(torch.tensor(0)) - - with pytest.raises(MisconfigurationException, match="frozen dataclass was passed"): - apply_to_collection(foo, torch.Tensor, lambda t: t.to(torch.int)) +from lightning_lite.utilities.apply_func import move_data_to_device @pytest.mark.parametrize("should_return", [False, True]) diff --git a/tests/tests_pytorch/lite/test_parity.py b/tests/tests_pytorch/lite/test_parity.py index 2edca121343e0..0ea65bb49dd56 100644 --- a/tests/tests_pytorch/lite/test_parity.py +++ b/tests/tests_pytorch/lite/test_parity.py @@ -22,12 +22,13 @@ import torch.distributed import torch.multiprocessing as mp import torch.nn.functional +from lightning_utilities.core.apply_func import apply_to_collection from torch import nn from torch.nn.parallel.distributed import DistributedDataParallel from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler -from lightning_lite.utilities.apply_func import apply_to_collection, move_data_to_device +from lightning_lite.utilities.apply_func import move_data_to_device from lightning_lite.utilities.cloud_io import atomic_save from pytorch_lightning.demos.boring_classes import RandomDataset from pytorch_lightning.lite import LightningLite diff --git a/tests/tests_pytorch/trainer/test_supporters.py b/tests/tests_pytorch/trainer/test_supporters.py index 92be556d198e3..fec8466748ab1 100644 --- a/tests/tests_pytorch/trainer/test_supporters.py +++ b/tests/tests_pytorch/trainer/test_supporters.py @@ -18,12 +18,12 @@ import pytest import torch +from lightning_utilities.core.apply_func import apply_to_collection from torch.utils.data import DataLoader, TensorDataset from torch.utils.data.dataset import Dataset, IterableDataset from torch.utils.data.distributed import DistributedSampler from torch.utils.data.sampler import RandomSampler, Sampler, SequentialSampler -from lightning_lite.utilities.apply_func import apply_to_collection from pytorch_lightning import Trainer from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset from pytorch_lightning.trainer.supporters import ( 
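Note: the patches above migrate callers from `lightning_lite.utilities.apply_func` to `lightning_utilities.core.apply_func`, while the deprecated wrappers in `pytorch_lightning.utilities.apply_func` keep working until v1.10.0, emit a deprecation warning, and re-raise the upstream `ValueError` as `MisconfigurationException`. A minimal usage sketch of the new import path follows (assuming `lightning_utilities` and `torch` are installed; the sample collection and doubling function are illustrative, not taken from the patch):

    import torch
    from lightning_utilities.core.apply_func import apply_to_collection

    # apply a function to every torch.Tensor in a nested collection,
    # leaving entries of other types untouched
    batch = {"x": torch.tensor([1.0, 2.0]), "meta": "untouched"}
    doubled = apply_to_collection(batch, torch.Tensor, lambda t: t * 2)
    print(doubled)  # {'x': tensor([2., 4.]), 'meta': 'untouched'}

Calling the deprecated `pytorch_lightning.utilities.apply_func.apply_to_collection` shim with the same arguments behaves the same, apart from the deprecation warning and the exception conversion shown in the hunk above.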
From 8095e2452d519167944e9924819b43710f45b1ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 7 Sep 2022 00:35:33 +0200 Subject: [PATCH 069/193] Cleanup Lite `apply_func`s utilitites (#14560) --- src/lightning_lite/utilities/apply_func.py | 33 +++++++++---------- src/lightning_lite/utilities/types.py | 18 ++++++++++ src/pytorch_lightning/CHANGELOG.md | 5 +-- src/pytorch_lightning/utilities/apply_func.py | 12 +++---- 4 files changed, 40 insertions(+), 28 deletions(-) create mode 100644 src/lightning_lite/utilities/types.py diff --git a/src/lightning_lite/utilities/apply_func.py b/src/lightning_lite/utilities/apply_func.py index c76fe01985ff6..a3a203776bcf6 100644 --- a/src/lightning_lite/utilities/apply_func.py +++ b/src/lightning_lite/utilities/apply_func.py @@ -21,25 +21,21 @@ from lightning_utilities.core.apply_func import apply_to_collection from torch import Tensor -_BLOCKING_DEVICE_TYPES = ("cpu", "mps") - +from lightning_lite.utilities.types import _DEVICE -def to_dtype_tensor( - value: Union[int, float, List[Union[int, float]]], dtype: torch.dtype, device: Union[str, torch.device] -) -> Tensor: - return torch.tensor(value, dtype=dtype, device=device) +_BLOCKING_DEVICE_TYPES = ("cpu", "mps") -def from_numpy(value: np.ndarray, device: Union[str, torch.device]) -> Tensor: - return torch.from_numpy(value).to(device) +def _from_numpy(value: np.ndarray, device: _DEVICE) -> Tensor: + return torch.from_numpy(value).to(device) # type: ignore[arg-type] CONVERSION_DTYPES: List[Tuple[Any, Callable[[Any, Any], Tensor]]] = [ # bool -> uint8 as bool -> torch.bool triggers RuntimeError: Unsupported data type for NCCL process group - (bool, partial(to_dtype_tensor, dtype=torch.uint8)), - (int, partial(to_dtype_tensor, dtype=torch.int)), - (float, partial(to_dtype_tensor, dtype=torch.float)), - (np.ndarray, from_numpy), + (bool, partial(torch.tensor, dtype=torch.uint8)), + (int, partial(torch.tensor, dtype=torch.int)), + (float, partial(torch.tensor, dtype=torch.float)), + (np.ndarray, _from_numpy), ] @@ -70,7 +66,7 @@ def __subclasshook__(cls, subclass: Any) -> Union[bool, Any]: return NotImplemented -def move_data_to_device(batch: Any, device: Union[str, torch.device]) -> Any: +def move_data_to_device(batch: Any, device: _DEVICE) -> Any: """Transfers a collection of data to the given device. Any object that defines a method ``to(device)`` will be moved and all other objects in the collection will be left untouched. 
@@ -105,12 +101,13 @@ def batch_to(data: Any) -> Any: return apply_to_collection(batch, dtype=TransferableDataType, function=batch_to) -def convert_to_tensors(data: Any, device: Union[str, torch.device]) -> Any: +def convert_to_tensors(data: Any, device: _DEVICE) -> Any: + # convert non-tensors for src_dtype, conversion_func in CONVERSION_DTYPES: data = apply_to_collection(data, src_dtype, conversion_func, device=device) - def _move_to_device_and_make_contiguous(t: Tensor, device: Union[str, torch.device]) -> Tensor: - return t.to(device).contiguous() + def _move_to_device_and_make_contiguous(t: Tensor, device: _DEVICE) -> Tensor: + return t.to(device).contiguous() # type: ignore[arg-type] - data = apply_to_collection(data, Tensor, _move_to_device_and_make_contiguous, device=device) - return data + # make sure existing tensors are in the correct device, also contiguous + return apply_to_collection(data, Tensor, _move_to_device_and_make_contiguous, device=device) diff --git a/src/lightning_lite/utilities/types.py b/src/lightning_lite/utilities/types.py new file mode 100644 index 0000000000000..900154e69c2eb --- /dev/null +++ b/src/lightning_lite/utilities/types.py @@ -0,0 +1,18 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Union + +import torch + +_DEVICE = Union[torch.device, str, int] diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index c3534aa3d6a73..04aeed3fb4073 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -24,7 +24,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added support for auto wrapping for `DDPFullyShardedStrategy` ([#14383](https://github.com/Lightning-AI/lightning/issues/14383)) -- Integrate the `lightning_utilities` package ([#14475](https://github.com/Lightning-AI/lightning/issues/14475)) +- Integrate the `lightning_utilities` package ([#14475](https://github.com/Lightning-AI/lightning/issues/14475), [#14537](https://github.com/Lightning-AI/lightning/issues/14537)) @@ -84,15 +84,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Deprecated all functions in `pytorch_lightning.utilities.xla_device` in favor of `lightning_lite.utilities.xla_device` ([#14514](https://github.com/Lightning-AI/lightning/pull/14514)) - - Deprecated all functions in `pytorch_lightning.utilities.cloud_io` in favor of `lightning_lite.utilities.cloud_io` ([#14515](https://github.com/Lightning-AI/lightning/pull/14515)) - - Deprecated the functions in `pytorch_lightning.utilities.apply_func` in favor of `lightning_utilities.core.apply_func` ([#14516](https://github.com/Lightning-AI/lightning/pull/14516), [#14537](https://github.com/Lightning-AI/lightning/pull/14537)) - ### Removed - Removed the deprecated `Trainer.training_type_plugin` property in favor of `Trainer.strategy` ([#14011](https://github.com/Lightning-AI/lightning/pull/14011)) diff --git a/src/pytorch_lightning/utilities/apply_func.py b/src/pytorch_lightning/utilities/apply_func.py index f4b1bfedef4b5..e7c8fedb48113 100644 --- a/src/pytorch_lightning/utilities/apply_func.py +++ b/src/pytorch_lightning/utilities/apply_func.py @@ -15,13 +15,13 @@ from typing import Any +import torch from lightning_utilities.core.apply_func import apply_to_collection as new_apply_to_collection from lightning_utilities.core.apply_func import apply_to_collections as new_apply_to_collections +from lightning_lite.utilities.apply_func import _from_numpy from lightning_lite.utilities.apply_func import convert_to_tensors as new_convert_to_tensors -from lightning_lite.utilities.apply_func import from_numpy as new_from_numpy from lightning_lite.utilities.apply_func import move_data_to_device as new_move_data_to_device -from lightning_lite.utilities.apply_func import to_dtype_tensor as new_to_dtype_tensor from lightning_lite.utilities.apply_func import TransferableDataType as NewTransferableDataType from pytorch_lightning.utilities import rank_zero_deprecation from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -62,9 +62,9 @@ def convert_to_tensors(*args: Any, **kwargs: Any) -> Any: def from_numpy(*args: Any, **kwargs: Any) -> Any: rank_zero_deprecation( "`pytorch_lightning.utilities.apply_func.from_numpy` has been deprecated in v1.8.0 and will be" - " removed in v1.10.0. Please use `lightning_lite.utilities.apply_func.from_numpy` instead." + " removed in v1.10.0. Please use `torch.from_numpy().to()` instead." ) - return new_from_numpy(*args, **kwargs) + return _from_numpy(*args, **kwargs) def move_data_to_device(*args: Any, **kwargs: Any) -> Any: @@ -78,9 +78,9 @@ def move_data_to_device(*args: Any, **kwargs: Any) -> Any: def to_dtype_tensor(*args: Any, **kwargs: Any) -> Any: rank_zero_deprecation( "`pytorch_lightning.utilities.apply_func.to_dtype_tensor` has been deprecated in v1.8.0 and will be" - " removed in v1.10.0. Please use `lightning_lite.utilities.apply_func.to_dtype_tensor` instead." + " removed in v1.10.0. Please use `torch.tensor` instead." ) - return new_to_dtype_tensor(*args, **kwargs) + return torch.tensor(*args, **kwargs) class TransferableDataType(NewTransferableDataType): From 7ada52e33c79b85da7f5fd0f69b7e912866701d6 Mon Sep 17 00:00:00 2001 From: Dan Nissenbaum Date: Wed, 7 Sep 2022 05:06:30 -0400 Subject: [PATCH 070/193] Typo in major heading seen by newcomers (#14501) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Typo in major heading seen by newcomers Correct typo in one of the first major headings newcomers to Lightning see when they are considering migrating their code to use Lightning. 
I know this is a trivial change in terms of the text change itself, but I really think it's valuable for one of the most important landing pages that users first investigating Lightning see - to have rock-solid, professional text without obvious typos. Here was a typo in the main heading itself. I suggest fixing it straightaway via this PR. Co-authored-by: Adrian Wälchli --- docs/source-pytorch/starter/converting.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source-pytorch/starter/converting.rst b/docs/source-pytorch/starter/converting.rst index 55ae242e87f6c..925922e5b5eb2 100644 --- a/docs/source-pytorch/starter/converting.rst +++ b/docs/source-pytorch/starter/converting.rst @@ -8,9 +8,9 @@ To enable your code to work with Lightning, perform the following to organize Py -------- -****************************** -1. Keep you Computational Code -****************************** +******************************* +1. Keep Your Computational Code +******************************* Keep your regular nn.Module architecture From 5216c510969af54fd1ec499fbeb691266b510706 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 7 Sep 2022 11:21:48 +0200 Subject: [PATCH 071/193] Integrate `lightning_utilities.core.rank_zero` (#14556) --- .github/workflows/ci-pytorch-test-full.yml | 7 ++ requirements/app/base.txt | 2 +- requirements/lite/base.txt | 2 +- requirements/pytorch/base.txt | 2 +- src/pytorch_lightning/CHANGELOG.md | 6 +- .../callbacks/early_stopping.py | 5 +- .../callbacks/model_checkpoint.py | 2 +- src/pytorch_lightning/cli.py | 3 +- src/pytorch_lightning/core/module.py | 2 +- .../loops/epoch/prediction_epoch_loop.py | 2 +- .../loops/epoch/training_epoch_loop.py | 2 +- .../loops/optimization/optimizer_loop.py | 2 +- .../plugins/precision/deepspeed.py | 2 +- .../plugins/precision/ipu.py | 2 +- src/pytorch_lightning/profilers/pytorch.py | 2 +- src/pytorch_lightning/strategies/deepspeed.py | 2 +- .../trainer/connectors/data_connector.py | 3 +- .../connectors/logger_connector/result.py | 3 +- src/pytorch_lightning/utilities/data.py | 2 +- .../utilities/model_summary/model_summary.py | 2 +- src/pytorch_lightning/utilities/rank_zero.py | 71 ++++--------------- src/pytorch_lightning/utilities/seed.py | 5 +- src/pytorch_lightning/utilities/warnings.py | 26 +------ tests/tests_pytorch/run_standalone_tasks.sh | 4 -- .../tests_pytorch/utilities/test_rank_zero.py | 12 +--- .../tests_pytorch/utilities/test_warnings.py | 22 +++--- 26 files changed, 63 insertions(+), 132 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index fbdc81b91c0ed..b50dae0857587 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -163,6 +163,13 @@ jobs: if: ${{ (steps.skip.outputs.continue == '1') }} run: python requirements/pytorch/check-avail-extras.py + - name: Testing Warnings + # the stacklevel can only be set on >=3.7 + if: ${{ (steps.skip.outputs.continue == '1') && ( matrix.python-version != '3.7' ) }} + working-directory: tests/tests_pytorch + # needs to run outside of `pytest` + run: python utilities/test_warnings.py + - name: Testing PyTorch if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: tests/tests_pytorch diff --git a/requirements/app/base.txt b/requirements/app/base.txt index 3824e10263aff..47e70a4682b69 100644 --- a/requirements/app/base.txt +++ b/requirements/app/base.txt @@ -7,4 +7,4 @@ s3fs>=2022.5.0, <=2022.7.1 
croniter>=1.3.0, <1.4.0 # strict; TODO: for now until we find something more robust. traitlets<5.2.0 # Traitlets 5.2.X fails: https://github.com/ipython/traitlets/issues/741 arrow>=1.2.0, <=1.2.2 -lightning-utilities==0.2.* +lightning-utilities==0.3.* diff --git a/requirements/lite/base.txt b/requirements/lite/base.txt index 0ab00d4df2b77..4dbc213afe8b6 100644 --- a/requirements/lite/base.txt +++ b/requirements/lite/base.txt @@ -5,4 +5,4 @@ torch>=1.9.*, <1.13.0 fsspec[http]>=2021.05.0, !=2021.06.0, <2022.6.0 packaging>=17.0, <=21.3 typing-extensions>=4.0.0, <4.3.1 -lightning-utilities==0.2.* +lightning-utilities==0.3.* diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index 13105d73fe756..7b7ef826811ab 100644 --- a/requirements/pytorch/base.txt +++ b/requirements/pytorch/base.txt @@ -10,4 +10,4 @@ tensorboard>=2.9.1, <2.11.0 torchmetrics>=0.7.0, <0.9.3 # needed for using fixed compare_version packaging>=17.0, <=21.3 typing-extensions>=4.0.0, <4.3.1 -lightning-utilities==0.2.* +lightning-utilities==0.3.* diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 04aeed3fb4073..dab90f512c946 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -24,8 +24,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added support for auto wrapping for `DDPFullyShardedStrategy` ([#14383](https://github.com/Lightning-AI/lightning/issues/14383)) -- Integrate the `lightning_utilities` package ([#14475](https://github.com/Lightning-AI/lightning/issues/14475), [#14537](https://github.com/Lightning-AI/lightning/issues/14537)) - +- Integrate the `lightning_utilities` package ( + [#14475](https://github.com/Lightning-AI/lightning/issues/14475), + [#14537](https://github.com/Lightning-AI/lightning/issues/14537), + [#14556](https://github.com/Lightning-AI/lightning/issues/14556)) ### Changed diff --git a/src/pytorch_lightning/callbacks/early_stopping.py b/src/pytorch_lightning/callbacks/early_stopping.py index 79ba68e194586..30ab05c76ebf2 100644 --- a/src/pytorch_lightning/callbacks/early_stopping.py +++ b/src/pytorch_lightning/callbacks/early_stopping.py @@ -23,12 +23,13 @@ import numpy as np import torch +from lightning_utilities.core.rank_zero import rank_prefixed_message from torch import Tensor import pytorch_lightning as pl from pytorch_lightning.callbacks.callback import Callback from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.rank_zero import _get_rank, _rank_prefixed_message, rank_zero_warn +from pytorch_lightning.utilities.rank_zero import _get_rank, rank_zero_warn log = logging.getLogger(__name__) @@ -262,6 +263,6 @@ def _log_info(trainer: Optional["pl.Trainer"], message: str, log_rank_zero_only: rank = _get_rank(trainer) if trainer is not None and trainer.world_size <= 1: rank = None - message = _rank_prefixed_message(message, rank) + message = rank_prefixed_message(message, rank) if rank is None or not log_rank_zero_only or rank == 0: log.info(message) diff --git a/src/pytorch_lightning/callbacks/model_checkpoint.py b/src/pytorch_lightning/callbacks/model_checkpoint.py index a442459769ab7..a80c82447c069 100644 --- a/src/pytorch_lightning/callbacks/model_checkpoint.py +++ b/src/pytorch_lightning/callbacks/model_checkpoint.py @@ -31,6 +31,7 @@ import numpy as np import torch import yaml +from lightning_utilities.core.rank_zero import WarningCache from torch import Tensor import pytorch_lightning as pl @@ -39,7 
+40,6 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_info, rank_zero_warn from pytorch_lightning.utilities.types import _PATH, STEP_OUTPUT -from pytorch_lightning.utilities.warnings import WarningCache log = logging.getLogger(__name__) warning_cache = WarningCache() diff --git a/src/pytorch_lightning/cli.py b/src/pytorch_lightning/cli.py index ac03263f63613..ee53236508a4a 100644 --- a/src/pytorch_lightning/cli.py +++ b/src/pytorch_lightning/cli.py @@ -18,6 +18,7 @@ import torch from lightning_utilities.core.imports import RequirementCache +from lightning_utilities.core.rank_zero import _warn from torch.optim import Optimizer import pytorch_lightning as pl @@ -25,7 +26,7 @@ from pytorch_lightning import Callback, LightningDataModule, LightningModule, seed_everything, Trainer from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.model_helpers import is_overridden -from pytorch_lightning.utilities.rank_zero import _warn, rank_zero_deprecation, rank_zero_warn +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_warn _JSONARGPARSE_SIGNATURES_AVAILABLE = RequirementCache("jsonargparse[signatures]>=4.12.0") diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py index a0512a719d276..6776f8ab95bf1 100644 --- a/src/pytorch_lightning/core/module.py +++ b/src/pytorch_lightning/core/module.py @@ -26,6 +26,7 @@ import torch from lightning_utilities.core.apply_func import apply_to_collection +from lightning_utilities.core.rank_zero import WarningCache from torch import ScriptModule, Tensor from torch.nn import Module from torch.optim.optimizer import Optimizer @@ -56,7 +57,6 @@ LRSchedulerTypeUnion, STEP_OUTPUT, ) -from pytorch_lightning.utilities.warnings import WarningCache warning_cache = WarningCache() log = logging.getLogger(__name__) diff --git a/src/pytorch_lightning/loops/epoch/prediction_epoch_loop.py b/src/pytorch_lightning/loops/epoch/prediction_epoch_loop.py index cd47f31870062..3bad23fc770a1 100644 --- a/src/pytorch_lightning/loops/epoch/prediction_epoch_loop.py +++ b/src/pytorch_lightning/loops/epoch/prediction_epoch_loop.py @@ -2,12 +2,12 @@ from typing import Any, Dict, Iterator, List, Tuple import torch +from lightning_utilities.core.rank_zero import WarningCache from lightning_lite.utilities.apply_func import move_data_to_device from pytorch_lightning.loops.loop import Loop from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper from pytorch_lightning.trainer.progress import Progress -from pytorch_lightning.utilities.warnings import WarningCache warning_cache = WarningCache() diff --git a/src/pytorch_lightning/loops/epoch/training_epoch_loop.py b/src/pytorch_lightning/loops/epoch/training_epoch_loop.py index c03ab4ec68c10..a633a7edf6309 100644 --- a/src/pytorch_lightning/loops/epoch/training_epoch_loop.py +++ b/src/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -18,6 +18,7 @@ import numpy as np import torch from lightning_utilities.core.apply_func import apply_to_collection +from lightning_utilities.core.rank_zero import WarningCache import pytorch_lightning as pl from pytorch_lightning import loops # import as loops to avoid circular imports @@ -33,7 +34,6 @@ from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.rank_zero import rank_zero_warn from 
pytorch_lightning.utilities.signature_utils import is_param_in_hook_signature -from pytorch_lightning.utilities.warnings import WarningCache _OUTPUTS_TYPE = List[_BATCH_OUTPUTS_TYPE] diff --git a/src/pytorch_lightning/loops/optimization/optimizer_loop.py b/src/pytorch_lightning/loops/optimization/optimizer_loop.py index 931f0c57a745b..57d728544825f 100644 --- a/src/pytorch_lightning/loops/optimization/optimizer_loop.py +++ b/src/pytorch_lightning/loops/optimization/optimizer_loop.py @@ -16,6 +16,7 @@ from typing import Any, Callable, Dict, List, Optional, OrderedDict, Tuple, Union import torch +from lightning_utilities.core.rank_zero import WarningCache from torch import Tensor from torch.optim import Optimizer @@ -32,7 +33,6 @@ from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.types import STEP_OUTPUT -from pytorch_lightning.utilities.warnings import WarningCache @dataclass diff --git a/src/pytorch_lightning/plugins/precision/deepspeed.py b/src/pytorch_lightning/plugins/precision/deepspeed.py index 92ed82774e55e..1a83e9538d688 100644 --- a/src/pytorch_lightning/plugins/precision/deepspeed.py +++ b/src/pytorch_lightning/plugins/precision/deepspeed.py @@ -14,6 +14,7 @@ from typing import Any, Callable, Optional, TYPE_CHECKING, Union from lightning_utilities.core.imports import RequirementCache +from lightning_utilities.core.rank_zero import WarningCache from torch import Tensor from torch.nn import Module from torch.optim import LBFGS, Optimizer @@ -25,7 +26,6 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _APEX_AVAILABLE from pytorch_lightning.utilities.model_helpers import is_overridden -from pytorch_lightning.utilities.warnings import WarningCache _DEEPSPEED_AVAILABLE = RequirementCache("deepspeed") if TYPE_CHECKING and _DEEPSPEED_AVAILABLE: diff --git a/src/pytorch_lightning/plugins/precision/ipu.py b/src/pytorch_lightning/plugins/precision/ipu.py index 67e5e373e9f52..34ad358793fc4 100644 --- a/src/pytorch_lightning/plugins/precision/ipu.py +++ b/src/pytorch_lightning/plugins/precision/ipu.py @@ -13,6 +13,7 @@ # limitations under the License. 
from typing import Any, Callable, Optional, Union +from lightning_utilities.core.rank_zero import WarningCache from torch.nn import Module from torch.optim import LBFGS, Optimizer @@ -22,7 +23,6 @@ from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.model_helpers import is_overridden -from pytorch_lightning.utilities.warnings import WarningCache warning_cache = WarningCache() diff --git a/src/pytorch_lightning/profilers/pytorch.py b/src/pytorch_lightning/profilers/pytorch.py index 9b843dccbf2a8..f410230668a9b 100644 --- a/src/pytorch_lightning/profilers/pytorch.py +++ b/src/pytorch_lightning/profilers/pytorch.py @@ -20,6 +20,7 @@ from typing import Any, Callable, Dict, List, Optional, Type, TYPE_CHECKING, Union import torch +from lightning_utilities.core.rank_zero import WarningCache from torch import nn, Tensor from torch.autograd.profiler import record_function @@ -28,7 +29,6 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _KINETO_AVAILABLE from pytorch_lightning.utilities.rank_zero import rank_zero_warn -from pytorch_lightning.utilities.warnings import WarningCache if TYPE_CHECKING: from torch.autograd.profiler import EventList diff --git a/src/pytorch_lightning/strategies/deepspeed.py b/src/pytorch_lightning/strategies/deepspeed.py index 695fa6f8fc19b..1d1c687507690 100644 --- a/src/pytorch_lightning/strategies/deepspeed.py +++ b/src/pytorch_lightning/strategies/deepspeed.py @@ -24,6 +24,7 @@ import torch from lightning_utilities.core.apply_func import apply_to_collection from lightning_utilities.core.imports import RequirementCache +from lightning_utilities.core.rank_zero import WarningCache from torch import Tensor from torch.nn import Module from torch.optim import Optimizer @@ -50,7 +51,6 @@ from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_info, rank_zero_warn from pytorch_lightning.utilities.seed import reset_seed from pytorch_lightning.utilities.types import _LRScheduler, _PATH, LRSchedulerConfig, ReduceLROnPlateau, STEP_OUTPUT -from pytorch_lightning.utilities.warnings import WarningCache warning_cache = WarningCache() diff --git a/src/pytorch_lightning/trainer/connectors/data_connector.py b/src/pytorch_lightning/trainer/connectors/data_connector.py index c6733562d0289..bfb26228e3756 100644 --- a/src/pytorch_lightning/trainer/connectors/data_connector.py +++ b/src/pytorch_lightning/trainer/connectors/data_connector.py @@ -18,6 +18,7 @@ from weakref import proxy from lightning_utilities.core.apply_func import apply_to_collection +from lightning_utilities.core.rank_zero import WarningCache from torch.utils.data import BatchSampler, DataLoader, Sampler, SequentialSampler from torch.utils.data.distributed import DistributedSampler @@ -41,7 +42,7 @@ from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.rank_zero import rank_zero_warn from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS -from pytorch_lightning.utilities.warnings import PossibleUserWarning, WarningCache +from pytorch_lightning.utilities.warnings import PossibleUserWarning warning_cache = WarningCache() diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py index 4736db1d12d27..9f1be4ba4bed6 100644 --- 
a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -17,6 +17,7 @@ import torch from lightning_utilities.core.apply_func import apply_to_collection, apply_to_collections +from lightning_utilities.core.rank_zero import WarningCache from torch import Tensor from torchmetrics import Metric from typing_extensions import TypedDict @@ -30,7 +31,7 @@ from pytorch_lightning.utilities.memory import recursive_detach from pytorch_lightning.utilities.metrics import metrics_to_scalars from pytorch_lightning.utilities.rank_zero import rank_zero_warn -from pytorch_lightning.utilities.warnings import PossibleUserWarning, WarningCache +from pytorch_lightning.utilities.warnings import PossibleUserWarning _IN_METRIC = Union[Metric, Tensor] # Do not include scalars as they were converted to tensors _OUT_METRIC = Union[Tensor, Dict[str, Tensor]] diff --git a/src/pytorch_lightning/utilities/data.py b/src/pytorch_lightning/utilities/data.py index f65ef47d694b5..41c8ddc59b582 100644 --- a/src/pytorch_lightning/utilities/data.py +++ b/src/pytorch_lightning/utilities/data.py @@ -22,6 +22,7 @@ import torch from lightning_utilities.core.apply_func import is_dataclass_instance +from lightning_utilities.core.rank_zero import WarningCache from torch import Tensor from torch.utils.data import ( BatchSampler, @@ -42,7 +43,6 @@ from pytorch_lightning.utilities.meta import _get_all_subclasses from pytorch_lightning.utilities.rank_zero import rank_zero_warn from pytorch_lightning.utilities.seed import pl_worker_init_function -from pytorch_lightning.utilities.warnings import WarningCache BType = Union[Tensor, str, Mapping[Any, "BType"], Iterable["BType"]] diff --git a/src/pytorch_lightning/utilities/model_summary/model_summary.py b/src/pytorch_lightning/utilities/model_summary/model_summary.py index cb2cc0d64ba71..9cc59aaab25f0 100644 --- a/src/pytorch_lightning/utilities/model_summary/model_summary.py +++ b/src/pytorch_lightning/utilities/model_summary/model_summary.py @@ -20,11 +20,11 @@ import numpy as np import torch import torch.nn as nn +from lightning_utilities.core.rank_zero import WarningCache from torch import Tensor from torch.utils.hooks import RemovableHandle import pytorch_lightning as pl -from pytorch_lightning.utilities.warnings import WarningCache log = logging.getLogger(__name__) warning_cache = WarningCache() diff --git a/src/pytorch_lightning/utilities/rank_zero.py b/src/pytorch_lightning/utilities/rank_zero.py index 21f8ca7207abd..156c7c98c5996 100644 --- a/src/pytorch_lightning/utilities/rank_zero.py +++ b/src/pytorch_lightning/utilities/rank_zero.py @@ -15,26 +15,22 @@ """Utilities that can be used for calling functions on a particular rank.""" import logging import os -import warnings -from functools import partial, wraps -from platform import python_version -from typing import Any, Callable, Optional, Union - -import pytorch_lightning as pl - -log = logging.getLogger(__name__) +from typing import Optional +import lightning_utilities.core.rank_zero as rank_zero_module -def rank_zero_only(fn: Callable) -> Callable: - """Function that can be used as a decorator to enable a function/method being called only on global rank 0.""" +# note: we want to keep these indirections so the `rank_zero_only.rank` is set (on import) for PL users +from lightning_utilities.core.rank_zero import ( # noqa: F401 + rank_zero_debug, + rank_zero_deprecation, + rank_zero_info, + rank_zero_only, + rank_zero_warn, +) - @wraps(fn) - def 
wrapped_fn(*args: Any, **kwargs: Any) -> Optional[Any]: - if rank_zero_only.rank == 0: - return fn(*args, **kwargs) - return None +import pytorch_lightning as pl - return wrapped_fn +rank_zero_module.log = logging.getLogger(__name__) def _get_rank(trainer: Optional["pl.Trainer"] = None) -> Optional[int]: @@ -55,49 +51,8 @@ def _get_rank(trainer: Optional["pl.Trainer"] = None) -> Optional[int]: rank_zero_only.rank = getattr(rank_zero_only, "rank", _get_rank() or 0) -def _info(*args: Any, stacklevel: int = 2, **kwargs: Any) -> None: - if python_version() >= "3.8.0": - kwargs["stacklevel"] = stacklevel - log.info(*args, **kwargs) - - -def _debug(*args: Any, stacklevel: int = 2, **kwargs: Any) -> None: - if python_version() >= "3.8.0": - kwargs["stacklevel"] = stacklevel - log.debug(*args, **kwargs) - - -@rank_zero_only -def rank_zero_debug(*args: Any, stacklevel: int = 4, **kwargs: Any) -> None: - """Function used to log debug-level messages only on global rank 0.""" - _debug(*args, stacklevel=stacklevel, **kwargs) - - -@rank_zero_only -def rank_zero_info(*args: Any, stacklevel: int = 4, **kwargs: Any) -> None: - """Function used to log info-level messages only on global rank 0.""" - _info(*args, stacklevel=stacklevel, **kwargs) - - -def _warn(message: Union[str, Warning], stacklevel: int = 2, **kwargs: Any) -> None: - warnings.warn(message, stacklevel=stacklevel, **kwargs) - - -@rank_zero_only -def rank_zero_warn(message: Union[str, Warning], stacklevel: int = 4, **kwargs: Any) -> None: - """Function used to log warn-level messages only on global rank 0.""" - _warn(message, stacklevel=stacklevel, **kwargs) - - class LightningDeprecationWarning(DeprecationWarning): """Deprecation warnings raised by PyTorch Lightning.""" -rank_zero_deprecation = partial(rank_zero_warn, category=LightningDeprecationWarning) - - -def _rank_prefixed_message(message: str, rank: Optional[int]) -> str: - if rank is not None: - # specify the rank of the process being logged - return f"[rank: {rank}] {message}" - return message +rank_zero_module.rank_zero_deprecation_category = LightningDeprecationWarning diff --git a/src/pytorch_lightning/utilities/seed.py b/src/pytorch_lightning/utilities/seed.py index cc9ff6673ef36..5c33214cf405a 100644 --- a/src/pytorch_lightning/utilities/seed.py +++ b/src/pytorch_lightning/utilities/seed.py @@ -23,8 +23,9 @@ import numpy as np import torch +from lightning_utilities.core.rank_zero import rank_prefixed_message -from pytorch_lightning.utilities.rank_zero import _get_rank, _rank_prefixed_message, rank_zero_only, rank_zero_warn +from pytorch_lightning.utilities.rank_zero import _get_rank, rank_zero_only, rank_zero_warn log = logging.getLogger(__name__) @@ -66,7 +67,7 @@ def seed_everything(seed: Optional[int] = None, workers: bool = False) -> int: rank_zero_warn(f"{seed} is not in bounds, numpy accepts from {min_seed_value} to {max_seed_value}") seed = _select_seed_randomly(min_seed_value, max_seed_value) - log.info(_rank_prefixed_message(f"Global seed set to {seed}", _get_rank())) + log.info(rank_prefixed_message(f"Global seed set to {seed}", _get_rank())) os.environ["PL_GLOBAL_SEED"] = str(seed) random.seed(seed) np.random.seed(seed) diff --git a/src/pytorch_lightning/utilities/warnings.py b/src/pytorch_lightning/utilities/warnings.py index 45b382bc92214..ae608bdbccce7 100644 --- a/src/pytorch_lightning/utilities/warnings.py +++ b/src/pytorch_lightning/utilities/warnings.py @@ -13,36 +13,12 @@ # limitations under the License. 
"""Warning-related utilities.""" import warnings -from typing import Any -from pytorch_lightning.utilities.rank_zero import ( - LightningDeprecationWarning, - rank_zero_deprecation, - rank_zero_info, - rank_zero_warn, -) +from pytorch_lightning.utilities.rank_zero import LightningDeprecationWarning # enable our warnings - warnings.simplefilter("default", category=LightningDeprecationWarning) class PossibleUserWarning(UserWarning): """Warnings that could be false positives.""" - - -class WarningCache(set): - def warn(self, message: str, stacklevel: int = 5, **kwargs: Any) -> None: - if message not in self: - self.add(message) - rank_zero_warn(message, stacklevel=stacklevel, **kwargs) - - def deprecation(self, message: str, stacklevel: int = 5, **kwargs: Any) -> None: - if message not in self: - self.add(message) - rank_zero_deprecation(message, stacklevel=stacklevel, **kwargs) - - def info(self, message: str, stacklevel: int = 5, **kwargs: Any) -> None: - if message not in self: - self.add(message) - rank_zero_info(message, stacklevel=stacklevel, **kwargs) diff --git a/tests/tests_pytorch/run_standalone_tasks.sh b/tests/tests_pytorch/run_standalone_tasks.sh index 698ed7863ab96..0abe25d76c638 100644 --- a/tests/tests_pytorch/run_standalone_tasks.sh +++ b/tests/tests_pytorch/run_standalone_tasks.sh @@ -20,10 +20,6 @@ if nvcc --version; then nvprof --profile-from-start off -o trace_name.prof -- python -m coverage run --source pytorch_lightning --append -m pytest --no-header profilers/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx fi -# needs to run outside of `pytest` -echo "Running utilities/test_warnings.py" -python utilities/test_warnings.py - # test deadlock is properly handled with TorchElastic. echo "Running plugins/environments/torch_elastic_deadlock.py" LOGS=$(PL_RUN_STANDALONE_TESTS=1 PL_RECONCILE_PROCESS=1 python -m torch.distributed.run --nproc_per_node=2 --max_restarts 0 -m coverage run --source pytorch_lightning -a plugins/environments/torch_elastic_deadlock.py | grep "SUCCEEDED") diff --git a/tests/tests_pytorch/utilities/test_rank_zero.py b/tests/tests_pytorch/utilities/test_rank_zero.py index 76fa27926aa39..c4c15b28e5b73 100644 --- a/tests/tests_pytorch/utilities/test_rank_zero.py +++ b/tests/tests_pytorch/utilities/test_rank_zero.py @@ -17,7 +17,7 @@ import pytest -from pytorch_lightning.utilities.rank_zero import _get_rank, _rank_prefixed_message +from pytorch_lightning.utilities.rank_zero import _get_rank @pytest.mark.parametrize( @@ -38,6 +38,7 @@ def test_rank_zero_known_environment_variables(env_vars, expected): """Test that rank environment variables are properly checked for rank_zero_only.""" with mock.patch.dict(os.environ, env_vars): # force module reload to re-trigger the rank_zero_only.rank global computation + sys.modules.pop("lightning_utilities.core.rank_zero", None) sys.modules.pop("pytorch_lightning.utilities.rank_zero", None) from pytorch_lightning.utilities.rank_zero import rank_zero_only @@ -61,12 +62,3 @@ def test_rank_zero_priority(environ, expected_rank): """Test the priority in which the rank gets determined when multiple environment variables are available.""" with mock.patch.dict(os.environ, environ): assert _get_rank() == expected_rank - - -@pytest.mark.parametrize("env_vars", [{"RANK": "0"}, {"RANK": "1"}, {"RANK": "4"}]) -def test_rank_prefixed_message_with_env_vars(env_vars): - with mock.patch.dict(os.environ, env_vars, clear=True): - rank = _get_rank() - message = _rank_prefixed_message("bar", rank) - - assert message == f"[rank: {rank}] 
bar" diff --git a/tests/tests_pytorch/utilities/test_warnings.py b/tests/tests_pytorch/utilities/test_warnings.py index 3f770ffe2d86f..223cd4e59f75f 100644 --- a/tests/tests_pytorch/utilities/test_warnings.py +++ b/tests/tests_pytorch/utilities/test_warnings.py @@ -15,16 +15,14 @@ Needs to be run outside of `pytest` as it captures all the warnings. """ -import os from contextlib import redirect_stderr from io import StringIO -from pytorch_lightning.utilities.rank_zero import _warn, rank_zero_deprecation, rank_zero_warn -from pytorch_lightning.utilities.warnings import WarningCache +from lightning_utilities.core.rank_zero import _warn, WarningCache -standalone = os.getenv("PL_RUN_STANDALONE_TESTS", "0") == "1" -if standalone and __name__ == "__main__": +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_warn +if __name__ == "__main__": stderr = StringIO() # recording with redirect_stderr(stderr): @@ -41,16 +39,16 @@ cache.deprecation("test7") output = stderr.getvalue() - assert "test_warnings.py:31: UserWarning: test1" in output - assert "test_warnings.py:32: DeprecationWarning: test2" in output + assert "test_warnings.py:29: UserWarning: test1" in output + assert "test_warnings.py:30: DeprecationWarning: test2" in output - assert "test_warnings.py:34: UserWarning: test3" in output - assert "test_warnings.py:35: DeprecationWarning: test4" in output + assert "test_warnings.py:32: UserWarning: test3" in output + assert "test_warnings.py:33: DeprecationWarning: test4" in output - assert "test_warnings.py:37: LightningDeprecationWarning: test5" in output + assert "test_warnings.py:35: LightningDeprecationWarning: test5" in output - assert "test_warnings.py:40: UserWarning: test6" in output - assert "test_warnings.py:41: LightningDeprecationWarning: test7" in output + assert "test_warnings.py:38: UserWarning: test6" in output + assert "test_warnings.py:39: LightningDeprecationWarning: test7" in output # check that logging is properly configured import logging From dbd7703e88c39421d740c010cac7f0bb23981c12 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 7 Sep 2022 09:25:24 +0000 Subject: [PATCH 072/193] Update fsspec[http] requirement from !=2021.06.0,<2022.6.0,>=2021.05.0 to >=2021.05.0,!=2021.06.0,<2022.8.0 in /requirements (#14288) Update fsspec[http] requirement in /requirements Updates the requirements on [fsspec[http]](https://github.com/fsspec/filesystem_spec) to permit the latest version. - [Release notes](https://github.com/fsspec/filesystem_spec/releases) - [Commits](https://github.com/fsspec/filesystem_spec/compare/2021.05.0...2022.7.1) --- updated-dependencies: - dependency-name: fsspec[http] dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements/pytorch/base.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index 7b7ef826811ab..c7301c08341e6 100644 --- a/requirements/pytorch/base.txt +++ b/requirements/pytorch/base.txt @@ -5,7 +5,7 @@ numpy>=1.17.2, <1.23.1 torch>=1.9.*, <1.13.0 tqdm>=4.57.0, <4.65.0 PyYAML>=5.4, <=6.0 -fsspec[http]>=2021.05.0, !=2021.06.0, <2022.6.0 +fsspec[http]>=2021.05.0, !=2021.06.0, <2022.8.0 tensorboard>=2.9.1, <2.11.0 torchmetrics>=0.7.0, <0.9.3 # needed for using fixed compare_version packaging>=17.0, <=21.3 From 6de2b0b5284eb3c27583e4d8dc9017675a023b28 Mon Sep 17 00:00:00 2001 From: Kushashwa Ravi Shrimali Date: Wed, 7 Sep 2022 18:35:58 +0530 Subject: [PATCH 073/193] Docs [Fix]: use bytes instead of strings while writing (#14505) * Fix doc examples: use bytes instead of strings while writing * Add a note (comment) * nit * Update any_server.rst * Update docs/source-app/workflows/add_server/any_server.rst * Update docs/source-app/workflows/add_server/any_server.rst * Update docs/source-app/workflows/add_server/any_server.rst * Apply suggestions from code review Co-authored-by: Laverne Henderson Co-authored-by: Ethan Harris Co-authored-by: thomas chaton --- docs/source-app/workflows/add_server/any_server.rst | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/source-app/workflows/add_server/any_server.rst b/docs/source-app/workflows/add_server/any_server.rst index 398951276c0a5..2e26b880e8ba9 100644 --- a/docs/source-app/workflows/add_server/any_server.rst +++ b/docs/source-app/workflows/add_server/any_server.rst @@ -31,7 +31,8 @@ Any server that listens on a port, can be enabled via a work. For example, here' def do_GET(self): self.send_response(HTTPStatus.OK) self.end_headers() - html = "
Hello lit world" + # Data must be passed as bytes to the `self.wfile.write` call + html = b"Hello lit world" self.wfile.write(html)
@@ -52,7 +53,8 @@ To enable the server inside the component, start the server in the run method an def do_GET(self): self.send_response(HTTPStatus.OK) self.end_headers() - html = "Hello lit world" + # Data must be passed as bytes to the `self.wfile.write` call + html = b"Hello lit world" self.wfile.write(html)
@@ -81,7 +83,8 @@ In this case, we render the ``LitServer`` output in the ``home`` tab of the appl def do_GET(self): self.send_response(HTTPStatus.OK) self.end_headers() - html = "Hello lit world" + # Data must be passed as bytes to the `self.wfile.write` call + html = b"
Hello lit world " self.wfile.write(html) From 8c4184c105f93f2fd583e405d1a908b9f72dd312 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 7 Sep 2022 15:14:14 +0200 Subject: [PATCH 074/193] Integrate with `lightning_utilities.core.enums` (#14558) --- .actions/setup_tools.py | 9 ++- src/pytorch_lightning/utilities/enums.py | 66 +++++---------------- tests/tests_pytorch/utilities/test_enums.py | 12 +--- 3 files changed, 23 insertions(+), 64 deletions(-) diff --git a/.actions/setup_tools.py b/.actions/setup_tools.py index 3706dda307a84..c434f42569ee8 100644 --- a/.actions/setup_tools.py +++ b/.actions/setup_tools.py @@ -177,8 +177,15 @@ def replace_block_with_imports(lines: List[str], import_path: str, kword: str = >>> lines = replace_block_with_imports(lines, import_path, "def") """ body, tracking, skip_offset = [], False, 0 - for ln in lines: + for i, ln in enumerate(lines): offset = len(ln) - len(ln.lstrip()) + + # support for defining a class with this condition + if ln.startswith("if TYPE_CHECKING") or ln.startswith("if typing.TYPE_CHECKING"): + # dedent the next line + lines[i + 1] = lines[i + 1].lstrip() + continue + # in case of mating the class args are multi-line if tracking and ln and offset <= skip_offset and not any(ln.lstrip().startswith(c) for c in ")]"): tracking = False diff --git a/src/pytorch_lightning/utilities/enums.py b/src/pytorch_lightning/utilities/enums.py index 06d616f87259f..03d9b8782e5ab 100644 --- a/src/pytorch_lightning/utilities/enums.py +++ b/src/pytorch_lightning/utilities/enums.py @@ -15,52 +15,32 @@ from __future__ import annotations import os -from enum import Enum +from typing import TYPE_CHECKING -from pytorch_lightning.utilities.exceptions import MisconfigurationException +from lightning_utilities.core.enums import StrEnum +from pytorch_lightning.utilities.exceptions import MisconfigurationException -class LightningEnum(str, Enum): - """Type of any enumerator with allowed comparison to string invariant to cases.""" +if TYPE_CHECKING: + from enum import Enum - @classmethod - def from_str(cls, value: str) -> LightningEnum | None: - statuses = cls.__members__.keys() - for st in statuses: - if st.lower() == value.lower(): - return cls[st] - return None + # re-defined because `mypy` infers `StrEnum` as `Any` + class LightningEnum(StrEnum, Enum): + ... - def __eq__(self, other: object) -> bool: - other = other.value if isinstance(other, Enum) else str(other) - return self.value.lower() == other.lower() - - def __hash__(self) -> int: - # re-enable hashtable so it can be used as a dict key or in a set - # example: set(LightningEnum) - return hash(self.value.lower()) +else: + LightningEnum = StrEnum class AMPType(LightningEnum): - """Type of Automatic Mixed Precission used for training. - - >>> # you can match the type with string - >>> AMPType.APEX == 'apex' - True - """ + """Type of Automatic Mixed Precission used for training.""" APEX = "apex" NATIVE = "native" class PrecisionType(LightningEnum): - """Type of precision used. - - >>> PrecisionType.HALF == 16 - True - >>> PrecisionType.HALF in (16, "16") - True - """ + """Type of precision used.""" HALF = "16" FLOAT = "32" @@ -106,15 +86,7 @@ class AutoRestartBatchKeys(LightningEnum): class _StrategyType(LightningEnum): - """Define type of training strategy. 
- - >>> # you can match the type with string - >>> _StrategyType.DDP == 'DDP' - True - >>> # which is case invariant - >>> _StrategyType.DP in ('dp', ) - True - """ + """Define type of training strategy.""" DP = "dp" DDP = "ddp" @@ -144,17 +116,7 @@ def is_interactive_compatible(self) -> bool: class _AcceleratorType(LightningEnum): - """Define Accelerator type by its nature. - - >>> _AcceleratorType.CPU == _AcceleratorType.from_str('cpu') - True - >>> # you can match the type with string - >>> _AcceleratorType.CUDA == 'CUDA' - True - >>> # which is case invariant - >>> _AcceleratorType.TPU in ('tpu', 'CPU') - True - """ + """Define Accelerator type by its nature.""" CPU = "CPU" CUDA = "CUDA" diff --git a/tests/tests_pytorch/utilities/test_enums.py b/tests/tests_pytorch/utilities/test_enums.py index dcd4410952308..1519e177217bb 100644 --- a/tests/tests_pytorch/utilities/test_enums.py +++ b/tests/tests_pytorch/utilities/test_enums.py @@ -11,17 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from pytorch_lightning.utilities.enums import _AcceleratorType, GradClipAlgorithmType, PrecisionType - - -def test_consistency(): - assert _AcceleratorType.TPU not in ("GPU", "CPU") - assert _AcceleratorType.TPU in ("TPU", "CPU") - assert _AcceleratorType.TPU in ("tpu", "CPU") - assert _AcceleratorType.TPU not in {"GPU", "CPU"} - # hash cannot be case invariant - assert _AcceleratorType.TPU not in {"TPU", "CPU"} - assert _AcceleratorType.TPU in {"tpu", "CPU"} +from pytorch_lightning.utilities.enums import GradClipAlgorithmType, PrecisionType def test_precision_supported_types(): From bda70a2f2f95c9ffe7ee305fed569440d104d1ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 7 Sep 2022 15:34:46 +0200 Subject: [PATCH 075/193] Integrate lightning_utilities `get_all_subclasses` (#14575) --- src/pytorch_lightning/CHANGELOG.md | 4 +++- src/pytorch_lightning/utilities/cli.py | 14 +++++++------- src/pytorch_lightning/utilities/data.py | 4 ++-- src/pytorch_lightning/utilities/meta.py | 16 ++-------------- 4 files changed, 14 insertions(+), 24 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index dab90f512c946..4a5c72cc5c51e 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -27,7 +27,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Integrate the `lightning_utilities` package ( [#14475](https://github.com/Lightning-AI/lightning/issues/14475), [#14537](https://github.com/Lightning-AI/lightning/issues/14537), - [#14556](https://github.com/Lightning-AI/lightning/issues/14556)) + [#14556](https://github.com/Lightning-AI/lightning/issues/14556), + [#14558](https://github.com/Lightning-AI/lightning/issues/14558), + [#14575](https://github.com/Lightning-AI/lightning/issues/14575)) ### Changed diff --git a/src/pytorch_lightning/utilities/cli.py b/src/pytorch_lightning/utilities/cli.py index 8af919b78ce93..9916fd75cb2a6 100644 --- a/src/pytorch_lightning/utilities/cli.py +++ b/src/pytorch_lightning/utilities/cli.py @@ -18,11 +18,11 @@ from typing import Any, Generator, List, Optional, Tuple, Type import torch +from lightning_utilities.core.inheritance import get_all_subclasses from torch.optim import Optimizer import pytorch_lightning as pl import pytorch_lightning.cli as new_cli -from pytorch_lightning.utilities.meta import _get_all_subclasses from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation _deprecate_registry_message = ( @@ -108,17 +108,17 @@ def _populate_registries(subclasses: bool) -> None: # Remove in v1.9 if subclasses: rank_zero_deprecation(_deprecate_auto_registry_message) # this will register any subclasses from all loaded modules including userland - for cls in _get_all_subclasses(torch.optim.Optimizer): + for cls in get_all_subclasses(torch.optim.Optimizer): OPTIMIZER_REGISTRY(cls, show_deprecation=False) - for cls in _get_all_subclasses(torch.optim.lr_scheduler._LRScheduler): + for cls in get_all_subclasses(torch.optim.lr_scheduler._LRScheduler): LR_SCHEDULER_REGISTRY(cls, show_deprecation=False) - for cls in _get_all_subclasses(pl.Callback): + for cls in get_all_subclasses(pl.Callback): CALLBACK_REGISTRY(cls, show_deprecation=False) - for cls in _get_all_subclasses(pl.LightningModule): + for cls in get_all_subclasses(pl.LightningModule): MODEL_REGISTRY(cls, show_deprecation=False) - for cls in _get_all_subclasses(pl.LightningDataModule): + for cls in get_all_subclasses(pl.LightningDataModule): DATAMODULE_REGISTRY(cls, show_deprecation=False) - for cls in _get_all_subclasses(pl.loggers.Logger): + for cls in get_all_subclasses(pl.loggers.Logger): LOGGER_REGISTRY(cls, show_deprecation=False) else: # manually register torch's subclasses and our subclasses diff --git a/src/pytorch_lightning/utilities/data.py b/src/pytorch_lightning/utilities/data.py index 41c8ddc59b582..59068b1a1523b 100644 --- a/src/pytorch_lightning/utilities/data.py +++ b/src/pytorch_lightning/utilities/data.py @@ -22,6 +22,7 @@ import torch from lightning_utilities.core.apply_func import is_dataclass_instance +from lightning_utilities.core.inheritance import get_all_subclasses from lightning_utilities.core.rank_zero import WarningCache from torch import Tensor from torch.utils.data import ( @@ -40,7 +41,6 @@ from pytorch_lightning.utilities.auto_restart import CaptureIterableDataset, CaptureMapDataset, FastForwardSampler from pytorch_lightning.utilities.enums import _FaultTolerantMode, LightningEnum from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.meta import _get_all_subclasses from pytorch_lightning.utilities.rank_zero import rank_zero_warn from pytorch_lightning.utilities.seed import pl_worker_init_function @@ -549,7 +549,7 @@ def _replace_dunder_methods(base_cls: Type, store_explicit_arg: Optional[str] = It patches the ``__init__``, ``__setattr__`` and 
``__delattr__`` methods. """ - classes = _get_all_subclasses(base_cls) | {base_cls} + classes = get_all_subclasses(base_cls) | {base_cls} for cls in classes: # Check that __init__ belongs to the class # https://stackoverflow.com/a/5253424 diff --git a/src/pytorch_lightning/utilities/meta.py b/src/pytorch_lightning/utilities/meta.py index c028e8b446951..b1359df852606 100644 --- a/src/pytorch_lightning/utilities/meta.py +++ b/src/pytorch_lightning/utilities/meta.py @@ -43,21 +43,9 @@ def get_all_subclasses(cls: Type) -> Set[Type]: "`pytorch_lightning.utilities.meta.get_all_subclasses` is deprecated in v1.8 and will be removed in v1.9." " Please copy its implementation if you have a use for it." ) - return _get_all_subclasses(cls) + from lightning_utilities.core.inheritance import get_all_subclasses as new_get_all_subclasses - -# https://stackoverflow.com/a/63851681/9201239 -def _get_all_subclasses(cls: Type) -> Set[Type]: - subclass_list = [] - - def recurse(cl: Type) -> None: - for subclass in cl.__subclasses__(): - subclass_list.append(subclass) - recurse(subclass) - - recurse(cls) - - return set(subclass_list) + return new_get_all_subclasses(cls) def recursively_setattr(root_module: Any, prefix: str, materialized_module: Module) -> None: From 46519e2fc770f3393c7f5e72e6f36a01a15efb27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 7 Sep 2022 15:55:45 +0200 Subject: [PATCH 076/193] Add path filters to the TPU job (#14543) --- .circleci/config.yml | 44 ++++++++++--------------------- .github/workflows/ci-circleci.yml | 28 ++++++++++++++++++++ 2 files changed, 42 insertions(+), 30 deletions(-) create mode 100644 .github/workflows/ci-circleci.yml diff --git a/.circleci/config.yml b/.circleci/config.yml index 5c314d4e6e5c1..57d318bc240b6 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -4,33 +4,16 @@ orbs: gcp-gke: circleci/gcp-gke@1.4.0 go: circleci/go@1.7.1 codecov: codecov/codecov@1.1.0 - -trigger: - tags: - include: - - '*' - branches: - include: - - "master" - - "release/*" - - "refs/tags/*" - -pr: - - "master" - - "release/*" - -# Workflow Steps: -# 1. Checkout -# 2. Install GO -# 3. Checkout ml-testing-accelerators -# 4. GCP GKE install -# 5. Update Kubeconfig with credintials -# 6. Install jsonnet -# 7. Update jsonnet -# 8. Deploy the job on the kubernetes cluster -# 9. Statistics -# 10. Upload coverage results -# 11. Upload coverage to Codecov +parameters: + GHA_Actor: + type: string + default: "" + GHA_Action: + type: string + default: "" + GHA_Event: + type: string + default: "" references: @@ -92,7 +75,7 @@ references: # still the job hasn't finished, give up and return the starting # non-zero status code. printf "Waiting for job to finish: " && \ - while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "."; fi; sleep $CHECK_SPEEP; done && \ + while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "."; fi; sleep $CHECK_SLEEP; done && \ echo "Done waiting. 
Job status code: $status_code" && \ kubectl logs -f $pod_name --container=train > /tmp/full_output.txt if grep -q '' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '//'; else mv /tmp/full_output.txt xx00; fi && \ @@ -116,7 +99,7 @@ jobs: - XLA_VER: 1.12 - PYTHON_VER: 3.7 - MAX_CHECKS: 1000 - - CHECK_SPEEP: 5 + - CHECK_SLEEP: 5 steps: - checkout - go/install @@ -153,10 +136,11 @@ jobs: destination: html workflows: - version: 2 #build-docs: # FixMe + # when: << pipeline.parameters.GHA_Action >> # jobs: # - build-Docs test-on-tpus: + when: << pipeline.parameters.GHA_Action >> jobs: - TPU-tests diff --git a/.github/workflows/ci-circleci.yml b/.github/workflows/ci-circleci.yml new file mode 100644 index 0000000000000..697fa444f3dc9 --- /dev/null +++ b/.github/workflows/ci-circleci.yml @@ -0,0 +1,28 @@ +on: + push: + branches: [master, "release/*"] + paths: + - ".github/workflows/ci-circleci.yml" + - ".circleci/config.yml" + - "requirements/pytorch/**" + - "src/pytorch_lightning/**" + - "tests/tests_pytorch/**" + - "setup.cfg" # includes pytest config + pull_request: + branches: [master, "release/*"] + paths: + - ".github/workflows/ci-circleci.yml" + - ".circleci/config.yml" + - "requirements/pytorch/**" + - "src/pytorch_lightning/**" + - "tests/tests_pytorch/**" + - "setup.cfg" # includes pytest config + +jobs: + # https://github.com/marketplace/actions/trigger-circleci-pipeline + trigger-circleci: + runs-on: ubuntu-latest + steps: + - uses: CircleCI-Public/trigger-circleci-pipeline-action@v1.0.5 + env: + CCI_TOKEN: ${{ secrets.CCI_TOKEN }} From bcad90141a9ce76a6ec792c525658758669291cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 7 Sep 2022 16:09:59 +0200 Subject: [PATCH 077/193] Remove old test artifacts (#14574) --- tests/tests_pytorch/__init__.py | 7 ------- tests/tests_pytorch/helpers/pipelines.py | 4 ++-- tests/tests_pytorch/helpers/utils.py | 9 +++++++-- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/tests/tests_pytorch/__init__.py b/tests/tests_pytorch/__init__.py index 9039a6e4b16e9..2731ae3124419 100644 --- a/tests/tests_pytorch/__init__.py +++ b/tests/tests_pytorch/__init__.py @@ -11,11 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import logging import os -import numpy as np - _TEST_ROOT = os.path.dirname(__file__) _PROJECT_ROOT = os.path.dirname(_TEST_ROOT) _TEMP_PATH = os.path.join(_PROJECT_ROOT, "test_temp") @@ -27,10 +24,6 @@ splitter = ":" if os.environ.get("PYTHONPATH", "") else "" os.environ["PYTHONPATH"] = f'{_PROJECT_ROOT}{splitter}{os.environ.get("PYTHONPATH", "")}' -# generate a list of random seeds for each test -RANDOM_PORTS = list(np.random.randint(12000, 19000, 1000)) if not os.path.isdir(_TEMP_PATH): os.mkdir(_TEMP_PATH) - -logging.basicConfig(level=logging.ERROR) diff --git a/tests/tests_pytorch/helpers/pipelines.py b/tests/tests_pytorch/helpers/pipelines.py index 3de3d75563fb4..3cbc49f11ce26 100644 --- a/tests/tests_pytorch/helpers/pipelines.py +++ b/tests/tests_pytorch/helpers/pipelines.py @@ -31,7 +31,7 @@ def run_model_test_without_loggers( # correct result and ok accuracy assert trainer.state.finished, f"Training failed with {trainer.state}" - model2 = load_model_from_checkpoint(trainer.logger, trainer.checkpoint_callback.best_model_path, type(model)) + model2 = load_model_from_checkpoint(trainer.checkpoint_callback.best_model_path, type(model)) # test new model accuracy test_loaders = model2.test_dataloader() if not data else data.test_dataloader() @@ -68,7 +68,7 @@ def run_model_test( assert change_ratio > 0.03, f"the model is changed of {change_ratio}" # test model loading - _ = load_model_from_checkpoint(logger, trainer.checkpoint_callback.best_model_path, type(model)) + _ = load_model_from_checkpoint(trainer.checkpoint_callback.best_model_path, type(model)) # test new model accuracy test_loaders = model.test_dataloader() if not data else data.test_dataloader() diff --git a/tests/tests_pytorch/helpers/utils.py b/tests/tests_pytorch/helpers/utils.py index 54503bf75b0f7..18393e2193c91 100644 --- a/tests/tests_pytorch/helpers/utils.py +++ b/tests/tests_pytorch/helpers/utils.py @@ -17,13 +17,14 @@ from contextlib import contextmanager from typing import Optional, Type +import numpy as np import pytest from pytorch_lightning import seed_everything from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.loggers import TensorBoardLogger -from tests_pytorch import _TEMP_PATH, RANDOM_PORTS +from tests_pytorch import _TEMP_PATH def get_default_logger(save_dir, version=None): @@ -52,7 +53,7 @@ def get_data_path(expt_logger, path_dir=None): return path_expt -def load_model_from_checkpoint(logger, root_weights_dir, module_class=BoringModel): +def load_model_from_checkpoint(root_weights_dir, module_class=BoringModel): trained_model = module_class.load_from_checkpoint(root_weights_dir) assert trained_model is not None, "loading model failed" return trained_model @@ -68,6 +69,10 @@ def reset_seed(seed=0): seed_everything(seed) +# generate a list of random seeds for each test +RANDOM_PORTS = list(np.random.randint(12000, 19000, 1000)) + + def set_random_main_port(): reset_seed() port = RANDOM_PORTS.pop() From 31dc6c67142ad1c1e4fa192cad320a99114dc4c1 Mon Sep 17 00:00:00 2001 From: pierocor <32060225+pierocor@users.noreply.github.com> Date: Wed, 7 Sep 2022 16:25:20 +0200 Subject: [PATCH 078/193] Add compatibility when `torch.distributed` is not available (#14454) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Piero Coronica Co-authored-by: Carlos Mocholí --- .actions/setup_tools.py | 10 +++++++--- src/pytorch_lightning/CHANGELOG.md | 3 +++ 
.../plugins/precision/fsdp_native_native_amp.py | 2 +- src/pytorch_lightning/strategies/ddp.py | 5 ++++- src/pytorch_lightning/strategies/ddp_spawn.py | 6 +++++- .../strategies/fully_sharded_native.py | 15 +++++++++------ src/pytorch_lightning/utilities/types.py | 6 +++++- 7 files changed, 34 insertions(+), 13 deletions(-) diff --git a/.actions/setup_tools.py b/.actions/setup_tools.py index c434f42569ee8..b71d7da707613 100644 --- a/.actions/setup_tools.py +++ b/.actions/setup_tools.py @@ -178,14 +178,18 @@ def replace_block_with_imports(lines: List[str], import_path: str, kword: str = """ body, tracking, skip_offset = [], False, 0 for i, ln in enumerate(lines): - offset = len(ln) - len(ln.lstrip()) - # support for defining a class with this condition - if ln.startswith("if TYPE_CHECKING") or ln.startswith("if typing.TYPE_CHECKING"): + conditional_class_definitions = ("if TYPE_CHECKING", "if typing.TYPE_CHECKING", "if torch.", "if _TORCH_") + if ( + any(ln.startswith(pattern) for pattern in conditional_class_definitions) + # avoid bug in CI for the <1.7 meta code + and "pytorch_lightning.utilities.meta" not in import_path + ): # dedent the next line lines[i + 1] = lines[i + 1].lstrip() continue + offset = len(ln) - len(ln.lstrip()) # in case of mating the class args are multi-line if tracking and ln and offset <= skip_offset and not any(ln.lstrip().startswith(c) for c in ")]"): tracking = False diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 4a5c72cc5c51e..7fec6260b174d 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -179,6 +179,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed incorrect values after transferring data to an MPS device ([#14368](https://github.com/Lightning-AI/lightning/pull/14368)) +- Fixed compatibility when `torch.distributed` is not available ([#14454](https://github.com/Lightning-AI/lightning/pull/14454)) + + ## [1.7.3] - 2022-08-25 ### Fixed diff --git a/src/pytorch_lightning/plugins/precision/fsdp_native_native_amp.py b/src/pytorch_lightning/plugins/precision/fsdp_native_native_amp.py index f91144124ae35..a5b26d7dec052 100644 --- a/src/pytorch_lightning/plugins/precision/fsdp_native_native_amp.py +++ b/src/pytorch_lightning/plugins/precision/fsdp_native_native_amp.py @@ -20,7 +20,7 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12 -if _TORCH_GREATER_EQUAL_1_12: +if _TORCH_GREATER_EQUAL_1_12 and torch.distributed.is_available(): from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler else: diff --git a/src/pytorch_lightning/strategies/ddp.py b/src/pytorch_lightning/strategies/ddp.py index 57ab3a151b011..2cfdbab357c70 100644 --- a/src/pytorch_lightning/strategies/ddp.py +++ b/src/pytorch_lightning/strategies/ddp.py @@ -24,7 +24,6 @@ import torch import torch.distributed from torch import Tensor -from torch.distributed.constants import default_pg_timeout from torch.nn import Module from torch.nn.parallel.distributed import DistributedDataParallel from torch.optim.optimizer import Optimizer @@ -68,6 +67,10 @@ if _TORCH_GREATER_EQUAL_1_10 and torch.distributed.is_available(): from torch.distributed.algorithms.model_averaging.averagers import ModelAverager +if torch.distributed.is_available(): + from torch.distributed.constants import 
default_pg_timeout +else: + default_pg_timeout = timedelta(seconds=1800) log = logging.getLogger(__name__) diff --git a/src/pytorch_lightning/strategies/ddp_spawn.py b/src/pytorch_lightning/strategies/ddp_spawn.py index de34320f54093..2eea8f11f1975 100644 --- a/src/pytorch_lightning/strategies/ddp_spawn.py +++ b/src/pytorch_lightning/strategies/ddp_spawn.py @@ -19,7 +19,6 @@ import torch import torch.distributed from torch import Tensor -from torch.distributed.constants import default_pg_timeout from torch.nn import Module from torch.nn.parallel.distributed import DistributedDataParallel from typing_extensions import Literal @@ -52,6 +51,11 @@ from pytorch_lightning.utilities.rank_zero import rank_zero_info, rank_zero_only from pytorch_lightning.utilities.types import PredictStep, STEP_OUTPUT, TestStep, ValidationStep +if torch.distributed.is_available(): + from torch.distributed.constants import default_pg_timeout +else: + default_pg_timeout = timedelta(seconds=1800) + log = logging.getLogger(__name__) _DDP_FORK_ALIASES = ( diff --git a/src/pytorch_lightning/strategies/fully_sharded_native.py b/src/pytorch_lightning/strategies/fully_sharded_native.py index 456ddd36f93e7..b32f460ee1f3e 100644 --- a/src/pytorch_lightning/strategies/fully_sharded_native.py +++ b/src/pytorch_lightning/strategies/fully_sharded_native.py @@ -17,7 +17,6 @@ import torch from torch import Tensor -from torch.distributed.distributed_c10d import _get_default_group, ProcessGroup import pytorch_lightning as pl from pytorch_lightning.overrides.base import _LightningModuleWrapperBase @@ -32,7 +31,6 @@ from pytorch_lightning.utilities import rank_zero_only from pytorch_lightning.utilities.distributed import ( _get_process_group_backend_from_env, - distributed_available, get_default_process_group_backend_for_device, ) from pytorch_lightning.utilities.distributed import group as _group @@ -43,9 +41,11 @@ from pytorch_lightning.utilities.optimizer import optimizers_to_device from pytorch_lightning.utilities.rank_zero import rank_zero_info from pytorch_lightning.utilities.seed import reset_seed -from pytorch_lightning.utilities.types import STEP_OUTPUT +from pytorch_lightning.utilities.types import ProcessGroup, STEP_OUTPUT -if _TORCH_GREATER_EQUAL_1_12: +_distributed_available = torch.distributed.is_available() +_fsdp_available = _TORCH_GREATER_EQUAL_1_12 and _distributed_available +if _fsdp_available: from torch.distributed.fsdp.fully_sharded_data_parallel import ( BackwardPrefetch, CPUOffload, @@ -59,6 +59,9 @@ BackwardPrefetch = None # type: ignore[misc,assignment] CPUOffload = None # type: ignore[misc,assignment] +if _distributed_available: + from torch.distributed.distributed_c10d import _get_default_group + log = logging.getLogger(__name__) @@ -275,7 +278,7 @@ def model_sharded_context(self) -> Generator: yield def barrier(self, name: Optional[str] = None) -> None: - if not distributed_available(): + if not _distributed_available: return if torch.distributed.get_backend() == "nccl": torch.distributed.barrier(device_ids=self._determine_device_ids()) @@ -358,7 +361,7 @@ def get_registered_strategies(cls) -> List[str]: @classmethod def register_strategies(cls, strategy_registry: Dict) -> None: - if _TORCH_GREATER_EQUAL_1_12: + if _fsdp_available: strategy_registry.register( "fsdp_native", cls, diff --git a/src/pytorch_lightning/utilities/types.py b/src/pytorch_lightning/utilities/types.py index c90657b34e868..39b50748099dd 100644 --- a/src/pytorch_lightning/utilities/types.py +++ b/src/pytorch_lightning/utilities/types.py 
@@ -24,12 +24,16 @@ import torch from torch import Tensor -from torch._C._distributed_c10d import ProcessGroup from torch.optim import Optimizer from torch.utils.data import DataLoader from torchmetrics import Metric from typing_extensions import Protocol, runtime_checkable +if torch.distributed.is_available(): + from torch._C._distributed_c10d import ProcessGroup +else: + ProcessGroup = ... # type: ignore[assignment,misc] + _NUMBER = Union[int, float] _METRIC = Union[Metric, Tensor, _NUMBER] _METRIC_COLLECTION = Union[_METRIC, Mapping[str, _METRIC]] From d2459df2ff79184c596b2f5c865ffb17c3541307 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 7 Sep 2022 17:25:23 +0200 Subject: [PATCH 079/193] Standalone Lite: Remaining Utilities (#14492) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jirka Borovec Co-authored-by: Carlos Mocholí Co-authored-by: Laverne Henderson Co-authored-by: Felonious-Spellfire --- .github/workflows/ci-lite-test-full.yml | 7 + docs/source-pytorch/extensions/logging.rst | 2 +- examples/pl_basics/autoencoder.py | 2 +- .../computer_vision_fine_tuning.py | 2 +- pyproject.toml | 2 + requirements/lite/base.txt | 1 + src/lightning_lite/__init__.py | 17 + src/lightning_lite/utilities/__init__.py | 40 ++ src/lightning_lite/utilities/cloud_io.py | 2 +- src/lightning_lite/utilities/data.py | 411 ++++++++++++++ src/lightning_lite/utilities/device_parser.py | 316 +++++++++++ src/lightning_lite/utilities/distributed.py | 264 +++++++++ src/lightning_lite/utilities/enums.py | 95 ++++ src/lightning_lite/utilities/exceptions.py | 17 + src/lightning_lite/utilities/imports.py | 61 +++ src/lightning_lite/utilities/optimizer.py | 34 ++ src/lightning_lite/utilities/rank_zero.py | 60 +++ .../utilities/registry.py | 0 src/lightning_lite/utilities/seed.py | 127 +++++ src/lightning_lite/utilities/types.py | 63 ++- src/lightning_lite/utilities/warnings.py | 24 + src/lightning_lite/utilities/xla_device.py | 2 +- src/pytorch_lightning/__init__.py | 2 +- src/pytorch_lightning/accelerators/cpu.py | 4 +- src/pytorch_lightning/accelerators/cuda.py | 4 +- src/pytorch_lightning/accelerators/mps.py | 4 +- .../accelerators/registry.py | 2 +- src/pytorch_lightning/accelerators/tpu.py | 2 +- src/pytorch_lightning/callbacks/base.py | 2 +- .../callbacks/early_stopping.py | 3 +- .../callbacks/fault_tolerance.py | 2 +- .../callbacks/model_checkpoint.py | 3 +- .../callbacks/stochastic_weight_avg.py | 3 +- src/pytorch_lightning/cli.py | 2 +- src/pytorch_lightning/core/datamodule.py | 3 +- src/pytorch_lightning/core/lightning.py | 2 +- src/pytorch_lightning/core/module.py | 2 +- src/pytorch_lightning/core/optimizer.py | 3 +- src/pytorch_lightning/core/saving.py | 2 +- src/pytorch_lightning/lite/lite.py | 16 +- src/pytorch_lightning/loops/utilities.py | 4 +- src/pytorch_lightning/overrides/base.py | 2 +- src/pytorch_lightning/overrides/fairscale.py | 2 +- .../plugins/io/checkpoint_plugin.py | 2 +- .../plugins/io/hpu_plugin.py | 2 +- .../plugins/io/torch_plugin.py | 2 +- .../plugins/io/xla_plugin.py | 2 +- .../plugins/precision/apex_amp.py | 2 +- .../plugins/precision/deepspeed.py | 2 +- .../precision/fsdp_native_native_amp.py | 2 +- .../plugins/precision/hpu.py | 2 +- .../plugins/precision/ipu.py | 2 +- .../plugins/precision/precision_plugin.py | 2 +- src/pytorch_lightning/profiler/advanced.py | 2 +- src/pytorch_lightning/profiler/profiler.py | 2 +- src/pytorch_lightning/profiler/pytorch.py | 2 +- 
src/pytorch_lightning/profiler/simple.py | 2 +- src/pytorch_lightning/profiler/xla.py | 2 +- src/pytorch_lightning/profilers/pytorch.py | 2 +- src/pytorch_lightning/strategies/bagua.py | 6 +- src/pytorch_lightning/strategies/ddp.py | 24 +- src/pytorch_lightning/strategies/ddp_spawn.py | 22 +- src/pytorch_lightning/strategies/deepspeed.py | 19 +- src/pytorch_lightning/strategies/dp.py | 2 +- .../strategies/fully_sharded.py | 4 +- .../strategies/fully_sharded_native.py | 19 +- src/pytorch_lightning/strategies/hivemind.py | 6 +- src/pytorch_lightning/strategies/horovod.py | 6 +- .../strategies/hpu_parallel.py | 2 +- src/pytorch_lightning/strategies/ipu.py | 2 +- .../strategies/launchers/multiprocessing.py | 4 +- src/pytorch_lightning/strategies/parallel.py | 12 +- src/pytorch_lightning/strategies/sharded.py | 4 +- .../strategies/sharded_spawn.py | 2 +- .../strategies/single_device.py | 2 +- .../strategies/single_hpu.py | 3 +- src/pytorch_lightning/strategies/strategy.py | 6 +- .../strategies/strategy_registry.py | 2 +- src/pytorch_lightning/strategies/tpu_spawn.py | 9 +- src/pytorch_lightning/strategies/utils.py | 2 +- src/pytorch_lightning/trainer/__init__.py | 2 +- .../trainer/configuration_validator.py | 2 +- .../connectors/accelerator_connector.py | 11 +- .../connectors/checkpoint_connector.py | 2 +- .../trainer/connectors/data_connector.py | 10 +- .../connectors/logger_connector/result.py | 2 +- src/pytorch_lightning/trainer/data_loading.py | 2 +- src/pytorch_lightning/trainer/optimizers.py | 2 +- src/pytorch_lightning/trainer/supporters.py | 2 +- src/pytorch_lightning/trainer/trainer.py | 9 +- .../tuner/auto_gpu_select.py | 2 +- src/pytorch_lightning/utilities/__init__.py | 10 +- .../utilities/auto_restart.py | 2 +- src/pytorch_lightning/utilities/cloud_io.py | 2 +- src/pytorch_lightning/utilities/data.py | 229 +------- src/pytorch_lightning/utilities/deepspeed.py | 2 +- .../utilities/device_parser.py | 353 ++---------- .../utilities/distributed.py | 338 +++--------- src/pytorch_lightning/utilities/enums.py | 80 +-- src/pytorch_lightning/utilities/exceptions.py | 4 +- src/pytorch_lightning/utilities/fetching.py | 2 +- src/pytorch_lightning/utilities/meta.py | 2 +- src/pytorch_lightning/utilities/optimizer.py | 31 +- src/pytorch_lightning/utilities/rank_zero.py | 39 +- src/pytorch_lightning/utilities/seed.py | 155 ++---- src/pytorch_lightning/utilities/types.py | 64 +-- .../utilities/upgrade_checkpoint.py | 2 +- src/pytorch_lightning/utilities/warnings.py | 12 +- src/pytorch_lightning/utilities/xla_device.py | 2 +- tests/tests_lite/conftest.py | 87 +++ tests/tests_lite/helpers/runif.py | 27 +- tests/tests_lite/helpers/utils.py | 31 ++ tests/tests_lite/utilities/test_data.py | 509 ++++++++++++++++++ .../utilities/test_device_parser.py | 31 ++ .../tests_lite/utilities/test_distributed.py | 63 +++ tests/tests_lite/utilities/test_enums.py | 9 + tests/tests_lite/utilities/test_imports.py | 81 +++ .../utilities/test_optimizer.py | 2 +- .../utilities/test_rank_zero.py | 19 +- tests/tests_lite/utilities/test_seed.py | 84 +++ tests/tests_lite/utilities/test_warnings.py | 78 +++ .../utilities/test_xla_device_utils.py | 6 +- .../tests_pytorch/accelerators/test_common.py | 2 +- .../core/test_metric_result_integration.py | 2 +- tests/tests_pytorch/core/test_results.py | 2 +- .../deprecated_api/test_remove_1-10.py | 127 ++++- .../deprecated_api/test_remove_1-8.py | 4 +- .../deprecated_api/test_remove_2-0.py | 4 +- tests/tests_pytorch/lite/test_lite.py | 4 +- tests/tests_pytorch/models/test_gpu.py | 6 
+- tests/tests_pytorch/models/test_tpu.py | 2 +- .../overrides/test_distributed.py | 2 +- .../tests_pytorch/plugins/test_amp_plugins.py | 12 +- .../plugins/test_checkpoint_io_plugin.py | 2 +- .../plugins/test_cluster_integration.py | 4 +- .../strategies/test_bagua_strategy.py | 2 +- tests/tests_pytorch/strategies/test_common.py | 2 +- tests/tests_pytorch/strategies/test_ddp.py | 8 +- ..._ddp_fully_sharded_with_full_state_dict.py | 4 +- .../strategies/test_deepspeed_strategy.py | 2 +- tests/tests_pytorch/test_cli.py | 4 +- .../connectors/test_accelerator_connector.py | 52 +- .../trainer/connectors/test_data_connector.py | 2 +- .../trainer/flags/test_env_vars.py | 4 +- .../trainer/flags/test_min_max_epochs.py | 2 +- .../properties/test_auto_gpu_select.py | 4 +- .../test_estimated_stepping_batches.py | 2 +- .../trainer/test_config_validator.py | 4 +- .../tests_pytorch/trainer/test_dataloaders.py | 3 +- .../tests_pytorch/trainer/test_supporters.py | 4 +- tests/tests_pytorch/trainer/test_trainer.py | 8 +- .../tests_pytorch/trainer/test_trainer_cli.py | 3 +- .../utilities/test_all_gather_grad.py | 5 +- .../utilities/test_auto_restart.py | 3 +- tests/tests_pytorch/utilities/test_data.py | 361 +------------ .../utilities/test_device_parser.py | 2 +- .../utilities/test_distributed.py | 57 +- tests/tests_pytorch/utilities/test_enums.py | 10 +- tests/tests_pytorch/utilities/test_seed.py | 80 +-- tests/tests_pytorch/utilities/test_types.py | 2 +- .../tests_pytorch/utilities/test_warnings.py | 31 -- 161 files changed, 3121 insertions(+), 1923 deletions(-) create mode 100644 src/lightning_lite/utilities/data.py create mode 100644 src/lightning_lite/utilities/device_parser.py create mode 100644 src/lightning_lite/utilities/distributed.py create mode 100644 src/lightning_lite/utilities/enums.py create mode 100644 src/lightning_lite/utilities/exceptions.py create mode 100644 src/lightning_lite/utilities/imports.py create mode 100644 src/lightning_lite/utilities/optimizer.py create mode 100644 src/lightning_lite/utilities/rank_zero.py rename src/{pytorch_lightning => lightning_lite}/utilities/registry.py (100%) create mode 100644 src/lightning_lite/utilities/seed.py create mode 100644 src/lightning_lite/utilities/warnings.py create mode 100644 tests/tests_lite/helpers/utils.py create mode 100644 tests/tests_lite/utilities/test_data.py create mode 100644 tests/tests_lite/utilities/test_device_parser.py create mode 100644 tests/tests_lite/utilities/test_distributed.py create mode 100644 tests/tests_lite/utilities/test_enums.py create mode 100644 tests/tests_lite/utilities/test_imports.py rename tests/{tests_pytorch => tests_lite}/utilities/test_optimizer.py (93%) rename tests/{tests_pytorch => tests_lite}/utilities/test_rank_zero.py (65%) create mode 100644 tests/tests_lite/utilities/test_seed.py create mode 100644 tests/tests_lite/utilities/test_warnings.py diff --git a/.github/workflows/ci-lite-test-full.yml b/.github/workflows/ci-lite-test-full.yml index 896086b697d66..2830952e2407b 100644 --- a/.github/workflows/ci-lite-test-full.yml +++ b/.github/workflows/ci-lite-test-full.yml @@ -88,6 +88,13 @@ jobs: pip list shell: bash + - name: Testing Warnings + # the stacklevel can only be set on >=3.7 + if: matrix.python-version != '3.7' + working-directory: tests/tests_lite + # needs to run outside of `pytest` + run: python utilities/test_warnings.py + - name: Testing Lite working-directory: tests/tests_lite # NOTE: do not include coverage report here, see: https://github.com/nedbat/coveragepy/issues/1003 diff 
--git a/docs/source-pytorch/extensions/logging.rst b/docs/source-pytorch/extensions/logging.rst index f7fb3cfd6fda0..109445779f991 100644 --- a/docs/source-pytorch/extensions/logging.rst +++ b/docs/source-pytorch/extensions/logging.rst @@ -231,7 +231,7 @@ Use the :func:`~pytorch_lightning.loggers.logger.rank_zero_experiment` and :func .. testcode:: from pytorch_lightning.loggers.logger import Logger, rank_zero_experiment - from pytorch_lightning.utilities.distributed import rank_zero_only + from pytorch_lightning.utilities import rank_zero_only class MyLogger(Logger): diff --git a/examples/pl_basics/autoencoder.py b/examples/pl_basics/autoencoder.py index 0fd9ddae18020..ae8c7b6611920 100644 --- a/examples/pl_basics/autoencoder.py +++ b/examples/pl_basics/autoencoder.py @@ -26,8 +26,8 @@ from pytorch_lightning import callbacks, cli_lightning_logo, LightningDataModule, LightningModule, Trainer from pytorch_lightning.cli import LightningCLI from pytorch_lightning.demos.mnist_datamodule import MNIST +from pytorch_lightning.utilities import rank_zero_only from pytorch_lightning.utilities.imports import _TORCHVISION_AVAILABLE -from pytorch_lightning.utilities.rank_zero import rank_zero_only if _TORCHVISION_AVAILABLE: import torchvision diff --git a/examples/pl_domain_templates/computer_vision_fine_tuning.py b/examples/pl_domain_templates/computer_vision_fine_tuning.py index b33d63eb6589b..7a81df983996c 100644 --- a/examples/pl_domain_templates/computer_vision_fine_tuning.py +++ b/examples/pl_domain_templates/computer_vision_fine_tuning.py @@ -57,7 +57,7 @@ from pytorch_lightning import cli_lightning_logo, LightningDataModule, LightningModule from pytorch_lightning.callbacks.finetuning import BaseFinetuning from pytorch_lightning.cli import LightningCLI -from pytorch_lightning.utilities.rank_zero import rank_zero_info +from pytorch_lightning.utilities import rank_zero_info log = logging.getLogger(__name__) DATA_URL = "https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip" diff --git a/pyproject.toml b/pyproject.toml index 5b62baf9ce6f3..166447dd655f6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,7 @@ exclude = '(_notebooks/.*)' [tool.mypy] files = [ "src/pytorch_lightning", + "src/lightning_lite", # TODO: Check typing in app source # "src/lightning_app", ] @@ -57,5 +58,6 @@ module = [ "pytorch_lightning.trainer.trainer", "pytorch_lightning.tuner.batch_size_scaling", "pytorch_lightning.utilities.data", + "lightning_lite.utilities.data", ] ignore_errors = "True" diff --git a/requirements/lite/base.txt b/requirements/lite/base.txt index 4dbc213afe8b6..eb130bc2546d5 100644 --- a/requirements/lite/base.txt +++ b/requirements/lite/base.txt @@ -1,6 +1,7 @@ # NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment +numpy>=1.17.2, <1.23.1 torch>=1.9.*, <1.13.0 fsspec[http]>=2021.05.0, !=2021.06.0, <2022.6.0 packaging>=17.0, <=21.3 diff --git a/src/lightning_lite/__init__.py b/src/lightning_lite/__init__.py index 5e0d0ad5cb20a..6c16dcbf6c393 100644 --- a/src/lightning_lite/__init__.py +++ b/src/lightning_lite/__init__.py @@ -1,4 +1,21 @@ """Root package info.""" +import logging from lightning_lite.__about__ import * # noqa: F401, F403 from lightning_lite.__version__ import version as __version__ # noqa: F401 + +_root_logger = logging.getLogger() +_logger = logging.getLogger(__name__) 
+_logger.setLevel(logging.INFO) + +if not _root_logger.hasHandlers(): + _logger.addHandler(logging.StreamHandler()) + _logger.propagate = False + +from lightning_lite.lite import LightningLite # noqa: E402 +from lightning_lite.utilities.seed import seed_everything # noqa: E402 + +__all__ = ["LightningLite", "seed_everything"] + +# for compatibility with namespace packages +__import__("pkg_resources").declare_namespace(__name__) diff --git a/src/lightning_lite/utilities/__init__.py b/src/lightning_lite/utilities/__init__.py index e69de29bb2d1d..edeab0cd5d360 100644 --- a/src/lightning_lite/utilities/__init__.py +++ b/src/lightning_lite/utilities/__init__.py @@ -0,0 +1,40 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""General utilities.""" + +from lightning_lite.utilities.apply_func import move_data_to_device # noqa: F401 +from lightning_lite.utilities.distributed import AllGatherGrad # noqa: F401 +from lightning_lite.utilities.enums import _AcceleratorType, _StrategyType, AMPType, LightningEnum # noqa: F401 + +# TODO(lite): Avoid importing protected attributes in `__init__.py` files +from lightning_lite.utilities.imports import ( # noqa: F401 + _HIVEMIND_AVAILABLE, + _HOROVOD_AVAILABLE, + _HPU_AVAILABLE, + _IPU_AVAILABLE, + _IS_INTERACTIVE, + _IS_WINDOWS, + _POPTORCH_AVAILABLE, + _TORCH_GREATER_EQUAL_1_10, + _TORCH_GREATER_EQUAL_1_11, + _TORCH_GREATER_EQUAL_1_12, + _TPU_AVAILABLE, + _XLA_AVAILABLE, +) +from lightning_lite.utilities.rank_zero import ( # noqa: F401 + rank_zero_deprecation, + rank_zero_info, + rank_zero_only, + rank_zero_warn, +) diff --git a/src/lightning_lite/utilities/cloud_io.py b/src/lightning_lite/utilities/cloud_io.py index 99629bcda8980..bdc20f7e3f87e 100644 --- a/src/lightning_lite/utilities/cloud_io.py +++ b/src/lightning_lite/utilities/cloud_io.py @@ -22,7 +22,7 @@ from fsspec.core import url_to_fs from fsspec.implementations.local import AbstractFileSystem -from pytorch_lightning.utilities.types import _MAP_LOCATION_TYPE, _PATH +from lightning_lite.utilities.types import _MAP_LOCATION_TYPE, _PATH def load( diff --git a/src/lightning_lite/utilities/data.py b/src/lightning_lite/utilities/data.py new file mode 100644 index 0000000000000..cdaf806a0c48d --- /dev/null +++ b/src/lightning_lite/utilities/data.py @@ -0,0 +1,411 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import functools +import inspect +import os +from collections import OrderedDict +from contextlib import contextmanager +from functools import partial +from typing import Any, Callable, Dict, Generator, Iterable, Optional, Tuple, Type, Union + +from lightning_utilities.core.inheritance import get_all_subclasses +from torch.utils.data import BatchSampler, DataLoader, IterableDataset, Sampler + +from lightning_lite.utilities.enums import LightningEnum +from lightning_lite.utilities.exceptions import MisconfigurationException +from lightning_lite.utilities.rank_zero import rank_zero_warn +from lightning_lite.utilities.seed import pl_worker_init_function + + +class _WrapAttrTag(LightningEnum): + SET = "set" + DEL = "del" + + def __call__(self, *args): + if self == self.SET: + fn = setattr + else: + fn = delattr + return fn(*args) + + +def has_iterable_dataset(dataloader: DataLoader) -> bool: + return hasattr(dataloader, "dataset") and isinstance(dataloader.dataset, IterableDataset) + + +def has_len(dataloader: Union[DataLoader, Iterable]) -> bool: + """Checks if a given Dataloader has ``__len__`` method implemented i.e. if it is a finite dataloader or + infinite dataloader.""" + try: + # try getting the length + if len(dataloader) == 0: + rank_zero_warn( + f"`{dataloader.__class__.__name__}` returned 0 length. Please make sure this was your intention." + ) + has_len = True + except (TypeError, NotImplementedError): + has_len = False + + if has_len and has_iterable_dataset(dataloader): + rank_zero_warn( + "Your `IterableDataset` has `__len__` defined." + " In combination with multi-process data loading (when num_workers > 1)," + " `__len__` could be inaccurate if each worker is not configured independently" + " to avoid having duplicate data." + ) + return has_len + + +def _update_dataloader(dataloader: DataLoader, sampler: Union[Sampler, Iterable]) -> DataLoader: + dl_args, dl_kwargs = _get_dataloader_init_args_and_kwargs(dataloader, sampler) + dataloader = _reinstantiate_wrapped_cls(dataloader, *dl_args, **dl_kwargs) + return dataloader + + +def _get_dataloader_init_args_and_kwargs( + dataloader: DataLoader, + sampler: Optional[Sampler], + disallow_batch_sampler: bool = False, +) -> Tuple[Tuple[Any], Dict[str, Any]]: + if not isinstance(dataloader, DataLoader): + raise ValueError(f"The dataloader {dataloader} needs to subclass `torch.utils.data.DataLoader`") + + was_wrapped = hasattr(dataloader, "__pl_saved_args") + if was_wrapped: + dl_args = dataloader.__pl_saved_args + dl_kwargs = dataloader.__pl_saved_kwargs + arg_names = dataloader.__pl_saved_arg_names + original_dataset = dataloader.__dataset # we have this saved from _wrap_init + else: + # get the dataloader instance attributes + attrs = {k: v for k, v in vars(dataloader).items() if not k.startswith("_")} + # We cannot be 100% sure the class sets dataset argument. 
Let's set it to None to be safe + # and hope we can get it from the instance attributes + original_dataset = None + # not part of `vars` + attrs["multiprocessing_context"] = dataloader.multiprocessing_context + arg_names = () + + # get the dataloader instance `__init__` parameters + params = dict(inspect.signature(dataloader.__init__).parameters) + has_variadic_kwargs = any(p.kind is p.VAR_KEYWORD for p in params.values()) + if has_variadic_kwargs: + # if the signature takes **kwargs, assume they will be passed down with `super().__init__(**kwargs)` + + if was_wrapped: + # if the dataloader was wrapped in a hook, only take arguments with default values + # and assume user passes their kwargs correctly + params.update( + {k: v for k, v in inspect.signature(DataLoader.__init__).parameters.items() if v.default is not v.empty} + ) + else: + params.update(inspect.signature(DataLoader.__init__).parameters) + params.pop("self", None) + + if not was_wrapped: + # keep only the params whose default is different to the current attr value + non_defaults = {name for name, p in params.items() if name in attrs and p.default != attrs[name]} + + # add `dataset` as it might have been replaced with `*args` + non_defaults.add("dataset") + # kwargs to re-construct the dataloader + dl_kwargs = {k: v for k, v in attrs.items() if k in non_defaults} + dl_args = () + + dataset = dl_kwargs.get("dataset", original_dataset) + if isinstance(dataset, IterableDataset): + dl_kwargs["batch_sampler"] = None + dl_kwargs["sampler"] = None + else: + dl_kwargs.update(_dataloader_init_kwargs_resolve_sampler(dataloader, sampler, disallow_batch_sampler)) + + required_args = { + p.name + for p in params.values() + if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD) + and p.default is p.empty + and p.name not in dl_kwargs + and p.name not in arg_names + } + # the dataloader has required args which we could not extract from the existing attributes + if required_args: + required_args = sorted(required_args) + dataloader_cls_name = dataloader.__class__.__name__ + missing_args_message = ", ".join(f"`self.{arg_name}`" for arg_name in required_args) + raise MisconfigurationException( + f"Trying to inject custom `Sampler` into the `{dataloader_cls_name}` instance. " + "This would fail as some of the `__init__` arguments are not available as instance attributes. " + f"The missing attributes are {required_args}. If you instantiate your `{dataloader_cls_name}` inside a " + "`*_dataloader` hook of your module, we will do this for you." + f" Otherwise, define {missing_args_message} inside your `__init__`." + ) + + if not has_variadic_kwargs: + # the dataloader signature does not allow keyword arguments that need to be passed + missing_kwargs = (set(dl_kwargs) | set(arg_names)) - params.keys() + if missing_kwargs: + missing_kwargs = sorted(missing_kwargs) + dataloader_cls_name = dataloader.__class__.__name__ + raise TypeError( + f"Trying to inject parameters into the `{dataloader_cls_name}` instance. " + "This would fail as it doesn't expose all its attributes in the `__init__` signature. " + f"The missing arguments are {missing_kwargs}. 
HINT: If you wrote the `{dataloader_cls_name}` class, " + "add the `__init__` arguments or allow passing `**kwargs`" + ) + + return dl_args, dl_kwargs + + +def _dataloader_init_kwargs_resolve_sampler( + dataloader: DataLoader, + sampler: Optional[Sampler], + disallow_batch_sampler: bool = False, +) -> Dict[str, Any]: + """This function is used to handle the sampler, batch_sampler arguments associated within a DataLoader for its + re-instantiation. + + If there are multiple devices in IPU mode, it is necessary to disallow BatchSampler that isn't instantiated + automatically, since `poptorch.DataLoader` will try to increase the batch_size + """ + batch_sampler = getattr(dataloader, "batch_sampler") + + if batch_sampler is not None: + if disallow_batch_sampler: + # Check that we don't have a PyTorch default batch sampler that was instantiated in DataLoader __init__ + if not ( + type(batch_sampler) is BatchSampler + and batch_sampler.sampler == sampler + and dataloader.batch_size == batch_sampler.batch_size + ): + raise MisconfigurationException( + "It is not possible to have a batch sampler in your dataloader, " + "when running on multiple IPU devices." + ) + elif type(batch_sampler) is not BatchSampler: + batch_sampler_cls = type(batch_sampler) + if hasattr(batch_sampler, "__pl_saved_args"): + args = batch_sampler.__pl_saved_args + kwargs = batch_sampler.__pl_saved_kwargs + default_kwargs = batch_sampler.__pl_saved_default_kwargs + arg_names = batch_sampler.__pl_saved_arg_names + + success, args, kwargs = _replace_value_in_saved_args( + "sampler", sampler, args, kwargs, default_kwargs, arg_names + ) + if not success: + raise TypeError( + "Trying to inject a modified sampler into the batch sampler; however, it seems the class " + f"`{batch_sampler_cls.__qualname__}` does not have an argument called `sampler.` To mitigate " + "this, expose an argument `sampler` in the `__init__` method of your custom class." + ) + + batch_sampler = _reinstantiate_wrapped_cls(batch_sampler, *args, **kwargs) + else: + try: + batch_sampler = batch_sampler_cls( + sampler, + batch_size=batch_sampler.batch_size, + drop_last=batch_sampler.drop_last, + ) + except TypeError as e: + import re + + match = re.match(r".*__init__\(\) (got multiple values)|(missing \d required)", str(e)) + if not match: + # an unexpected `TypeError`, continue failure + raise + + # There could either be too few or too many arguments. Customizing the message based on this doesn't + # make much sense since our MisconfigurationException is going to be raised from the original one. + raise TypeError( + "We tried to re-instantiate your custom batch sampler and failed. " + "To mitigate this, either follow the API of `BatchSampler` or instantiate " + "your custom batch sampler inside `*_dataloader` hooks of your module." 
+ ) from e + + return { + "sampler": None, + "shuffle": False, + "batch_sampler": batch_sampler, + "batch_size": 1, + "drop_last": False, + } + + return {"sampler": sampler, "shuffle": False, "batch_sampler": None} + + +def _auto_add_worker_init_fn(dataloader: DataLoader, rank: int) -> None: + if int(os.environ.get("PL_SEED_WORKERS", 0)) and dataloader.worker_init_fn is None: + dataloader.worker_init_fn = partial(pl_worker_init_function, rank=rank) + + +def _reinstantiate_wrapped_cls(orig_object: Any, *args: Any, explicit_cls: Optional[Type] = None, **kwargs: Any) -> Any: + constructor = type(orig_object) if explicit_cls is None else explicit_cls + + try: + result = constructor(*args, **kwargs) + except TypeError as e: + # improve exception message due to an incorrect implementation of the `DataLoader` where multiple subclass + # `__init__` arguments map to one `DataLoader.__init__` argument + import re + + match = re.match(r".*__init__\(\) got multiple values .* '(\w+)'", str(e)) + if not match: + # an unexpected `TypeError`, continue failure + raise + argument = match.groups()[0] + message = ( + f"The {constructor.__name__} implementation has an error where more than one `__init__` argument" + f" can be passed to its parent's `{argument}=...` `__init__` argument. This is likely caused by allowing" + f" passing both a custom argument that will map to the `{argument}` argument as well as `**kwargs`." + f" `kwargs` should be filtered to make sure they don't contain the `{argument}` key." + " This argument was automatically passed to your object by PyTorch Lightning." + ) + raise MisconfigurationException(message) from e + + attrs_record = getattr(orig_object, "__pl_attrs_record", list()) + for args, fn in attrs_record: + fn(result, *args) + + return result + + +def _wrap_init_method(init: Callable, store_explicit_arg: Optional[str] = None) -> Callable: + """Wraps the ``__init__`` method of classes (currently :class:`~torch.utils.data.DataLoader` and + :class:`~torch.utils.data.BatchSampler`) in order to enable re-instantiation of custom subclasses.""" + + @functools.wraps(init) + def wrapper(obj: Any, *args: Any, **kwargs: Any) -> None: + # We need to inspect `init`, as inspecting `obj.__init__` + # can lead to inspecting the wrong function with multiple inheritance + old_inside_init = getattr(obj, "__pl_inside_init", False) + object.__setattr__(obj, "__pl_inside_init", True) + params = inspect.signature(init).parameters + + parameters_defaults = OrderedDict( + (param.name, param.default) + for param in params.values() + if param.name != "self" and param.kind not in (param.VAR_POSITIONAL, param.VAR_KEYWORD) + ) + + param_names = tuple(parameters_defaults)[: len(args)] + + default_kwargs = { + name: value + for name, value in parameters_defaults.items() + if name not in kwargs and name not in param_names and value != inspect.Parameter.empty + } + + if not hasattr(obj, "__pl_saved_args"): + object.__setattr__(obj, "__pl_saved_args", args) + object.__setattr__(obj, "__pl_saved_kwargs", kwargs) + object.__setattr__(obj, "__pl_saved_arg_names", param_names) + object.__setattr__(obj, "__pl_saved_default_kwargs", default_kwargs) + + # We want to use the latest possible value for explicit argument (i.e. ideally what gets passed to base class) + # so that we can be sure, that it will not get changed anymore. 
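For illustration (not part of the patch), a minimal sketch of how `_update_dataloader` defined above rebuilds a `DataLoader` around a new sampler by re-deriving the constructor arguments from the existing instance; the module path `lightning_lite.utilities.data` is an assumption.

```python
# Illustrative sketch only: swapping the sampler of an existing DataLoader.
import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset

from lightning_lite.utilities.data import _update_dataloader  # assumed module path

dataset = TensorDataset(torch.arange(10, dtype=torch.float32))
loader = DataLoader(dataset, batch_size=2, shuffle=True)

# Replace the RandomSampler created by `shuffle=True` with a sequential one;
# `batch_size` and the other non-default arguments are carried over.
new_loader = _update_dataloader(loader, SequentialSampler(dataset))
assert isinstance(new_loader.sampler, SequentialSampler)
assert new_loader.batch_size == loader.batch_size == 2
```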
+ # That is why we are setting this in every `__init__` + if store_explicit_arg is not None: + if store_explicit_arg in param_names: + object.__setattr__(obj, f"__{store_explicit_arg}", args[param_names.index(store_explicit_arg)]) + elif store_explicit_arg in kwargs: + object.__setattr__(obj, f"__{store_explicit_arg}", kwargs[store_explicit_arg]) + + init(obj, *args, **kwargs) + object.__setattr__(obj, "__pl_inside_init", old_inside_init) + + return wrapper + + +def _wrap_attr_method(method: Callable, tag: _WrapAttrTag) -> Callable: + """Wraps the ``__setattr__`` or ``__delattr__`` method of classes (currently :class:`~torch.utils.data.DataLoader` and + :class:`~torch.utils.data.BatchSampler`) in order to enable re-instantiation of custom subclasses.""" + + @functools.wraps(method) + def wrapper(obj: Any, *args: Any): + # First, let's find out if we're the first in inheritance chain calling the patched method. + name, *_ = args + prev_call_name, prev_call_method = getattr(obj, "__pl_current_call", (None, "method")) + first_call = not (prev_call_name == name and prev_call_method == tag) + + # Then mark the current called method + object.__setattr__(obj, "__pl_current_call", (name, tag)) + + # call original method + method(obj, *args) + if first_call and not getattr(obj, "__pl_inside_init", True): + # and save the value it was called with to the internal list, + # if we're outside of __init__ and the original call did not fail and we're the first call + attrs_record = getattr(obj, "__pl_attrs_record", list()) + attrs_record.append((args, tag)) + object.__setattr__(obj, "__pl_attrs_record", attrs_record) + object.__setattr__(obj, "__pl_current_call", (prev_call_name, prev_call_method)) + + return wrapper + + +@contextmanager +def _replace_dunder_methods(base_cls: Type, store_explicit_arg: Optional[str] = None) -> Generator[None, None, None]: + """This context manager is used to add support for re-instantiation of custom (subclasses) of `base_cls`. + + It patches the ``__init__``, ``__setattr__`` and ``__delattr__`` methods. + """ + classes = get_all_subclasses(base_cls) | {base_cls} + for cls in classes: + # Check that __init__ belongs to the class + # https://stackoverflow.com/a/5253424 + if "__init__" in cls.__dict__: + cls.__old__init__ = cls.__init__ + cls.__init__ = _wrap_init_method(cls.__init__, store_explicit_arg) + + # we want at least one setattr/delattr in the chain to be patched and it can happen, that none of the subclasses + # implement `__setattr__`/`__delattr__`. 
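For illustration (not part of the patch), a sketch of the effect of the wrapped `__init__`: inside the `_replace_dunder_methods` context manager (whose definition continues just below), construction arguments are recorded on the instance so the loader can be re-instantiated later; the module path `lightning_lite.utilities.data` is an assumption.

```python
# Illustrative sketch only: argument capture performed by the patched __init__.
import torch
from torch.utils.data import DataLoader, TensorDataset

from lightning_lite.utilities.data import _replace_dunder_methods  # assumed module path

dataset = TensorDataset(torch.arange(4, dtype=torch.float32))

with _replace_dunder_methods(DataLoader, "dataset"):
    loader = DataLoader(dataset, batch_size=2)

# The wrapper stored everything needed for re-instantiation on the instance.
assert loader.__pl_saved_args == (dataset,)
assert loader.__pl_saved_kwargs == {"batch_size": 2}
assert loader.__pl_saved_arg_names == ("dataset",)
```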
Therefore, we are always patching the `base_cls` + for patch_fn_name, tag in (("__setattr__", _WrapAttrTag.SET), ("__delattr__", _WrapAttrTag.DEL)): + if patch_fn_name in cls.__dict__ or cls is base_cls: + saved_name = f"__old{patch_fn_name}" + setattr(cls, saved_name, getattr(cls, patch_fn_name)) + setattr(cls, patch_fn_name, _wrap_attr_method(getattr(cls, patch_fn_name), tag)) + yield + for cls in classes: + for patched_name in ("__setattr__", "__delattr__", "__init__"): + # Check that __old__{init,setattr,delattr} belongs to the class + # https://stackoverflow.com/a/5253424 + if f"__old{patched_name}" in cls.__dict__: + setattr(cls, patched_name, getattr(cls, f"__old{patched_name}")) + delattr(cls, f"__old{patched_name}") + + +def _replace_value_in_saved_args( + replace_key: str, + replace_value: Any, + args: Tuple[Any, ...], + kwargs: Dict[str, Any], + default_kwargs: Dict[str, Any], + arg_names: Tuple[str, ...], +) -> Tuple[bool, Tuple[Any, ...], Dict[str, Any]]: + """Tries to replace an argument value in a saved list of args and kwargs. + + Returns a tuple indicating success of the operation and modified saved args and kwargs + """ + + if replace_key in arg_names: + replace_index = arg_names.index(replace_key) + args = args[:replace_index] + (replace_value,) + args[replace_index + 1 :] + return True, args, kwargs + elif replace_key in kwargs or replace_key in default_kwargs: + kwargs[replace_key] = replace_value + return True, args, kwargs + + return False, args, kwargs diff --git a/src/lightning_lite/utilities/device_parser.py b/src/lightning_lite/utilities/device_parser.py new file mode 100644 index 0000000000000..78bf8a9a8c93f --- /dev/null +++ b/src/lightning_lite/utilities/device_parser.py @@ -0,0 +1,316 @@ +import multiprocessing +import os +from typing import Any, List, MutableSequence, Optional, Tuple, Union + +import torch + +# TODO(lite): Fix the imports +# from lightning_lite.plugins.environments import TorchElasticEnvironment +# from lightning_lite.strategies.launchers.multiprocessing import _is_forking_disabled +from lightning_lite.utilities.exceptions import MisconfigurationException +from lightning_lite.utilities.types import _DEVICE + + +def determine_root_gpu_device(gpus: List[_DEVICE]) -> Optional[_DEVICE]: + """ + Args: + gpus: Non-empty list of ints representing which GPUs to use + + Returns: + Designated root GPU device id + + Raises: + TypeError: + If ``gpus`` is not a list + AssertionError: + If GPU list is empty + """ + if gpus is None: + return None + + if not isinstance(gpus, list): + raise TypeError("GPUs should be a list") + + assert len(gpus) > 0, "GPUs should be a non-empty list" + + # set root gpu + root_gpu = gpus[0] + + return root_gpu + + +def parse_gpu_ids( + gpus: Optional[Union[int, str, List[int]]], + include_cuda: bool = False, + include_mps: bool = False, +) -> Optional[List[int]]: + """ + Parses the GPU IDs given in the format as accepted by the + :class:`~pytorch_lightning.trainer.Trainer`. + + Args: + gpus: An int -1 or string '-1' indicate that all available GPUs should be used. + A list of unique ints or a string containing a list of comma separated unique integers + indicates specific GPUs to use. + An int of 0 means that no GPUs should be used. + Any int N > 0 indicates that GPUs [0..N) should be used. + include_cuda: A boolean value indicating whether to include CUDA devices for GPU parsing. + include_mps: A boolean value indicating whether to include MPS devices for GPU parsing. 
+ + Returns: + A list of GPUs to be used or ``None`` if no GPUs were requested + + Raises: + MisconfigurationException: + If no GPUs are available but the value of gpus variable indicates request for GPUs + + .. note:: + ``include_cuda`` and ``include_mps`` default to ``False`` so that you only + have to specify which device type to use and all other devices are not disabled. + """ + # Check that gpus param is None, Int, String or Sequence of Ints + _check_data_type(gpus) + + # Handle the case when no GPUs are requested + if gpus is None or (isinstance(gpus, int) and gpus == 0) or str(gpus).strip() in ("0", "[]"): + return None + + # We know the user requested GPUs therefore if some of the + # requested GPUs are not available an exception is thrown. + gpus = _normalize_parse_gpu_string_input(gpus) + gpus = _normalize_parse_gpu_input_to_list(gpus, include_cuda=include_cuda, include_mps=include_mps) + if not gpus: + raise MisconfigurationException("GPUs requested but none are available.") + + if ( + True # TorchElasticEnvironment.detect() # TODO(lite): Revert this once environments have moved + and len(gpus) != 1 + and len(_get_all_available_gpus(include_cuda=include_cuda, include_mps=include_mps)) == 1 + ): + # Omit sanity check on torchelastic because by default it shows one visible GPU per process + return gpus + + # Check that GPUs are unique. Duplicate GPUs are not supported by the backend. + _check_unique(gpus) + + return _sanitize_gpu_ids(gpus, include_cuda=include_cuda, include_mps=include_mps) + + +def parse_tpu_cores(tpu_cores: Optional[Union[int, str, List[int]]]) -> Optional[Union[int, List[int]]]: + """ + Parses the tpu_cores given in the format as accepted by the + :class:`~pytorch_lightning.trainer.Trainer`. + + Args: + tpu_cores: An int of 1 or string '1' indicates that 1 core with multi-processing should be used + An int 8 or string '8' indicates that all 8 cores with multi-processing should be used + A list of ints or a strings containing a list of comma separated integers + indicates the specific TPU core to use. + + Returns: + A list of tpu_cores to be used or ``None`` if no TPU cores were requested + + Raises: + MisconfigurationException: + If TPU cores aren't 1, 8 or [<1-8>] + """ + _check_data_type(tpu_cores) + + if isinstance(tpu_cores, str): + tpu_cores = _parse_tpu_cores_str(tpu_cores.strip()) + + if not _tpu_cores_valid(tpu_cores): + raise MisconfigurationException("`tpu_cores` can only be 1, 8 or [<1-8>]") + + return tpu_cores + + +def parse_cpu_cores(cpu_cores: Union[int, str, List[int]]) -> int: + """Parses the cpu_cores given in the format as accepted by the ``devices`` argument in the + :class:`~pytorch_lightning.trainer.Trainer`. + + Args: + cpu_cores: An int > 0. 
+ + Returns: + An int representing the number of processes + + Raises: + MisconfigurationException: + If cpu_cores is not an int > 0 + """ + if isinstance(cpu_cores, str) and cpu_cores.strip().isdigit(): + cpu_cores = int(cpu_cores) + + if not isinstance(cpu_cores, int) or cpu_cores <= 0: + raise MisconfigurationException("`devices` selected with `CPUAccelerator` should be an int > 0.") + + return cpu_cores + + +def _normalize_parse_gpu_string_input(s: Union[int, str, List[int]]) -> Union[int, List[int]]: + if not isinstance(s, str): + return s + if s == "-1": + return -1 + if "," in s: + return [int(x.strip()) for x in s.split(",") if len(x) > 0] + return int(s.strip()) + + +def _sanitize_gpu_ids(gpus: List[int], include_cuda: bool = False, include_mps: bool = False) -> List[int]: + """Checks that each of the GPUs in the list is actually available. Raises a MisconfigurationException if any of + the GPUs is not available. + + Args: + gpus: List of ints corresponding to GPU indices + + Returns: + Unmodified gpus variable + + Raises: + MisconfigurationException: + If machine has fewer available GPUs than requested. + """ + if sum((include_cuda, include_mps)) == 0: + raise ValueError("At least one gpu type should be specified!") + all_available_gpus = _get_all_available_gpus(include_cuda=include_cuda, include_mps=include_mps) + for gpu in gpus: + if gpu not in all_available_gpus: + raise MisconfigurationException( + f"You requested gpu: {gpus}\n But your machine only has: {all_available_gpus}" + ) + return gpus + + +def _normalize_parse_gpu_input_to_list( + gpus: Union[int, List[int], Tuple[int, ...]], include_cuda: bool, include_mps: bool +) -> Optional[List[int]]: + assert gpus is not None + if isinstance(gpus, (MutableSequence, tuple)): + return list(gpus) + + # must be an int + if not gpus: # gpus==0 + return None + if gpus == -1: + return _get_all_available_gpus(include_cuda=include_cuda, include_mps=include_mps) + + return list(range(gpus)) + + +def _get_all_available_gpus(include_cuda: bool = False, include_mps: bool = False) -> List[int]: + """ + Returns: + A list of all available GPUs + """ + cuda_gpus = _get_all_available_cuda_gpus() if include_cuda else [] + mps_gpus = _get_all_available_mps_gpus() if include_mps else [] + return cuda_gpus + mps_gpus + + +def _get_all_available_mps_gpus() -> List[int]: + """ + Returns: + A list of all available MPS GPUs + """ + # lazy import to avoid circular dependencies + # from lightning_lite.accelerators.mps import _MPS_AVAILABLE + _MPS_AVAILABLE = False # TODO(lite): revert this once MPS utils have moved + return [0] if _MPS_AVAILABLE else [] + + +def _get_all_available_cuda_gpus() -> List[int]: + """ + Returns: + A list of all available CUDA GPUs + """ + return list(range(num_cuda_devices())) + + +def _check_unique(device_ids: List[int]) -> None: + """Checks that the device_ids are unique. + + Args: + device_ids: List of ints corresponding to GPUs indices + + Raises: + MisconfigurationException: + If ``device_ids`` of GPUs aren't unique + """ + if len(device_ids) != len(set(device_ids)): + raise MisconfigurationException("Device ID's (GPU) must be unique.") + + +def _check_data_type(device_ids: Any) -> None: + """Checks that the device_ids argument is one of the following: None, int, string, or sequence of integers. 
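For illustration (not part of the patch), a sketch of the device-count formats accepted by the parsers above; the module path `lightning_lite.utilities.device_parser` is taken from the diff header.

```python
# Illustrative sketch only: accepted `devices` formats for CPU and TPU parsing.
from lightning_lite.utilities.device_parser import parse_cpu_cores, parse_tpu_cores

assert parse_cpu_cores(4) == 4            # plain int > 0
assert parse_cpu_cores("8") == 8          # numeric string is accepted too
assert parse_tpu_cores(None) is None      # no TPU cores requested
assert parse_tpu_cores(8) == 8            # all eight cores
assert parse_tpu_cores("1,") == [1]       # a single specific core index
```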
+ + Args: + device_ids: gpus/tpu_cores parameter as passed to the Trainer + + Raises: + MisconfigurationException: + If ``device_ids`` of GPU/TPUs aren't ``int``, ``str``, sequence of ``int`` or ``None`` + """ + msg = "Device IDs (GPU/TPU) must be an int, a string, a sequence of ints or None, but you passed" + + if device_ids is None: + return + elif isinstance(device_ids, (MutableSequence, tuple)): + for id_ in device_ids: + if type(id_) is not int: + raise MisconfigurationException(f"{msg} a sequence of {type(id_).__name__}.") + elif type(device_ids) not in (int, str): + raise MisconfigurationException(f"{msg} {type(device_ids).__name__}.") + + +def _tpu_cores_valid(tpu_cores: Any) -> bool: + # allow 1 or 8 cores + if tpu_cores in (1, 8, None): + return True + + # allow picking 1 of 8 indexes + if isinstance(tpu_cores, (list, tuple, set)): + has_1_tpu_idx = len(tpu_cores) == 1 + is_valid_tpu_idx = 1 <= list(tpu_cores)[0] <= 8 + + is_valid_tpu_core_choice = has_1_tpu_idx and is_valid_tpu_idx + return is_valid_tpu_core_choice + + return False + + +def _parse_tpu_cores_str(tpu_cores: str) -> Union[int, List[int]]: + if tpu_cores in ("1", "8"): + return int(tpu_cores) + return [int(x.strip()) for x in tpu_cores.split(",") if len(x) > 0] + + +def num_cuda_devices() -> int: + """Returns the number of GPUs available. + + Unlike :func:`torch.cuda.device_count`, this function does its best not to create a CUDA context for fork support, + if the platform allows it. + """ + if "fork" not in torch.multiprocessing.get_all_start_methods() or _is_forking_disabled(): + return torch.cuda.device_count() + with multiprocessing.get_context("fork").Pool(1) as pool: + return pool.apply(torch.cuda.device_count) + + +def is_cuda_available() -> bool: + """Returns a bool indicating if CUDA is currently available. + + Unlike :func:`torch.cuda.is_available`, this function does its best not to create a CUDA context for fork support, + if the platform allows it. 
+ """ + if "fork" not in torch.multiprocessing.get_all_start_methods() or _is_forking_disabled(): + return torch.cuda.is_available() + with multiprocessing.get_context("fork").Pool(1) as pool: + return pool.apply(torch.cuda.is_available) + + +# TODO(lite): move this back to launchers/multiprocessing.py once launchers have moved +def _is_forking_disabled() -> bool: + """Returns whether forking is disabled through the environment variable ``PL_DISABLE_FORK``.""" + return bool(int(os.environ.get("PL_DISABLE_FORK", "0"))) diff --git a/src/lightning_lite/utilities/distributed.py b/src/lightning_lite/utilities/distributed.py new file mode 100644 index 0000000000000..77123c53ff14a --- /dev/null +++ b/src/lightning_lite/utilities/distributed.py @@ -0,0 +1,264 @@ +import logging +import os +from typing import Any, List, Optional, Tuple, Union + +import torch +from torch import Tensor +from torch.nn import functional as F + +from lightning_lite.utilities.imports import _HPU_AVAILABLE, _TPU_AVAILABLE +from lightning_lite.utilities.rank_zero import rank_zero_deprecation +from lightning_lite.utilities.rank_zero import rank_zero_info as new_rank_zero_info + +if _TPU_AVAILABLE: + import torch_xla.core.xla_model as xm + + +if torch.distributed.is_available(): + from torch.distributed import group, ReduceOp +else: + + class ReduceOp: # type: ignore # (see https://github.com/python/mypy/issues/1153) + SUM = None + + class group: # type: ignore + WORLD = None + + +log = logging.getLogger(__name__) + + +def gather_all_tensors(result: Tensor, group: Optional[Any] = None) -> List[Tensor]: + """Function to gather all tensors from several DDP processes onto a list that is broadcasted to all processes. + + Works on tensors that have the same number of dimensions, but where each dimension may differ. In this case + tensors are padded, gathered and then trimmed to secure equal workload for all processes. + + Args: + result: The value to sync + group: The process group to gather results from. Defaults to all processes (world) + + Return: + gathered_result: List with size equal to the process group where + gathered_result[i] corresponds to result tensor from process i + """ + if group is None: + group = torch.distributed.group.WORLD + + # Convert tensors to contiguous format + result = result.contiguous() + + world_size = torch.distributed.get_world_size(group) + torch.distributed.barrier(group=group) + + # If the tensor is scalar, things are easy + if result.ndim == 0: + return _simple_gather_all_tensors(result, group, world_size) + + # 1. Gather sizes of all tensors + local_size = torch.tensor(result.shape, device=result.device) + local_sizes = [torch.zeros_like(local_size) for _ in range(world_size)] + torch.distributed.all_gather(local_sizes, local_size, group=group) + max_size = torch.stack(local_sizes).max(dim=0).values + all_sizes_equal = all(all(ls == max_size) for ls in local_sizes) + + # 2. If shapes are all the same, then do a simple gather: + if all_sizes_equal: + return _simple_gather_all_tensors(result, group, world_size) + + # 3. 
If not, we need to pad each local tensor to maximum size, gather and then truncate + pad_dims = [] + pad_by = (max_size - local_size).detach().cpu() + for val in reversed(pad_by): + pad_dims.append(0) + pad_dims.append(val.item()) + result_padded = F.pad(result, pad_dims) + gathered_result = [torch.zeros_like(result_padded) for _ in range(world_size)] + torch.distributed.all_gather(gathered_result, result_padded, group) + for idx, item_size in enumerate(local_sizes): + slice_param = [slice(dim_size) for dim_size in item_size] + gathered_result[idx] = gathered_result[idx][slice_param] + return gathered_result + + +def _simple_gather_all_tensors(result: Tensor, group: Any, world_size: int) -> List[Tensor]: + gathered_result = [torch.zeros_like(result) for _ in range(world_size)] + torch.distributed.all_gather(gathered_result, result, group) + return gathered_result + + +def distributed_available() -> bool: + return torch.distributed.is_available() and torch.distributed.is_initialized() or tpu_distributed() + + +def sync_ddp_if_available( + result: Tensor, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None +) -> Tensor: + """Function to reduce a tensor across worker processes during distributed training. + + Args: + result: The value to sync and reduce (typically tensor or number) + group: The process group to gather results from. Defaults to all processes (world) + reduce_op: The reduction operation. Defaults to sum. + Can also be a string of 'avg', 'mean' to calculate the mean during reduction. + + Return: + reduced value + """ + if distributed_available(): + return sync_ddp(result, group=group, reduce_op=reduce_op) + return result + + +def sync_ddp(result: Tensor, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None) -> Tensor: + """Function to reduce the tensors from several DDP processes to one main process. + + Args: + result: The value to sync and reduce (typically tensor or number) + group: The process group to gather results from. Defaults to all processes (world) + reduce_op: The reduction operation. Defaults to sum. + Can also be a string of 'avg', 'mean' to calculate the mean during reduction. + + Return: + reduced value + """ + divide_by_world_size = False + + if group is None: + group = torch.distributed.group.WORLD + + op: Optional[ReduceOp] + if isinstance(reduce_op, str): + if reduce_op.lower() in ("avg", "mean"): + op = ReduceOp.SUM + divide_by_world_size = True + else: + op = getattr(ReduceOp, reduce_op.upper()) + else: + op = reduce_op + + # WA for HPU. 
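For illustration (not part of the patch), a sketch of the fallback behaviour of `sync_ddp_if_available` defined above: in a single, non-distributed process it is a no-op, while under an initialized process group it all-reduces across ranks; the module path `lightning_lite.utilities.distributed` is taken from the diff header.

```python
# Illustrative sketch only: reduction helper outside of a process group.
import torch

from lightning_lite.utilities.distributed import distributed_available, sync_ddp_if_available

metric = torch.tensor([1.0, 2.0])
if not distributed_available():
    # No process group (and no TPU): the tensor is returned unchanged.
    assert torch.equal(sync_ddp_if_available(metric, reduce_op="mean"), metric)
```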
HPU doesn't support Long types, forcefully set it to float + if _HPU_AVAILABLE: + is_hpu_backend = os.environ.get("HCCL_DISTRIBUTED_BACKEND") == "1" + if is_hpu_backend: + if (result.type() == "torch.LongTensor") or (result.type() == "torch.hpu.LongTensor"): + new_rank_zero_info("Long tensor unsupported on HPU, casting to float") + result = result.float() + + # Sync all processes before reduction + torch.distributed.barrier(group=group) + torch.distributed.all_reduce(result, op=op, group=group, async_op=False) + + if divide_by_world_size: + result = result / torch.distributed.get_world_size(group) + + return result + + +class AllGatherGrad(torch.autograd.Function): + @staticmethod + def forward( # type: ignore[override] + ctx: Any, + tensor: Tensor, + group: Optional["torch.distributed.ProcessGroup"] = group.WORLD, + ) -> Tensor: + ctx.group = group + + gathered_tensor = [torch.zeros_like(tensor) for _ in range(torch.distributed.get_world_size())] + + torch.distributed.all_gather(gathered_tensor, tensor, group=group) + gathered_tensor = torch.stack(gathered_tensor, dim=0) + + return gathered_tensor + + @staticmethod + def backward(ctx: Any, *grad_output: Tensor) -> Tuple[Tensor, None]: + grad_output = torch.cat(grad_output) + + torch.distributed.all_reduce(grad_output, op=torch.distributed.ReduceOp.SUM, async_op=False, group=ctx.group) + + return grad_output[torch.distributed.get_rank()], None + + +def all_gather_ddp_if_available( + tensor: Tensor, group: Optional["torch.distributed.ProcessGroup"] = None, sync_grads: bool = False +) -> Tensor: + """Function to gather a tensor from several distributed processes. + + Args: + tensor: Tensor of shape (batch, ...) + group: The process group to gather results from. Defaults to all processes (world) + sync_grads: Flag that allows users to synchronize gradients for all_gather op + + Return: + A tensor of shape (world_size, batch, ...) + """ + group = group if group is not None else torch.distributed.group.WORLD + if distributed_available(): + if sync_grads: + return AllGatherGrad.apply(tensor, group) + with torch.no_grad(): + return AllGatherGrad.apply(tensor, group) + return tensor + + +def init_dist_connection( + # TODO(lite): Fix this type error + cluster_environment: "ClusterEnvironment", # type: ignore[name-defined] # noqa: F821 + torch_distributed_backend: str, + global_rank: Optional[int] = None, + world_size: Optional[int] = None, + **kwargs: Any, +) -> None: + """Utility function to initialize distributed connection by setting env variables and initializing the + distributed process group. + + Args: + cluster_environment: ``ClusterEnvironment`` instance + torch_distributed_backend: Backend to use (includes `nccl` and `gloo`) + global_rank: Rank of the current process + world_size: Number of processes in the group + kwargs: Kwargs for ``init_process_group`` + + Raises: + RuntimeError: + If ``torch.distributed`` is not available + """ + if not torch.distributed.is_available(): + raise RuntimeError("torch.distributed is not available. Cannot initialize distributed process group") + if torch.distributed.is_initialized(): + log.debug("torch.distributed is already initialized. 
Exiting early") + return + global_rank = global_rank if global_rank is not None else cluster_environment.global_rank() + world_size = world_size if world_size is not None else cluster_environment.world_size() + os.environ["MASTER_ADDR"] = cluster_environment.main_address + os.environ["MASTER_PORT"] = str(cluster_environment.main_port) + log.info(f"Initializing distributed: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") + torch.distributed.init_process_group(torch_distributed_backend, rank=global_rank, world_size=world_size, **kwargs) + + # On rank=0 let everyone know training is starting + new_rank_zero_info( + f"{'-' * 100}\n" + f"distributed_backend={torch_distributed_backend}\n" + f"All distributed processes registered. Starting with {world_size} processes\n" + f"{'-' * 100}\n" + ) + + +def tpu_distributed() -> bool: + return _TPU_AVAILABLE and xm.xrt_world_size() > 1 + + +def get_default_process_group_backend_for_device(device: torch.device) -> str: + return "nccl" if device.type == "cuda" else "gloo" + + +def _get_process_group_backend_from_env() -> Optional[str]: + torch_backend = os.getenv("PL_TORCH_DISTRIBUTED_BACKEND") + if torch_backend is not None: + rank_zero_deprecation( + "Environment variable `PL_TORCH_DISTRIBUTED_BACKEND`" + " was deprecated in v1.6 and will be removed in v1.8." + " Specify `process_group_backend` directly on the strategy constructor." + ) + return torch_backend diff --git a/src/lightning_lite/utilities/enums.py b/src/lightning_lite/utilities/enums.py new file mode 100644 index 0000000000000..567483b1e5e97 --- /dev/null +++ b/src/lightning_lite/utilities/enums.py @@ -0,0 +1,95 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Enumerated utilities.""" +from __future__ import annotations + +from typing import TYPE_CHECKING + +from lightning_utilities.core.enums import StrEnum + +if TYPE_CHECKING: + from enum import Enum + + # re-defined because `mypy` infers `StrEnum` as `Any` + class LightningEnum(StrEnum, Enum): + ... 
+ +else: + LightningEnum = StrEnum + + +class AMPType(LightningEnum): + """Type of Automatic Mixed Precission used for training.""" + + APEX = "apex" + NATIVE = "native" + + +class PrecisionType(LightningEnum): + """Type of precision used.""" + + HALF = "16" + FLOAT = "32" + FULL = "64" + BFLOAT = "bf16" + MIXED = "mixed" + + @staticmethod + def supported_type(precision: str | int) -> bool: + return any(x == precision for x in PrecisionType) + + @staticmethod + def supported_types() -> list[str]: + return [x.value for x in PrecisionType] + + +class _StrategyType(LightningEnum): + """Define type of training strategy.""" + + DP = "dp" + DDP = "ddp" + DDP_SPAWN = "ddp_spawn" + DDP_FORK = "ddp_fork" + TPU_SPAWN = "tpu_spawn" + DEEPSPEED = "deepspeed" + HOROVOD = "horovod" + DDP_SHARDED = "ddp_sharded" + DDP_SHARDED_SPAWN = "ddp_sharded_spawn" + DDP_FULLY_SHARDED = "ddp_fully_sharded" + BAGUA = "bagua" + HPU_PARALLEL = "hpu_parallel" + + @staticmethod + def interactive_compatible_types() -> list[_StrategyType]: + """Returns a list containing interactive compatible _StrategyTypes.""" + return [ + _StrategyType.DP, + _StrategyType.TPU_SPAWN, + _StrategyType.DDP_FORK, + ] + + def is_interactive_compatible(self) -> bool: + """Returns whether self is interactive compatible.""" + return self in _StrategyType.interactive_compatible_types() + + +class _AcceleratorType(LightningEnum): + """Define Accelerator type by its nature.""" + + CPU = "CPU" + CUDA = "CUDA" + IPU = "IPU" + TPU = "TPU" + HPU = "HPU" + MPS = "MPS" diff --git a/src/lightning_lite/utilities/exceptions.py b/src/lightning_lite/utilities/exceptions.py new file mode 100644 index 0000000000000..7f6c3dd9b3e2b --- /dev/null +++ b/src/lightning_lite/utilities/exceptions.py @@ -0,0 +1,17 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class MisconfigurationException(Exception): + """Exception used to inform users of misuse with Lightning.""" diff --git a/src/lightning_lite/utilities/imports.py b/src/lightning_lite/utilities/imports.py new file mode 100644 index 0000000000000..34e7b5ac5f82f --- /dev/null +++ b/src/lightning_lite/utilities/imports.py @@ -0,0 +1,61 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
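For illustration (not part of the patch), a sketch of how `PrecisionType` above behaves as a `StrEnum`: comparisons go through the stringified value, so both ints and strings are accepted.

```python
# Illustrative sketch only: value-based lookups on the precision enum.
from lightning_lite.utilities.enums import PrecisionType

assert PrecisionType.supported_type(16)        # matches PrecisionType.HALF ("16")
assert PrecisionType.supported_type("bf16")    # matches PrecisionType.BFLOAT
assert not PrecisionType.supported_type(8)
assert PrecisionType.supported_types() == ["16", "32", "64", "bf16", "mixed"]
```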
+"""General utilities.""" +import operator +import platform +import sys + +from lightning_utilities.core.imports import compare_version, module_available, package_available + +_IS_WINDOWS = platform.system() == "Windows" +_IS_INTERACTIVE = hasattr(sys, "ps1") # https://stackoverflow.com/a/64523765 +_PYTHON_GREATER_EQUAL_3_8_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 8) +_PYTHON_GREATER_EQUAL_3_10_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 10) +_TORCH_GREATER_EQUAL_1_9_1 = compare_version("torch", operator.ge, "1.9.1") +_TORCH_GREATER_EQUAL_1_10 = compare_version("torch", operator.ge, "1.10.0") +_TORCH_LESSER_EQUAL_1_10_2 = compare_version("torch", operator.le, "1.10.2") +_TORCH_GREATER_EQUAL_1_11 = compare_version("torch", operator.ge, "1.11.0") +_TORCH_GREATER_EQUAL_1_12 = compare_version("torch", operator.ge, "1.12.0") +_TORCH_GREATER_EQUAL_1_13 = compare_version("torch", operator.ge, "1.13.0", use_base_version=True) + +_APEX_AVAILABLE = module_available("apex.amp") +_HABANA_FRAMEWORK_AVAILABLE = package_available("habana_frameworks") +_HIVEMIND_AVAILABLE = package_available("hivemind") +_HOROVOD_AVAILABLE = module_available("horovod.torch") +_OMEGACONF_AVAILABLE = package_available("omegaconf") +_POPTORCH_AVAILABLE = package_available("poptorch") +_PSUTIL_AVAILABLE = package_available("psutil") +_XLA_AVAILABLE: bool = package_available("torch_xla") + +# TODO(lite): import this from the fairscale files once they move to lite package +_FAIRSCALE_AVAILABLE = not _IS_WINDOWS and module_available("fairscale.nn") + + +from lightning_lite.utilities.xla_device import XLADeviceUtils # noqa: E402 + +_TPU_AVAILABLE = XLADeviceUtils.tpu_device_exists() + +if _POPTORCH_AVAILABLE: + import poptorch + + _IPU_AVAILABLE = poptorch.ipuHardwareIsAvailable() +else: + _IPU_AVAILABLE = False + +if _HABANA_FRAMEWORK_AVAILABLE: + from habana_frameworks.torch.utils.library_loader import is_habana_avaialble + + _HPU_AVAILABLE = is_habana_avaialble() +else: + _HPU_AVAILABLE = False diff --git a/src/lightning_lite/utilities/optimizer.py b/src/lightning_lite/utilities/optimizer.py new file mode 100644 index 0000000000000..c10c426bfe438 --- /dev/null +++ b/src/lightning_lite/utilities/optimizer.py @@ -0,0 +1,34 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
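For illustration (not part of the patch), a sketch of how the optimizer-state helpers defined just below in this new module are used; the module path `lightning_lite.utilities.optimizer` is taken from the diff header.

```python
# Illustrative sketch only: moving collected optimizer state between devices.
import torch

from lightning_lite.utilities.optimizer import optimizer_to_device

model = torch.nn.Linear(2, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
model(torch.randn(1, 2)).sum().backward()
optimizer.step()  # creates per-parameter state tensors (exp_avg, exp_avg_sq, ...)

optimizer_to_device(optimizer, "cpu")  # a no-op here; "cuda:0" would move the state
assert all(
    t.device.type == "cpu"
    for state in optimizer.state.values()
    for t in state.values()
    if isinstance(t, torch.Tensor)
)
```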
+ +from typing import Iterable + +from lightning_utilities.core.apply_func import apply_to_collection +from torch import Tensor +from torch.optim import Optimizer + +from lightning_lite.utilities.apply_func import move_data_to_device +from lightning_lite.utilities.types import _DEVICE + + +def optimizers_to_device(optimizers: Iterable[Optimizer], device: _DEVICE) -> None: + """Moves optimizer states for a sequence of optimizers to the device.""" + for opt in optimizers: + optimizer_to_device(opt, device) + + +def optimizer_to_device(optimizer: Optimizer, device: _DEVICE) -> None: + """Moves the state of a single optimizer to the device.""" + for p, v in optimizer.state.items(): + optimizer.state[p] = apply_to_collection(v, Tensor, move_data_to_device, device) diff --git a/src/lightning_lite/utilities/rank_zero.py b/src/lightning_lite/utilities/rank_zero.py new file mode 100644 index 0000000000000..db364dfd8f922 --- /dev/null +++ b/src/lightning_lite/utilities/rank_zero.py @@ -0,0 +1,60 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities that can be used for calling functions on a particular rank.""" +import logging +import os +from typing import Optional + +import lightning_utilities.core.rank_zero as rank_zero_module + +# note: we want to keep these indirections so the `rank_zero_only.rank` is set on import +from lightning_utilities.core.rank_zero import ( # noqa: F401 + rank_zero_debug, + rank_zero_deprecation, + rank_zero_info, + rank_zero_only, + rank_zero_warn, +) + +import lightning_lite + +rank_zero_module.log = logging.getLogger(__name__) + + +def _get_rank( + strategy: Optional["lightning_lite.strategies.Strategy"] = None, # type: ignore[name-defined] +) -> Optional[int]: + if strategy is not None: + return strategy.global_rank + # SLURM_PROCID can be set even if SLURM is not managing the multiprocessing, + # therefore LOCAL_RANK needs to be checked first + rank_keys = ("RANK", "LOCAL_RANK", "SLURM_PROCID", "JSM_NAMESPACE_RANK") + for key in rank_keys: + rank = os.environ.get(key) + if rank is not None: + return int(rank) + # None to differentiate whether an environment variable was set at all + return None + + +# add the attribute to the function but don't overwrite in case Trainer has already set it +rank_zero_only.rank = getattr(rank_zero_only, "rank", _get_rank() or 0) + + +class LightningDeprecationWarning(DeprecationWarning): + """Deprecation warnings raised by Lightning.""" + + +rank_zero_module.rank_zero_deprecation_category = LightningDeprecationWarning diff --git a/src/pytorch_lightning/utilities/registry.py b/src/lightning_lite/utilities/registry.py similarity index 100% rename from src/pytorch_lightning/utilities/registry.py rename to src/lightning_lite/utilities/registry.py diff --git a/src/lightning_lite/utilities/seed.py b/src/lightning_lite/utilities/seed.py new file mode 100644 index 0000000000000..a55b5e3dd84dc --- /dev/null +++ b/src/lightning_lite/utilities/seed.py @@ -0,0 +1,127 @@ +import logging +import os 
+import random +from random import getstate as python_get_rng_state +from random import setstate as python_set_rng_state +from typing import Any, Dict, Optional + +import numpy as np +import torch +from lightning_utilities.core.rank_zero import rank_prefixed_message + +from lightning_lite.utilities.rank_zero import _get_rank, rank_zero_only, rank_zero_warn + +log = logging.getLogger(__name__) + +max_seed_value = np.iinfo(np.uint32).max +min_seed_value = np.iinfo(np.uint32).min + + +def seed_everything(seed: Optional[int] = None, workers: bool = False) -> int: + """Function that sets seed for pseudo-random number generators in: pytorch, numpy, python.random In addition, + sets the following environment variables: + + - `PL_GLOBAL_SEED`: will be passed to spawned subprocesses (e.g. ddp_spawn backend). + - `PL_SEED_WORKERS`: (optional) is set to 1 if ``workers=True``. + + Args: + seed: the integer value seed for global random state in Lightning. + If `None`, will read seed from `PL_GLOBAL_SEED` env variable + or select it randomly. + workers: if set to ``True``, will properly configure all dataloaders passed to the + Trainer with a ``worker_init_fn``. If the user already provides such a function + for their dataloaders, setting this argument will have no influence. See also: + :func:`~lightning_lite.utilities.seed.pl_worker_init_function`. + """ + if seed is None: + env_seed = os.environ.get("PL_GLOBAL_SEED") + if env_seed is None: + seed = _select_seed_randomly(min_seed_value, max_seed_value) + rank_zero_warn(f"No seed found, seed set to {seed}") + else: + try: + seed = int(env_seed) + except ValueError: + seed = _select_seed_randomly(min_seed_value, max_seed_value) + rank_zero_warn(f"Invalid seed found: {repr(env_seed)}, seed set to {seed}") + elif not isinstance(seed, int): + seed = int(seed) + + if not (min_seed_value <= seed <= max_seed_value): + rank_zero_warn(f"{seed} is not in bounds, numpy accepts from {min_seed_value} to {max_seed_value}") + seed = _select_seed_randomly(min_seed_value, max_seed_value) + + log.info(rank_prefixed_message(f"Global seed set to {seed}", _get_rank())) + os.environ["PL_GLOBAL_SEED"] = str(seed) + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + os.environ["PL_SEED_WORKERS"] = f"{int(workers)}" + + return seed + + +def _select_seed_randomly(min_seed_value: int = min_seed_value, max_seed_value: int = max_seed_value) -> int: + return random.randint(min_seed_value, max_seed_value) + + +def reset_seed() -> None: + """Reset the seed to the value that :func:`lightning_lite.utilities.seed.seed_everything` previously set. + + If :func:`lightning_lite.utilities.seed.seed_everything` is unused, this function will do nothing. + """ + seed = os.environ.get("PL_GLOBAL_SEED", None) + if seed is None: + return + workers = os.environ.get("PL_SEED_WORKERS", "0") + seed_everything(int(seed), workers=bool(int(workers))) + + +def pl_worker_init_function(worker_id: int, rank: Optional[int] = None) -> None: # pragma: no cover + """The worker_init_fn that Lightning automatically adds to your dataloader if you previously set the seed with + ``seed_everything(seed, workers=True)``. + + See also the PyTorch documentation on + `randomness in DataLoaders `_. 
+ """ + # implementation notes: https://github.com/pytorch/pytorch/issues/5059#issuecomment-817392562 + global_rank = rank if rank is not None else rank_zero_only.rank + process_seed = torch.initial_seed() + # back out the base seed so we can use all the bits + base_seed = process_seed - worker_id + log.debug( + f"Initializing random number generators of process {global_rank} worker {worker_id} with base seed {base_seed}" + ) + ss = np.random.SeedSequence([base_seed, worker_id, global_rank]) + # use 128 bits (4 x 32-bit words) + np.random.seed(ss.generate_state(4)) + # Spawn distinct SeedSequences for the PyTorch PRNG and the stdlib random module + torch_ss, stdlib_ss = ss.spawn(2) + torch.manual_seed(torch_ss.generate_state(1, dtype=np.uint64)[0]) + # use 128 bits expressed as an integer + stdlib_seed = (stdlib_ss.generate_state(2, dtype=np.uint64).astype(object) * [1 << 64, 1]).sum() + random.seed(stdlib_seed) + + +def _collect_rng_states() -> Dict[str, Any]: + """Collect the global random state of :mod:`torch`, :mod:`torch.cuda`, :mod:`numpy` and Python.""" + return { + "torch": torch.get_rng_state(), + "torch.cuda": torch.cuda.get_rng_state_all(), + "numpy": np.random.get_state(), + "python": python_get_rng_state(), + } + + +def _set_rng_states(rng_state_dict: Dict[str, Any]) -> None: + """Set the global random state of :mod:`torch`, :mod:`torch.cuda`, :mod:`numpy` and Python in the current + process.""" + torch.set_rng_state(rng_state_dict["torch"]) + # torch.cuda rng_state is only included since v1.8. + if "torch.cuda" in rng_state_dict: + torch.cuda.set_rng_state_all(rng_state_dict["torch.cuda"]) + np.random.set_state(rng_state_dict["numpy"]) + version, state, gauss = rng_state_dict["python"] + python_set_rng_state((version, tuple(state), gauss)) diff --git a/src/lightning_lite/utilities/types.py b/src/lightning_lite/utilities/types.py index 900154e69c2eb..950210925ab42 100644 --- a/src/lightning_lite/utilities/types.py +++ b/src/lightning_lite/utilities/types.py @@ -11,8 +11,69 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Union +from pathlib import Path +from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, Union import torch +from torch import Tensor +from torch.optim import Optimizer +from typing_extensions import Protocol, runtime_checkable +_PATH = Union[str, Path] _DEVICE = Union[torch.device, str, int] +_MAP_LOCATION_TYPE = Optional[Union[_DEVICE, Callable[[_DEVICE], _DEVICE], Dict[_DEVICE, _DEVICE]]] +_PARAMETERS = Iterator[torch.nn.Parameter] + + +_DictKey = TypeVar("_DictKey") + + +@runtime_checkable +class _Stateful(Protocol[_DictKey]): + """This class is used to detect if an object is stateful using `isinstance(obj, _Stateful)`.""" + + def state_dict(self) -> Dict[_DictKey, Any]: + ... + + def load_state_dict(self, state_dict: Dict[_DictKey, Any]) -> None: + ... + + +# Inferred from `torch.optim.lr_scheduler.pyi` +# Missing attributes were added to improve typing +@runtime_checkable +class _LRScheduler(_Stateful[str], Protocol): + optimizer: Optimizer + base_lrs: List[float] + + def __init__(self, optimizer: Optimizer, *args: Any, **kwargs: Any) -> None: + ... + + def step(self, epoch: Optional[int] = None) -> None: + ... 
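For illustration (not part of the patch), a sketch of how the runtime-checkable protocols above enable duck-typed `isinstance` checks instead of checks against concrete scheduler classes.

```python
# Illustrative sketch only: protocol-based checks on stock torch objects.
import torch

from lightning_lite.utilities.types import _LRScheduler, _Stateful

optimizer = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.1)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)

assert isinstance(optimizer, _Stateful)      # exposes state_dict/load_state_dict
assert isinstance(scheduler, _Stateful)
assert isinstance(scheduler, _LRScheduler)   # also has `optimizer`, `base_lrs`, `step`
```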
+ + +# Inferred from `torch.optim.lr_scheduler.pyi` +# Missing attributes were added to improve typing +@runtime_checkable +class ReduceLROnPlateau(_Stateful[str], Protocol): + in_cooldown: bool + optimizer: Optimizer + + def __init__( + self, + optimizer: Optimizer, + mode: str = ..., + factor: float = ..., + patience: int = ..., + verbose: bool = ..., + threshold: float = ..., + threshold_mode: str = ..., + cooldown: int = ..., + min_lr: float = ..., + eps: float = ..., + ) -> None: + ... + + def step(self, metrics: Union[float, int, Tensor], epoch: Optional[int] = None) -> None: + ... diff --git a/src/lightning_lite/utilities/warnings.py b/src/lightning_lite/utilities/warnings.py new file mode 100644 index 0000000000000..dfd298fd49fb9 --- /dev/null +++ b/src/lightning_lite/utilities/warnings.py @@ -0,0 +1,24 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Warning-related utilities.""" +import warnings + +from lightning_lite.utilities.rank_zero import LightningDeprecationWarning + +# enable our warnings +warnings.simplefilter("default", category=LightningDeprecationWarning) + + +class PossibleUserWarning(UserWarning): + """Warnings that could be false positives.""" diff --git a/src/lightning_lite/utilities/xla_device.py b/src/lightning_lite/utilities/xla_device.py index 2feef71c563f2..cc0bfb78823bc 100644 --- a/src/lightning_lite/utilities/xla_device.py +++ b/src/lightning_lite/utilities/xla_device.py @@ -18,7 +18,7 @@ from multiprocessing import Process, Queue from typing import Any, Callable, Union -from pytorch_lightning.utilities.imports import _XLA_AVAILABLE +from lightning_lite.utilities.imports import _XLA_AVAILABLE if _XLA_AVAILABLE: import torch_xla.core.xla_model as xm diff --git a/src/pytorch_lightning/__init__.py b/src/pytorch_lightning/__init__.py index 5a009713e063c..d1f7c29aae195 100644 --- a/src/pytorch_lightning/__init__.py +++ b/src/pytorch_lightning/__init__.py @@ -31,10 +31,10 @@ def _detail(self: Any, message: str, *args: Any, **kwargs: Any) -> None: _logger.addHandler(logging.StreamHandler()) _logger.propagate = False +from lightning_lite.utilities.seed import seed_everything # noqa: E402 from pytorch_lightning.callbacks import Callback # noqa: E402 from pytorch_lightning.core import LightningDataModule, LightningModule # noqa: E402 from pytorch_lightning.trainer import Trainer # noqa: E402 -from pytorch_lightning.utilities.seed import seed_everything # noqa: E402 __all__ = ["Trainer", "LightningDataModule", "LightningModule", "Callback", "seed_everything"] diff --git a/src/pytorch_lightning/accelerators/cpu.py b/src/pytorch_lightning/accelerators/cpu.py index d0981e7269305..00eeac15ff641 100644 --- a/src/pytorch_lightning/accelerators/cpu.py +++ b/src/pytorch_lightning/accelerators/cpu.py @@ -15,11 +15,11 @@ import torch +from lightning_lite.utilities.device_parser import parse_cpu_cores +from lightning_lite.utilities.types import _DEVICE from pytorch_lightning.accelerators.accelerator import Accelerator -from 
pytorch_lightning.utilities.device_parser import parse_cpu_cores from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _PSUTIL_AVAILABLE -from pytorch_lightning.utilities.types import _DEVICE class CPUAccelerator(Accelerator): diff --git a/src/pytorch_lightning/accelerators/cuda.py b/src/pytorch_lightning/accelerators/cuda.py index 1c69015546976..e5f939c69ac1c 100644 --- a/src/pytorch_lightning/accelerators/cuda.py +++ b/src/pytorch_lightning/accelerators/cuda.py @@ -20,10 +20,10 @@ import torch import pytorch_lightning as pl +from lightning_lite.utilities import device_parser +from lightning_lite.utilities.types import _DEVICE from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.types import _DEVICE _log = logging.getLogger(__name__) diff --git a/src/pytorch_lightning/accelerators/mps.py b/src/pytorch_lightning/accelerators/mps.py index 5ebcb37cd0ed7..5610ba1549da9 100644 --- a/src/pytorch_lightning/accelerators/mps.py +++ b/src/pytorch_lightning/accelerators/mps.py @@ -16,11 +16,11 @@ import torch +from lightning_lite.utilities import device_parser +from lightning_lite.utilities.types import _DEVICE from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _PSUTIL_AVAILABLE, _TORCH_GREATER_EQUAL_1_12 -from pytorch_lightning.utilities.types import _DEVICE # For using the `MPSAccelerator`, user's machine should have `torch>=1.12`, Metal programming framework and # the ARM-based Apple Silicon processors. 
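For illustration (not part of the patch), a sketch of the import rewiring in the accelerators above: downstream code now resolves the device helpers and type aliases from `lightning_lite` rather than `pytorch_lightning.utilities`.

```python
# Illustrative sketch only: the new canonical import locations.
import torch

from lightning_lite.utilities import device_parser
from lightning_lite.utilities.types import _DEVICE

visible_cuda_gpus = device_parser.num_cuda_devices()  # fork-safe count, may be 0
root_device: _DEVICE = "cpu" if visible_cuda_gpus == 0 else torch.device("cuda", 0)
```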
diff --git a/src/pytorch_lightning/accelerators/registry.py b/src/pytorch_lightning/accelerators/registry.py index 992fa34b02aee..74a306df265ca 100644 --- a/src/pytorch_lightning/accelerators/registry.py +++ b/src/pytorch_lightning/accelerators/registry.py @@ -15,9 +15,9 @@ from inspect import getmembers, isclass from typing import Any, Callable, Dict, List, Optional +from lightning_lite.utilities.registry import _is_register_method_overridden from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.registry import _is_register_method_overridden class _AcceleratorRegistry(dict): diff --git a/src/pytorch_lightning/accelerators/tpu.py b/src/pytorch_lightning/accelerators/tpu.py index fa8bd007cb25f..89170e4c924ad 100644 --- a/src/pytorch_lightning/accelerators/tpu.py +++ b/src/pytorch_lightning/accelerators/tpu.py @@ -15,8 +15,8 @@ import torch +from lightning_lite.utilities import device_parser from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.imports import _TPU_AVAILABLE, _XLA_AVAILABLE if _XLA_AVAILABLE: diff --git a/src/pytorch_lightning/callbacks/base.py b/src/pytorch_lightning/callbacks/base.py index d0d564110a348..0504249ea73aa 100644 --- a/src/pytorch_lightning/callbacks/base.py +++ b/src/pytorch_lightning/callbacks/base.py @@ -14,7 +14,7 @@ from typing import Any from pytorch_lightning.callbacks.callback import Callback as NewCallback -from pytorch_lightning.utilities import rank_zero_deprecation +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation class Callback(NewCallback): diff --git a/src/pytorch_lightning/callbacks/early_stopping.py b/src/pytorch_lightning/callbacks/early_stopping.py index 30ab05c76ebf2..6c1a43e1d140c 100644 --- a/src/pytorch_lightning/callbacks/early_stopping.py +++ b/src/pytorch_lightning/callbacks/early_stopping.py @@ -27,9 +27,10 @@ from torch import Tensor import pytorch_lightning as pl +from lightning_lite.utilities.rank_zero import _get_rank from pytorch_lightning.callbacks.callback import Callback from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.rank_zero import _get_rank, rank_zero_warn +from pytorch_lightning.utilities.rank_zero import rank_zero_warn log = logging.getLogger(__name__) diff --git a/src/pytorch_lightning/callbacks/fault_tolerance.py b/src/pytorch_lightning/callbacks/fault_tolerance.py index 9d04fc86b62ce..75347df01b061 100644 --- a/src/pytorch_lightning/callbacks/fault_tolerance.py +++ b/src/pytorch_lightning/callbacks/fault_tolerance.py @@ -21,8 +21,8 @@ from typing import Any import pytorch_lightning as pl +from lightning_lite.utilities.types import _PATH from pytorch_lightning.callbacks import Checkpoint -from pytorch_lightning.utilities.types import _PATH class _FaultToleranceCheckpoint(Checkpoint): diff --git a/src/pytorch_lightning/callbacks/model_checkpoint.py b/src/pytorch_lightning/callbacks/model_checkpoint.py index a80c82447c069..e484cfde5cb8c 100644 --- a/src/pytorch_lightning/callbacks/model_checkpoint.py +++ b/src/pytorch_lightning/callbacks/model_checkpoint.py @@ -36,10 +36,11 @@ import pytorch_lightning as pl from lightning_lite.utilities.cloud_io import get_filesystem +from lightning_lite.utilities.types import _PATH from pytorch_lightning.callbacks import Checkpoint from pytorch_lightning.utilities.exceptions import 
MisconfigurationException from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_info, rank_zero_warn -from pytorch_lightning.utilities.types import _PATH, STEP_OUTPUT +from pytorch_lightning.utilities.types import STEP_OUTPUT log = logging.getLogger(__name__) warning_cache = WarningCache() diff --git a/src/pytorch_lightning/callbacks/stochastic_weight_avg.py b/src/pytorch_lightning/callbacks/stochastic_weight_avg.py index 51cbceb7f9fb6..732c8831b26d1 100644 --- a/src/pytorch_lightning/callbacks/stochastic_weight_avg.py +++ b/src/pytorch_lightning/callbacks/stochastic_weight_avg.py @@ -23,12 +23,13 @@ from torch.optim.swa_utils import SWALR import pytorch_lightning as pl +from lightning_lite.utilities.types import _LRScheduler from pytorch_lightning.callbacks.callback import Callback from pytorch_lightning.strategies import DDPFullyShardedStrategy, DeepSpeedStrategy from pytorch_lightning.strategies.fully_sharded_native import DDPFullyShardedNativeStrategy from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.rank_zero import rank_zero_info, rank_zero_warn -from pytorch_lightning.utilities.types import _LRScheduler, LRSchedulerConfig +from pytorch_lightning.utilities.types import LRSchedulerConfig _AVG_FN = Callable[[Tensor, Tensor, Tensor], Tensor] diff --git a/src/pytorch_lightning/cli.py b/src/pytorch_lightning/cli.py index ee53236508a4a..82156c6b4ab90 100644 --- a/src/pytorch_lightning/cli.py +++ b/src/pytorch_lightning/cli.py @@ -287,7 +287,7 @@ def __init__( this argument will not be configurable from a configuration file and will always be present for this particular CLI. Alternatively, configurable callbacks can be added as explained in :ref:`the CLI docs `. - seed_everything_default: Value for the :func:`~pytorch_lightning.utilities.seed.seed_everything` + seed_everything_default: Value for the :func:`~lightning_lite.utilities.seed.seed_everything` seed argument. Set to True to automatically choose a valid seed. Setting it to False will not call seed_everything. description: Description of the tool shown when running ``--help``. 
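The ``seed_everything_default`` docstring above now points at the relocated helper; a minimal sketch of the new import path introduced by this refactor (assuming ``lightning_lite`` is installed alongside ``pytorch_lightning``, as these diffs imply):

    # new home of the seeding utility per the import moves in this patch
    from lightning_lite.utilities.seed import seed_everything

    # seed the global RNGs (Python, NumPy, torch) before building the Trainer/CLI
    seed_everything(42)
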
diff --git a/src/pytorch_lightning/core/datamodule.py b/src/pytorch_lightning/core/datamodule.py index e4adf9b1ca928..6a5ea13013655 100644 --- a/src/pytorch_lightning/core/datamodule.py +++ b/src/pytorch_lightning/core/datamodule.py @@ -19,6 +19,7 @@ from torch.utils.data import DataLoader, Dataset, IterableDataset import pytorch_lightning as pl +from lightning_lite.utilities.types import _PATH from pytorch_lightning.core.hooks import CheckpointHooks, DataHooks from pytorch_lightning.core.mixins import HyperparametersMixin from pytorch_lightning.core.saving import _load_from_checkpoint @@ -28,7 +29,7 @@ get_init_arguments_and_types, parse_argparser, ) -from pytorch_lightning.utilities.types import _ADD_ARGPARSE_RETURN, _PATH, EVAL_DATALOADERS, TRAIN_DATALOADERS +from pytorch_lightning.utilities.types import _ADD_ARGPARSE_RETURN, EVAL_DATALOADERS, TRAIN_DATALOADERS class LightningDataModule(CheckpointHooks, DataHooks, HyperparametersMixin): diff --git a/src/pytorch_lightning/core/lightning.py b/src/pytorch_lightning/core/lightning.py index bf6fe19c7dfc9..974cecb39edc9 100644 --- a/src/pytorch_lightning/core/lightning.py +++ b/src/pytorch_lightning/core/lightning.py @@ -14,7 +14,7 @@ from typing import Any from pytorch_lightning.core.module import LightningModule as NewLightningModule -from pytorch_lightning.utilities import rank_zero_deprecation +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation class LightningModule(NewLightningModule): diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py index 6776f8ab95bf1..ab655adb4d656 100644 --- a/src/pytorch_lightning/core/module.py +++ b/src/pytorch_lightning/core/module.py @@ -37,6 +37,7 @@ from lightning_lite.utilities.apply_func import convert_to_tensors from lightning_lite.utilities.cloud_io import get_filesystem from lightning_lite.utilities.device_dtype_mixin import _DeviceDtypeModuleMixin +from lightning_lite.utilities.distributed import distributed_available, sync_ddp from pytorch_lightning.callbacks.callback import Callback from pytorch_lightning.core.hooks import CheckpointHooks, DataHooks, ModelHooks from pytorch_lightning.core.mixins import HyperparametersMixin @@ -45,7 +46,6 @@ from pytorch_lightning.loggers import Logger, LoggerCollection from pytorch_lightning.trainer.connectors.logger_connector.fx_validator import _FxValidator from pytorch_lightning.utilities import _IS_WINDOWS, _TORCH_GREATER_EQUAL_1_10, GradClipAlgorithmType -from pytorch_lightning.utilities.distributed import distributed_available, sync_ddp from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_11, _TORCH_GREATER_EQUAL_1_13 from pytorch_lightning.utilities.rank_zero import rank_zero_debug, rank_zero_deprecation, rank_zero_warn diff --git a/src/pytorch_lightning/core/optimizer.py b/src/pytorch_lightning/core/optimizer.py index b96cfabd83b8b..e1a834f8c87ef 100644 --- a/src/pytorch_lightning/core/optimizer.py +++ b/src/pytorch_lightning/core/optimizer.py @@ -21,10 +21,11 @@ from torch.optim import Optimizer import pytorch_lightning as pl +from lightning_lite.utilities.types import _Stateful, ReduceLROnPlateau from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.rank_zero import rank_zero_warn -from pytorch_lightning.utilities.types import _Stateful, LRSchedulerConfig, LRSchedulerTypeTuple, 
ReduceLROnPlateau +from pytorch_lightning.utilities.types import LRSchedulerConfig, LRSchedulerTypeTuple def do_nothing_closure() -> None: diff --git a/src/pytorch_lightning/core/saving.py b/src/pytorch_lightning/core/saving.py index 7d999eebb4828..1bec607fad139 100644 --- a/src/pytorch_lightning/core/saving.py +++ b/src/pytorch_lightning/core/saving.py @@ -29,11 +29,11 @@ import pytorch_lightning as pl from lightning_lite.utilities.cloud_io import get_filesystem from lightning_lite.utilities.cloud_io import load as pl_load +from lightning_lite.utilities.types import _MAP_LOCATION_TYPE, _PATH from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE, AttributeDict from pytorch_lightning.utilities.migration import pl_legacy_patch from pytorch_lightning.utilities.parsing import parse_class_init_keys from pytorch_lightning.utilities.rank_zero import rank_zero_warn -from pytorch_lightning.utilities.types import _MAP_LOCATION_TYPE, _PATH log = logging.getLogger(__name__) PRIMITIVE_TYPES = (bool, int, float, str) diff --git a/src/pytorch_lightning/lite/lite.py b/src/pytorch_lightning/lite/lite.py index 0ec9cf5c2daa4..c301f71d441b8 100644 --- a/src/pytorch_lightning/lite/lite.py +++ b/src/pytorch_lightning/lite/lite.py @@ -25,7 +25,15 @@ from torch.optim import Optimizer from torch.utils.data import BatchSampler, DataLoader, DistributedSampler +from lightning_lite.utilities import _AcceleratorType, _StrategyType, move_data_to_device from lightning_lite.utilities.apply_func import convert_to_tensors +from lightning_lite.utilities.data import ( + _auto_add_worker_init_fn, + _replace_dunder_methods, + _update_dataloader, + has_iterable_dataset, +) +from lightning_lite.utilities.seed import seed_everything from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.lite.wrappers import _LiteDataLoader, _LiteModule, _LiteOptimizer from pytorch_lightning.overrides.distributed import DistributedSamplerWrapper @@ -33,15 +41,7 @@ from pytorch_lightning.strategies import DeepSpeedStrategy, Strategy, TPUSpawnStrategy from pytorch_lightning.strategies.strategy import TBroadcast from pytorch_lightning.trainer.connectors.accelerator_connector import AcceleratorConnector -from pytorch_lightning.utilities import _AcceleratorType, _StrategyType, move_data_to_device -from pytorch_lightning.utilities.data import ( - _auto_add_worker_init_fn, - _replace_dunder_methods, - _update_dataloader, - has_iterable_dataset, -) from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.seed import seed_everything class LightningLite(ABC): diff --git a/src/pytorch_lightning/loops/utilities.py b/src/pytorch_lightning/loops/utilities.py index 9b8ec84ba3661..d5824c431cea6 100644 --- a/src/pytorch_lightning/loops/utilities.py +++ b/src/pytorch_lightning/loops/utilities.py @@ -23,16 +23,16 @@ from torch.utils.data import DataLoader import pytorch_lightning as pl +from lightning_lite.utilities.warnings import PossibleUserWarning from pytorch_lightning.callbacks.timer import Timer from pytorch_lightning.loops import Loop from pytorch_lightning.strategies import ParallelStrategy, Strategy from pytorch_lightning.trainer.progress import BaseProgress -from pytorch_lightning.utilities import rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.memory import recursive_detach +from pytorch_lightning.utilities.rank_zero import rank_zero_warn from 
pytorch_lightning.utilities.signature_utils import is_param_in_hook_signature from pytorch_lightning.utilities.types import STEP_OUTPUT -from pytorch_lightning.utilities.warnings import PossibleUserWarning def check_finite_loss(loss: Optional[Tensor]) -> None: diff --git a/src/pytorch_lightning/overrides/base.py b/src/pytorch_lightning/overrides/base.py index bd2a904de686f..10ab5c06b26f3 100644 --- a/src/pytorch_lightning/overrides/base.py +++ b/src/pytorch_lightning/overrides/base.py @@ -20,7 +20,7 @@ import pytorch_lightning as pl from lightning_lite.utilities.device_dtype_mixin import _DeviceDtypeModuleMixin -from pytorch_lightning.utilities import rank_zero_deprecation +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation class _LightningPrecisionModuleWrapperBase(_DeviceDtypeModuleMixin, torch.nn.Module): diff --git a/src/pytorch_lightning/overrides/fairscale.py b/src/pytorch_lightning/overrides/fairscale.py index 572efd277d316..0a35f9ddd4d8a 100644 --- a/src/pytorch_lightning/overrides/fairscale.py +++ b/src/pytorch_lightning/overrides/fairscale.py @@ -22,8 +22,8 @@ _LightningPrecisionModuleWrapperBase, unwrap_lightning_module, ) -from pytorch_lightning.utilities import rank_zero_deprecation from pytorch_lightning.utilities.imports import _IS_WINDOWS +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation _FAIRSCALE_AVAILABLE = not _IS_WINDOWS and module_available("fairscale.nn") diff --git a/src/pytorch_lightning/plugins/io/checkpoint_plugin.py b/src/pytorch_lightning/plugins/io/checkpoint_plugin.py index 7dcc85042425a..04ace9945ff4e 100644 --- a/src/pytorch_lightning/plugins/io/checkpoint_plugin.py +++ b/src/pytorch_lightning/plugins/io/checkpoint_plugin.py @@ -14,7 +14,7 @@ from abc import ABC, abstractmethod from typing import Any, Dict, Optional -from pytorch_lightning.utilities.types import _PATH +from lightning_lite.utilities.types import _PATH class CheckpointIO(ABC): diff --git a/src/pytorch_lightning/plugins/io/hpu_plugin.py b/src/pytorch_lightning/plugins/io/hpu_plugin.py index 59dfa93219413..9fb564cda7237 100644 --- a/src/pytorch_lightning/plugins/io/hpu_plugin.py +++ b/src/pytorch_lightning/plugins/io/hpu_plugin.py @@ -19,8 +19,8 @@ from lightning_lite.utilities.apply_func import move_data_to_device from lightning_lite.utilities.cloud_io import atomic_save, get_filesystem +from lightning_lite.utilities.types import _PATH from pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO -from pytorch_lightning.utilities.types import _PATH class HPUCheckpointIO(TorchCheckpointIO): diff --git a/src/pytorch_lightning/plugins/io/torch_plugin.py b/src/pytorch_lightning/plugins/io/torch_plugin.py index ccdc4874a197d..723900864c517 100644 --- a/src/pytorch_lightning/plugins/io/torch_plugin.py +++ b/src/pytorch_lightning/plugins/io/torch_plugin.py @@ -18,9 +18,9 @@ import pytorch_lightning as pl from lightning_lite.utilities.cloud_io import atomic_save, get_filesystem from lightning_lite.utilities.cloud_io import load as pl_load +from lightning_lite.utilities.types import _PATH from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.utilities.rank_zero import rank_zero_warn -from pytorch_lightning.utilities.types import _PATH log = logging.getLogger(__name__) diff --git a/src/pytorch_lightning/plugins/io/xla_plugin.py b/src/pytorch_lightning/plugins/io/xla_plugin.py index 791e1e068384a..88d8c2bcb7481 100644 --- a/src/pytorch_lightning/plugins/io/xla_plugin.py +++ 
b/src/pytorch_lightning/plugins/io/xla_plugin.py @@ -17,9 +17,9 @@ from lightning_utilities.core.apply_func import apply_to_collection from lightning_lite.utilities.cloud_io import get_filesystem +from lightning_lite.utilities.types import _PATH from pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE, _TPU_AVAILABLE -from pytorch_lightning.utilities.types import _PATH if _TPU_AVAILABLE: import torch_xla.core.xla_model as xm diff --git a/src/pytorch_lightning/plugins/precision/apex_amp.py b/src/pytorch_lightning/plugins/precision/apex_amp.py index d85dceb53a069..0416e216f6834 100644 --- a/src/pytorch_lightning/plugins/precision/apex_amp.py +++ b/src/pytorch_lightning/plugins/precision/apex_amp.py @@ -18,10 +18,10 @@ from torch.optim import LBFGS, Optimizer import pytorch_lightning as pl +from lightning_lite.utilities.types import _PARAMETERS from pytorch_lightning.plugins.precision.mixed import MixedPrecisionPlugin from pytorch_lightning.utilities import _APEX_AVAILABLE, AMPType from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.types import _PARAMETERS if _APEX_AVAILABLE: from apex import amp diff --git a/src/pytorch_lightning/plugins/precision/deepspeed.py b/src/pytorch_lightning/plugins/precision/deepspeed.py index 1a83e9538d688..658e66cd1b7ad 100644 --- a/src/pytorch_lightning/plugins/precision/deepspeed.py +++ b/src/pytorch_lightning/plugins/precision/deepspeed.py @@ -20,9 +20,9 @@ from torch.optim import LBFGS, Optimizer import pytorch_lightning as pl +from lightning_lite.utilities.enums import AMPType, PrecisionType from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin from pytorch_lightning.utilities import GradClipAlgorithmType -from pytorch_lightning.utilities.enums import AMPType, PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _APEX_AVAILABLE from pytorch_lightning.utilities.model_helpers import is_overridden diff --git a/src/pytorch_lightning/plugins/precision/fsdp_native_native_amp.py b/src/pytorch_lightning/plugins/precision/fsdp_native_native_amp.py index a5b26d7dec052..ce372a1f04e0c 100644 --- a/src/pytorch_lightning/plugins/precision/fsdp_native_native_amp.py +++ b/src/pytorch_lightning/plugins/precision/fsdp_native_native_amp.py @@ -15,8 +15,8 @@ import torch +from lightning_lite.utilities.enums import PrecisionType from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin -from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12 diff --git a/src/pytorch_lightning/plugins/precision/hpu.py b/src/pytorch_lightning/plugins/precision/hpu.py index 4f8db7dabb460..170372ad4e14a 100644 --- a/src/pytorch_lightning/plugins/precision/hpu.py +++ b/src/pytorch_lightning/plugins/precision/hpu.py @@ -13,8 +13,8 @@ # limitations under the License. 
from typing import Optional, Union +from lightning_lite.utilities.enums import PrecisionType from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin -from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _HPU_AVAILABLE diff --git a/src/pytorch_lightning/plugins/precision/ipu.py b/src/pytorch_lightning/plugins/precision/ipu.py index 34ad358793fc4..2b01dd010fc5f 100644 --- a/src/pytorch_lightning/plugins/precision/ipu.py +++ b/src/pytorch_lightning/plugins/precision/ipu.py @@ -18,9 +18,9 @@ from torch.optim import LBFGS, Optimizer import pytorch_lightning as pl +from lightning_lite.utilities.enums import PrecisionType from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin from pytorch_lightning.utilities import GradClipAlgorithmType -from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.model_helpers import is_overridden diff --git a/src/pytorch_lightning/plugins/precision/precision_plugin.py b/src/pytorch_lightning/plugins/precision/precision_plugin.py index 285a0f31e3955..063c8cabb7a22 100644 --- a/src/pytorch_lightning/plugins/precision/precision_plugin.py +++ b/src/pytorch_lightning/plugins/precision/precision_plugin.py @@ -21,9 +21,9 @@ from torch.optim import Optimizer import pytorch_lightning as pl +from lightning_lite.utilities.types import _PARAMETERS from pytorch_lightning.core.hooks import CheckpointHooks from pytorch_lightning.utilities import grad_norm, GradClipAlgorithmType -from pytorch_lightning.utilities.types import _PARAMETERS class PrecisionPlugin(CheckpointHooks): diff --git a/src/pytorch_lightning/profiler/advanced.py b/src/pytorch_lightning/profiler/advanced.py index 1d2bbed5d96f6..d0456f7afa303 100644 --- a/src/pytorch_lightning/profiler/advanced.py +++ b/src/pytorch_lightning/profiler/advanced.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from pytorch_lightning.profilers.advanced import AdvancedProfiler as NewAdvancedProfiler -from pytorch_lightning.utilities import rank_zero_deprecation +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation class AdvancedProfiler(NewAdvancedProfiler): diff --git a/src/pytorch_lightning/profiler/profiler.py b/src/pytorch_lightning/profiler/profiler.py index 84bea3ecae238..40d18e79a3284 100644 --- a/src/pytorch_lightning/profiler/profiler.py +++ b/src/pytorch_lightning/profiler/profiler.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from pytorch_lightning.profilers.profiler import Profiler as NewProfiler -from pytorch_lightning.utilities import rank_zero_deprecation +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation class Profiler(NewProfiler): diff --git a/src/pytorch_lightning/profiler/pytorch.py b/src/pytorch_lightning/profiler/pytorch.py index d443059912602..488ce3b654673 100644 --- a/src/pytorch_lightning/profiler/pytorch.py +++ b/src/pytorch_lightning/profiler/pytorch.py @@ -14,7 +14,7 @@ from pytorch_lightning.profilers.pytorch import PyTorchProfiler as NewPyTorchProfiler from pytorch_lightning.profilers.pytorch import RegisterRecordFunction as NewRegisterRecordFuncion from pytorch_lightning.profilers.pytorch import ScheduleWrapper as NewScheduleWrapper -from pytorch_lightning.utilities import rank_zero_deprecation +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation class RegisterRecordFunction(NewRegisterRecordFuncion): diff --git a/src/pytorch_lightning/profiler/simple.py b/src/pytorch_lightning/profiler/simple.py index 61ef7da8ae0f4..9438f516b2c93 100644 --- a/src/pytorch_lightning/profiler/simple.py +++ b/src/pytorch_lightning/profiler/simple.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from pytorch_lightning.profilers.simple import SimpleProfiler as NewSimpleProfiler -from pytorch_lightning.utilities import rank_zero_deprecation +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation class SimpleProfiler(NewSimpleProfiler): diff --git a/src/pytorch_lightning/profiler/xla.py b/src/pytorch_lightning/profiler/xla.py index dde858e99eeaa..0cdc0196001ff 100644 --- a/src/pytorch_lightning/profiler/xla.py +++ b/src/pytorch_lightning/profiler/xla.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from pytorch_lightning.profilers.xla import XLAProfiler as NewXLAProfiler -from pytorch_lightning.utilities import rank_zero_deprecation +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation class XLAProfiler(NewXLAProfiler): diff --git a/src/pytorch_lightning/profilers/pytorch.py b/src/pytorch_lightning/profilers/pytorch.py index f410230668a9b..c7f34fdc79d9c 100644 --- a/src/pytorch_lightning/profilers/pytorch.py +++ b/src/pytorch_lightning/profilers/pytorch.py @@ -24,8 +24,8 @@ from torch import nn, Tensor from torch.autograd.profiler import record_function +from lightning_lite.utilities.device_parser import is_cuda_available from pytorch_lightning.profilers.profiler import Profiler -from pytorch_lightning.utilities.device_parser import is_cuda_available from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _KINETO_AVAILABLE from pytorch_lightning.utilities.rank_zero import rank_zero_warn diff --git a/src/pytorch_lightning/strategies/bagua.py b/src/pytorch_lightning/strategies/bagua.py index fd5f7b13195e6..a54267a32b7b7 100644 --- a/src/pytorch_lightning/strategies/bagua.py +++ b/src/pytorch_lightning/strategies/bagua.py @@ -8,6 +8,9 @@ from torch.nn import Module import pytorch_lightning as pl +from lightning_lite.utilities.distributed import ReduceOp +from lightning_lite.utilities.optimizer import optimizers_to_device +from lightning_lite.utilities.seed import reset_seed from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO @@ -15,10 +18,7 @@ from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.strategies.strategy import TBroadcast from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities.distributed import ReduceOp from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.optimizer import optimizers_to_device -from pytorch_lightning.utilities.seed import reset_seed _BAGUA_AVAILABLE = package_available("bagua") diff --git a/src/pytorch_lightning/strategies/ddp.py b/src/pytorch_lightning/strategies/ddp.py index 2cfdbab357c70..c0eaf47ff8485 100644 --- a/src/pytorch_lightning/strategies/ddp.py +++ b/src/pytorch_lightning/strategies/ddp.py @@ -29,6 +29,15 @@ from torch.optim.optimizer import Optimizer import pytorch_lightning as pl +from lightning_lite.utilities.distributed import ( + _get_process_group_backend_from_env, + distributed_available, + get_default_process_group_backend_for_device, +) +from lightning_lite.utilities.distributed import group as _group +from lightning_lite.utilities.distributed import init_dist_connection, ReduceOp, sync_ddp_if_available +from lightning_lite.utilities.optimizer import optimizers_to_device +from lightning_lite.utilities.seed import reset_seed from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.overrides.base import _LightningPrecisionModuleWrapperBase @@ -41,23 +50,10 @@ from pytorch_lightning.strategies.parallel import ParallelStrategy from pytorch_lightning.strategies.strategy import TBroadcast from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities.distributed import ( - 
_get_process_group_backend_from_env, - distributed_available, - get_default_process_group_backend_for_device, -) -from pytorch_lightning.utilities.distributed import group as _group -from pytorch_lightning.utilities.distributed import ( - init_dist_connection, - ReduceOp, - register_ddp_comm_hook, - sync_ddp_if_available, -) +from pytorch_lightning.utilities.distributed import register_ddp_comm_hook from pytorch_lightning.utilities.exceptions import DeadlockDetectedException from pytorch_lightning.utilities.imports import _IS_WINDOWS, _TORCH_GREATER_EQUAL_1_10, _TORCH_GREATER_EQUAL_1_11 -from pytorch_lightning.utilities.optimizer import optimizers_to_device from pytorch_lightning.utilities.rank_zero import rank_zero_info, rank_zero_only, rank_zero_warn -from pytorch_lightning.utilities.seed import reset_seed from pytorch_lightning.utilities.types import PredictStep, STEP_OUTPUT, TestStep, ValidationStep if _FAIRSCALE_AVAILABLE: diff --git a/src/pytorch_lightning/strategies/ddp_spawn.py b/src/pytorch_lightning/strategies/ddp_spawn.py index 2eea8f11f1975..35d90498131df 100644 --- a/src/pytorch_lightning/strategies/ddp_spawn.py +++ b/src/pytorch_lightning/strategies/ddp_spawn.py @@ -24,6 +24,14 @@ from typing_extensions import Literal import pytorch_lightning as pl +from lightning_lite.utilities.distributed import ( + _get_process_group_backend_from_env, + distributed_available, + get_default_process_group_backend_for_device, +) +from lightning_lite.utilities.distributed import group as _group +from lightning_lite.utilities.distributed import init_dist_connection, ReduceOp, sync_ddp_if_available +from lightning_lite.utilities.optimizer import optimizers_to_device from pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.overrides.base import _LightningPrecisionModuleWrapperBase from pytorch_lightning.overrides.distributed import prepare_for_backward @@ -34,20 +42,8 @@ from pytorch_lightning.strategies.parallel import ParallelStrategy from pytorch_lightning.strategies.strategy import TBroadcast from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities.distributed import ( - _get_process_group_backend_from_env, - distributed_available, - get_default_process_group_backend_for_device, -) -from pytorch_lightning.utilities.distributed import group as _group -from pytorch_lightning.utilities.distributed import ( - init_dist_connection, - ReduceOp, - register_ddp_comm_hook, - sync_ddp_if_available, -) +from pytorch_lightning.utilities.distributed import register_ddp_comm_hook from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_11 -from pytorch_lightning.utilities.optimizer import optimizers_to_device from pytorch_lightning.utilities.rank_zero import rank_zero_info, rank_zero_only from pytorch_lightning.utilities.types import PredictStep, STEP_OUTPUT, TestStep, ValidationStep diff --git a/src/pytorch_lightning/strategies/deepspeed.py b/src/pytorch_lightning/strategies/deepspeed.py index 1d1c687507690..46634f00123db 100644 --- a/src/pytorch_lightning/strategies/deepspeed.py +++ b/src/pytorch_lightning/strategies/deepspeed.py @@ -30,6 +30,15 @@ from torch.optim import Optimizer import pytorch_lightning as pl +from lightning_lite.utilities.distributed import ( + _get_process_group_backend_from_env, + get_default_process_group_backend_for_device, + log, +) +from lightning_lite.utilities.enums import AMPType, PrecisionType +from lightning_lite.utilities.optimizer import optimizers_to_device +from lightning_lite.utilities.seed 
import reset_seed +from lightning_lite.utilities.types import _LRScheduler, _PATH, ReduceLROnPlateau from pytorch_lightning.accelerators.cuda import CUDAAccelerator from pytorch_lightning.core.optimizer import _init_optimizers_and_lr_schedulers from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase @@ -39,18 +48,10 @@ from pytorch_lightning.strategies.utils import _fp_to_half from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities import GradClipAlgorithmType -from pytorch_lightning.utilities.distributed import ( - _get_process_group_backend_from_env, - get_default_process_group_backend_for_device, - log, -) -from pytorch_lightning.utilities.enums import AMPType, PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.model_helpers import is_overridden -from pytorch_lightning.utilities.optimizer import optimizers_to_device from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_info, rank_zero_warn -from pytorch_lightning.utilities.seed import reset_seed -from pytorch_lightning.utilities.types import _LRScheduler, _PATH, LRSchedulerConfig, ReduceLROnPlateau, STEP_OUTPUT +from pytorch_lightning.utilities.types import LRSchedulerConfig, STEP_OUTPUT warning_cache = WarningCache() diff --git a/src/pytorch_lightning/strategies/dp.py b/src/pytorch_lightning/strategies/dp.py index a377171982f28..1724f0021db63 100644 --- a/src/pytorch_lightning/strategies/dp.py +++ b/src/pytorch_lightning/strategies/dp.py @@ -19,13 +19,13 @@ from torch.nn import DataParallel, Module import pytorch_lightning as pl +from lightning_lite.utilities.distributed import ReduceOp from pytorch_lightning.overrides.base import _LightningPrecisionModuleWrapperBase from pytorch_lightning.overrides.data_parallel import LightningParallelModule from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.parallel import ParallelStrategy from pytorch_lightning.strategies.strategy import TBroadcast, TReduce -from pytorch_lightning.utilities.distributed import ReduceOp from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.types import STEP_OUTPUT diff --git a/src/pytorch_lightning/strategies/fully_sharded.py b/src/pytorch_lightning/strategies/fully_sharded.py index a364d7d19a679..add78dc35e41f 100644 --- a/src/pytorch_lightning/strategies/fully_sharded.py +++ b/src/pytorch_lightning/strategies/fully_sharded.py @@ -18,6 +18,8 @@ import torch import pytorch_lightning as pl +from lightning_lite.utilities.enums import PrecisionType +from lightning_lite.utilities.optimizer import optimizers_to_device from pytorch_lightning.overrides.base import _LightningModuleWrapperBase from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment @@ -25,10 +27,8 @@ from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.model_helpers import is_overridden -from pytorch_lightning.utilities.optimizer import optimizers_to_device from 
pytorch_lightning.utilities.rank_zero import rank_zero_info from pytorch_lightning.utilities.types import STEP_OUTPUT diff --git a/src/pytorch_lightning/strategies/fully_sharded_native.py b/src/pytorch_lightning/strategies/fully_sharded_native.py index b32f460ee1f3e..ed7c237c9bae5 100644 --- a/src/pytorch_lightning/strategies/fully_sharded_native.py +++ b/src/pytorch_lightning/strategies/fully_sharded_native.py @@ -19,6 +19,14 @@ from torch import Tensor import pytorch_lightning as pl +from lightning_lite.utilities.distributed import ( + _get_process_group_backend_from_env, + get_default_process_group_backend_for_device, +) +from lightning_lite.utilities.distributed import group as _group +from lightning_lite.utilities.distributed import init_dist_connection, ReduceOp, sync_ddp_if_available +from lightning_lite.utilities.optimizer import optimizers_to_device +from lightning_lite.utilities.seed import reset_seed from pytorch_lightning.overrides.base import _LightningModuleWrapperBase from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO @@ -28,19 +36,10 @@ from pytorch_lightning.strategies.parallel import ParallelStrategy from pytorch_lightning.strategies.strategy import TBroadcast from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities import rank_zero_only -from pytorch_lightning.utilities.distributed import ( - _get_process_group_backend_from_env, - get_default_process_group_backend_for_device, -) -from pytorch_lightning.utilities.distributed import group as _group -from pytorch_lightning.utilities.distributed import init_dist_connection, ReduceOp, sync_ddp_if_available from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12 from pytorch_lightning.utilities.model_helpers import is_overridden -from pytorch_lightning.utilities.optimizer import optimizers_to_device -from pytorch_lightning.utilities.rank_zero import rank_zero_info -from pytorch_lightning.utilities.seed import reset_seed +from pytorch_lightning.utilities.rank_zero import rank_zero_info, rank_zero_only from pytorch_lightning.utilities.types import ProcessGroup, STEP_OUTPUT _distributed_available = torch.distributed.is_available() diff --git a/src/pytorch_lightning/strategies/hivemind.py b/src/pytorch_lightning/strategies/hivemind.py index b258fe7f738ad..7cad027ac6aef 100644 --- a/src/pytorch_lightning/strategies/hivemind.py +++ b/src/pytorch_lightning/strategies/hivemind.py @@ -8,14 +8,14 @@ from torch import Tensor import pytorch_lightning as pl +from lightning_lite.utilities.enums import PrecisionType +from lightning_lite.utilities.types import _LRScheduler, ReduceLROnPlateau from pytorch_lightning.strategies.strategy import Strategy, TBroadcast -from pytorch_lightning.utilities import rank_zero_warn from pytorch_lightning.utilities.data import extract_batch_size -from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _HIVEMIND_AVAILABLE from pytorch_lightning.utilities.model_helpers import is_overridden -from pytorch_lightning.utilities.types import _LRScheduler, ReduceLROnPlateau +from pytorch_lightning.utilities.rank_zero import rank_zero_warn if _HIVEMIND_AVAILABLE: import hivemind diff --git a/src/pytorch_lightning/strategies/horovod.py 
b/src/pytorch_lightning/strategies/horovod.py index 6329d1e4091e0..27793306fb28e 100644 --- a/src/pytorch_lightning/strategies/horovod.py +++ b/src/pytorch_lightning/strategies/horovod.py @@ -20,14 +20,14 @@ from torch.optim import Optimizer import pytorch_lightning as pl +from lightning_lite.utilities.distributed import distributed_available +from lightning_lite.utilities.distributed import group as dist_group +from lightning_lite.utilities.distributed import ReduceOp from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.parallel import ParallelStrategy from pytorch_lightning.strategies.strategy import TBroadcast -from pytorch_lightning.utilities.distributed import distributed_available -from pytorch_lightning.utilities.distributed import group as dist_group -from pytorch_lightning.utilities.distributed import ReduceOp from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _HOROVOD_AVAILABLE from pytorch_lightning.utilities.rank_zero import rank_zero_only diff --git a/src/pytorch_lightning/strategies/hpu_parallel.py b/src/pytorch_lightning/strategies/hpu_parallel.py index 3e6f8e932e7c2..e7c18d34713d9 100644 --- a/src/pytorch_lightning/strategies/hpu_parallel.py +++ b/src/pytorch_lightning/strategies/hpu_parallel.py @@ -18,6 +18,7 @@ import torch.distributed import pytorch_lightning as pl +from lightning_lite.utilities.distributed import group as _group from pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.overrides.torch_distributed import broadcast_object_list from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment @@ -26,7 +27,6 @@ from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.ddp import DDPStrategy -from pytorch_lightning.utilities.distributed import group as _group from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _HPU_AVAILABLE, _TORCH_LESSER_EQUAL_1_10_2 from pytorch_lightning.utilities.types import STEP_OUTPUT diff --git a/src/pytorch_lightning/strategies/ipu.py b/src/pytorch_lightning/strategies/ipu.py index 2d976e545deef..69de6049711b7 100644 --- a/src/pytorch_lightning/strategies/ipu.py +++ b/src/pytorch_lightning/strategies/ipu.py @@ -22,6 +22,7 @@ import pytorch_lightning as pl from lightning_lite.utilities.cloud_io import get_filesystem +from lightning_lite.utilities.enums import PrecisionType from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO @@ -32,7 +33,6 @@ from pytorch_lightning.trainer.states import RunningStage, TrainerFn from pytorch_lightning.utilities import _IPU_AVAILABLE, _POPTORCH_AVAILABLE, rank_zero_warn from pytorch_lightning.utilities.data import _get_dataloader_init_args_and_kwargs, _reinstantiate_wrapped_cls -from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.model_helpers import is_overridden from 
pytorch_lightning.utilities.rank_zero import rank_zero_deprecation diff --git a/src/pytorch_lightning/strategies/launchers/multiprocessing.py b/src/pytorch_lightning/strategies/launchers/multiprocessing.py index 6bf81eb72d69c..31508067abf36 100644 --- a/src/pytorch_lightning/strategies/launchers/multiprocessing.py +++ b/src/pytorch_lightning/strategies/launchers/multiprocessing.py @@ -27,13 +27,13 @@ import pytorch_lightning as pl from lightning_lite.utilities.apply_func import move_data_to_device +from lightning_lite.utilities.seed import _collect_rng_states, _set_rng_states +from lightning_lite.utilities.types import _PATH from pytorch_lightning.strategies.launchers.base import _Launcher from pytorch_lightning.strategies.strategy import Strategy from pytorch_lightning.trainer.states import TrainerFn, TrainerState from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_11 from pytorch_lightning.utilities.rank_zero import rank_zero_debug -from pytorch_lightning.utilities.seed import _collect_rng_states, _set_rng_states -from pytorch_lightning.utilities.types import _PATH class _MultiProcessingLauncher(_Launcher): diff --git a/src/pytorch_lightning/strategies/parallel.py b/src/pytorch_lightning/strategies/parallel.py index 0790b5e75e077..124d01f362fef 100644 --- a/src/pytorch_lightning/strategies/parallel.py +++ b/src/pytorch_lightning/strategies/parallel.py @@ -19,17 +19,17 @@ from torch import Tensor import pytorch_lightning as pl -from pytorch_lightning.plugins import LayerSync -from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment -from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO -from pytorch_lightning.plugins.precision import PrecisionPlugin -from pytorch_lightning.strategies.strategy import Strategy -from pytorch_lightning.utilities.distributed import ( +from lightning_lite.utilities.distributed import ( _get_process_group_backend_from_env, all_gather_ddp_if_available, get_default_process_group_backend_for_device, ReduceOp, ) +from pytorch_lightning.plugins import LayerSync +from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment +from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO +from pytorch_lightning.plugins.precision import PrecisionPlugin +from pytorch_lightning.strategies.strategy import Strategy from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation diff --git a/src/pytorch_lightning/strategies/sharded.py b/src/pytorch_lightning/strategies/sharded.py index 22a1c22e96398..df0d126385f32 100644 --- a/src/pytorch_lightning/strategies/sharded.py +++ b/src/pytorch_lightning/strategies/sharded.py @@ -19,14 +19,14 @@ from torch.optim import Optimizer import pytorch_lightning as pl +from lightning_lite.utilities.enums import PrecisionType +from lightning_lite.utilities.optimizer import optimizers_to_device from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.optimizer import optimizers_to_device if _FAIRSCALE_AVAILABLE: from fairscale.nn.data_parallel.sharded_ddp import 
ShardedDataParallel diff --git a/src/pytorch_lightning/strategies/sharded_spawn.py b/src/pytorch_lightning/strategies/sharded_spawn.py index b5cd9497a3640..438f6d5eb6a47 100644 --- a/src/pytorch_lightning/strategies/sharded_spawn.py +++ b/src/pytorch_lightning/strategies/sharded_spawn.py @@ -19,12 +19,12 @@ from torch.optim import Optimizer import pytorch_lightning as pl +from lightning_lite.utilities.optimizer import optimizers_to_device from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.strategies.ddp_spawn import DDPSpawnStrategy from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.optimizer import optimizers_to_device if _FAIRSCALE_AVAILABLE: from fairscale.nn.data_parallel.sharded_ddp import ShardedDataParallel diff --git a/src/pytorch_lightning/strategies/single_device.py b/src/pytorch_lightning/strategies/single_device.py index cb436fded86d0..a9d5d7ca87fd0 100644 --- a/src/pytorch_lightning/strategies/single_device.py +++ b/src/pytorch_lightning/strategies/single_device.py @@ -19,10 +19,10 @@ from torch import Tensor import pytorch_lightning as pl +from lightning_lite.utilities.types import _DEVICE from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.strategy import Strategy, TBroadcast -from pytorch_lightning.utilities.types import _DEVICE class SingleDeviceStrategy(Strategy): diff --git a/src/pytorch_lightning/strategies/single_hpu.py b/src/pytorch_lightning/strategies/single_hpu.py index 45eb8c58f2cd4..5c29829fa6ce9 100644 --- a/src/pytorch_lightning/strategies/single_hpu.py +++ b/src/pytorch_lightning/strategies/single_hpu.py @@ -15,6 +15,7 @@ from typing import Dict, Optional import pytorch_lightning as pl +from lightning_lite.utilities.types import _DEVICE from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.io.hpu_plugin import HPUCheckpointIO from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO @@ -22,7 +23,7 @@ from pytorch_lightning.strategies.single_device import SingleDeviceStrategy from pytorch_lightning.utilities import _HPU_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.types import _DEVICE, STEP_OUTPUT +from pytorch_lightning.utilities.types import STEP_OUTPUT if _HPU_AVAILABLE: import habana_frameworks.torch.core as htcore diff --git a/src/pytorch_lightning/strategies/strategy.py b/src/pytorch_lightning/strategies/strategy.py index 0a10722166f8d..bb63c602690d4 100644 --- a/src/pytorch_lightning/strategies/strategy.py +++ b/src/pytorch_lightning/strategies/strategy.py @@ -24,6 +24,9 @@ import pytorch_lightning as pl from lightning_lite.utilities.apply_func import move_data_to_device +from lightning_lite.utilities.distributed import ReduceOp +from lightning_lite.utilities.optimizer import optimizer_to_device, optimizers_to_device +from lightning_lite.utilities.types import _PATH from pytorch_lightning.core.optimizer import _init_optimizers_and_lr_schedulers, LightningOptimizer from pytorch_lightning.plugins import TorchCheckpointIO from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO @@ -31,10 +34,7 @@ from 
pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.launchers.base import _Launcher from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities.distributed import ReduceOp -from pytorch_lightning.utilities.optimizer import optimizer_to_device, optimizers_to_device from pytorch_lightning.utilities.types import ( - _PATH, LRSchedulerConfig, PredictStep, STEP_OUTPUT, diff --git a/src/pytorch_lightning/strategies/strategy_registry.py b/src/pytorch_lightning/strategies/strategy_registry.py index 7dee7146d415d..43089b735aca0 100644 --- a/src/pytorch_lightning/strategies/strategy_registry.py +++ b/src/pytorch_lightning/strategies/strategy_registry.py @@ -15,9 +15,9 @@ from inspect import getmembers, isclass from typing import Any, Callable, Dict, List, Optional +from lightning_lite.utilities.registry import _is_register_method_overridden from pytorch_lightning.strategies.strategy import Strategy from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.registry import _is_register_method_overridden class _StrategyRegistry(dict): diff --git a/src/pytorch_lightning/strategies/tpu_spawn.py b/src/pytorch_lightning/strategies/tpu_spawn.py index 748406479bf51..52dec94ac3702 100644 --- a/src/pytorch_lightning/strategies/tpu_spawn.py +++ b/src/pytorch_lightning/strategies/tpu_spawn.py @@ -22,6 +22,10 @@ from torch.utils.data import DataLoader import pytorch_lightning as pl +from lightning_lite.utilities.data import has_len +from lightning_lite.utilities.distributed import ReduceOp +from lightning_lite.utilities.optimizer import optimizers_to_device +from lightning_lite.utilities.types import _PATH from pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.plugins.environments import XLAEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO @@ -34,12 +38,9 @@ from pytorch_lightning.trainer.connectors.data_connector import DataConnector from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities import _TPU_AVAILABLE, find_shared_parameters, set_shared_parameters -from pytorch_lightning.utilities.data import has_len -from pytorch_lightning.utilities.distributed import ReduceOp from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.optimizer import optimizers_to_device from pytorch_lightning.utilities.rank_zero import rank_zero_only -from pytorch_lightning.utilities.types import _PATH, EVAL_DATALOADERS, STEP_OUTPUT, TRAIN_DATALOADERS +from pytorch_lightning.utilities.types import EVAL_DATALOADERS, STEP_OUTPUT, TRAIN_DATALOADERS if _TPU_AVAILABLE: import torch_xla.core.xla_env_vars as xenv diff --git a/src/pytorch_lightning/strategies/utils.py b/src/pytorch_lightning/strategies/utils.py index ec7a1bd6ffb89..3c3ebbe241811 100644 --- a/src/pytorch_lightning/strategies/utils.py +++ b/src/pytorch_lightning/strategies/utils.py @@ -15,7 +15,7 @@ import torch -from pytorch_lightning.utilities.enums import PrecisionType +from lightning_lite.utilities.enums import PrecisionType from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation diff --git a/src/pytorch_lightning/trainer/__init__.py b/src/pytorch_lightning/trainer/__init__.py index 6226a75de42fb..b53effd6e7f85 100644 --- a/src/pytorch_lightning/trainer/__init__.py +++ b/src/pytorch_lightning/trainer/__init__.py @@ -13,7 +13,7 @@ # limitations under the License. 
"""""" +from lightning_lite.utilities.seed import seed_everything from pytorch_lightning.trainer.trainer import Trainer -from pytorch_lightning.utilities.seed import seed_everything __all__ = ["Trainer", "seed_everything"] diff --git a/src/pytorch_lightning/trainer/configuration_validator.py b/src/pytorch_lightning/trainer/configuration_validator.py index 6ec2b15a11c6d..f1d86995d10c2 100644 --- a/src/pytorch_lightning/trainer/configuration_validator.py +++ b/src/pytorch_lightning/trainer/configuration_validator.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import pytorch_lightning as pl +from lightning_lite.utilities.warnings import PossibleUserWarning from pytorch_lightning.accelerators.ipu import IPUAccelerator from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin from pytorch_lightning.strategies import DataParallelStrategy @@ -20,7 +21,6 @@ from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_warn from pytorch_lightning.utilities.signature_utils import is_param_in_hook_signature -from pytorch_lightning.utilities.warnings import PossibleUserWarning def verify_loop_configurations(trainer: "pl.Trainer") -> None: diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index e9183f7f52cf3..f3be6caa5be2e 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -20,6 +20,7 @@ import torch from typing_extensions import Literal +from lightning_lite.utilities import _StrategyType, AMPType, device_parser, LightningEnum from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.accelerators.cuda import CUDAAccelerator @@ -75,15 +76,6 @@ from pytorch_lightning.strategies.ddp_spawn import _DDP_FORK_ALIASES from pytorch_lightning.strategies.launchers.multiprocessing import _is_forking_disabled from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus -from pytorch_lightning.utilities import ( - _StrategyType, - AMPType, - device_parser, - LightningEnum, - rank_zero_deprecation, - rank_zero_info, - rank_zero_warn, -) from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import ( _HOROVOD_AVAILABLE, @@ -93,6 +85,7 @@ _TORCH_GREATER_EQUAL_1_11, _TPU_AVAILABLE, ) +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_info, rank_zero_warn log = logging.getLogger(__name__) diff --git a/src/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/src/pytorch_lightning/trainer/connectors/checkpoint_connector.py index 647d505eb3568..300f3c129243b 100644 --- a/src/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/src/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -24,6 +24,7 @@ import pytorch_lightning as pl from lightning_lite.utilities.cloud_io import get_filesystem +from lightning_lite.utilities.types import _PATH from pytorch_lightning.plugins.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE @@ -31,7 +32,6 @@ from pytorch_lightning.utilities.imports 
import _fault_tolerant_training from pytorch_lightning.utilities.migration import pl_legacy_patch from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_info -from pytorch_lightning.utilities.types import _PATH from pytorch_lightning.utilities.upgrade_checkpoint import KEYS_MAPPING as DEPRECATED_CHECKPOINT_KEYS if _OMEGACONF_AVAILABLE: diff --git a/src/pytorch_lightning/trainer/connectors/data_connector.py b/src/pytorch_lightning/trainer/connectors/data_connector.py index bfb26228e3756..56ba809e105b2 100644 --- a/src/pytorch_lightning/trainer/connectors/data_connector.py +++ b/src/pytorch_lightning/trainer/connectors/data_connector.py @@ -23,20 +23,14 @@ from torch.utils.data.distributed import DistributedSampler import pytorch_lightning as pl +from lightning_lite.utilities.data import _auto_add_worker_init_fn, _replace_dunder_methods, has_iterable_dataset from pytorch_lightning.accelerators.ipu import IPUAccelerator from pytorch_lightning.overrides.distributed import DistributedSamplerWrapper, UnrepeatedDistributedSamplerWrapper from pytorch_lightning.strategies import DDPSpawnStrategy from pytorch_lightning.trainer.states import RunningStage, TrainerFn from pytorch_lightning.trainer.supporters import CombinedLoader, CycleIterator from pytorch_lightning.utilities.auto_restart import _validate_fault_tolerant_automatic -from pytorch_lightning.utilities.data import ( - _auto_add_worker_init_fn, - _is_dataloader_shuffled, - _replace_dunder_methods, - _update_dataloader, - has_iterable_dataset, - has_len_all_ranks, -) +from pytorch_lightning.utilities.data import _is_dataloader_shuffled, _update_dataloader, has_len_all_ranks from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _fault_tolerant_training from pytorch_lightning.utilities.model_helpers import is_overridden diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py index 9f1be4ba4bed6..4d856223d5c10 100644 --- a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -24,8 +24,8 @@ from lightning_lite.utilities.apply_func import move_data_to_device from lightning_lite.utilities.device_dtype_mixin import _DeviceDtypeModuleMixin +from lightning_lite.utilities.distributed import distributed_available from pytorch_lightning.utilities.data import extract_batch_size -from pytorch_lightning.utilities.distributed import distributed_available from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _fault_tolerant_training from pytorch_lightning.utilities.memory import recursive_detach diff --git a/src/pytorch_lightning/trainer/data_loading.py b/src/pytorch_lightning/trainer/data_loading.py index e3a2fd4785f31..3163e7660cb0c 100644 --- a/src/pytorch_lightning/trainer/data_loading.py +++ b/src/pytorch_lightning/trainer/data_loading.py @@ -18,7 +18,7 @@ import pytorch_lightning as pl from pytorch_lightning.trainer.states import RunningStage -from pytorch_lightning.utilities import rank_zero_deprecation +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation class TrainerDataLoadingMixin(ABC): diff --git a/src/pytorch_lightning/trainer/optimizers.py b/src/pytorch_lightning/trainer/optimizers.py index 8e25fb5ac60f7..fcd37c4e272c0 100644 --- a/src/pytorch_lightning/trainer/optimizers.py 
+++ b/src/pytorch_lightning/trainer/optimizers.py @@ -19,7 +19,7 @@ import pytorch_lightning as pl from pytorch_lightning.core.optimizer import _init_optimizers_and_lr_schedulers, LightningOptimizer -from pytorch_lightning.utilities import rank_zero_deprecation +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation class TrainerOptimizersMixin(ABC): diff --git a/src/pytorch_lightning/trainer/supporters.py b/src/pytorch_lightning/trainer/supporters.py index 3be43e37fe338..e183bdcc644d6 100644 --- a/src/pytorch_lightning/trainer/supporters.py +++ b/src/pytorch_lightning/trainer/supporters.py @@ -21,13 +21,13 @@ from torch.utils.data.dataloader import _BaseDataLoaderIter, _MultiProcessingDataLoaderIter, DataLoader from torch.utils.data.dataset import IterableDataset +from lightning_lite.utilities.distributed import distributed_available from pytorch_lightning.utilities.auto_restart import ( _reload_dataloader_state_dict, MergedIteratorState, patch_dataloader_iterator, ) from pytorch_lightning.utilities.data import get_len -from pytorch_lightning.utilities.distributed import distributed_available from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _fault_tolerant_training diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index 72caec41179ed..1859eab265db2 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -39,6 +39,10 @@ import pytorch_lightning as pl from lightning_lite.utilities.cloud_io import get_filesystem +from lightning_lite.utilities.data import _auto_add_worker_init_fn +from lightning_lite.utilities.distributed import distributed_available +from lightning_lite.utilities.types import _PATH +from lightning_lite.utilities.warnings import PossibleUserWarning from pytorch_lightning.accelerators import ( Accelerator, CUDAAccelerator, @@ -102,8 +106,7 @@ parse_env_variables, ) from pytorch_lightning.utilities.auto_restart import _add_capture_metadata_collate -from pytorch_lightning.utilities.data import _auto_add_worker_init_fn, has_len_all_ranks -from pytorch_lightning.utilities.distributed import distributed_available +from pytorch_lightning.utilities.data import has_len_all_ranks from pytorch_lightning.utilities.exceptions import ExitGracefullyException, MisconfigurationException from pytorch_lightning.utilities.imports import _fault_tolerant_training from pytorch_lightning.utilities.model_helpers import is_overridden @@ -111,13 +114,11 @@ from pytorch_lightning.utilities.seed import isolate_rng from pytorch_lightning.utilities.types import ( _EVALUATE_OUTPUT, - _PATH, _PREDICT_OUTPUT, EVAL_DATALOADERS, LRSchedulerConfig, TRAIN_DATALOADERS, ) -from pytorch_lightning.utilities.warnings import PossibleUserWarning log = logging.getLogger(__name__) # warnings to ignore in trainer diff --git a/src/pytorch_lightning/tuner/auto_gpu_select.py b/src/pytorch_lightning/tuner/auto_gpu_select.py index a42e55a61321d..5b165c9d9409c 100644 --- a/src/pytorch_lightning/tuner/auto_gpu_select.py +++ b/src/pytorch_lightning/tuner/auto_gpu_select.py @@ -15,7 +15,7 @@ import torch -from pytorch_lightning.utilities import device_parser +from lightning_lite.utilities import device_parser from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/src/pytorch_lightning/utilities/__init__.py b/src/pytorch_lightning/utilities/__init__.py index a0baa3a85f7b0..c29ed71bd8ca0 100644 --- 
a/src/pytorch_lightning/utilities/__init__.py +++ b/src/pytorch_lightning/utilities/__init__.py @@ -15,15 +15,9 @@ import numpy +from lightning_lite.utilities import AllGatherGrad, AMPType, LightningEnum # noqa: F401 from lightning_lite.utilities.apply_func import move_data_to_device # noqa: F401 -from pytorch_lightning.utilities.distributed import AllGatherGrad # noqa: F401 -from pytorch_lightning.utilities.enums import ( # noqa: F401 - _AcceleratorType, - _StrategyType, - AMPType, - GradClipAlgorithmType, - LightningEnum, -) +from pytorch_lightning.utilities.enums import GradClipAlgorithmType # noqa: F401 from pytorch_lightning.utilities.grads import grad_norm # noqa: F401 from pytorch_lightning.utilities.imports import ( # noqa: F401 _APEX_AVAILABLE, diff --git a/src/pytorch_lightning/utilities/auto_restart.py b/src/pytorch_lightning/utilities/auto_restart.py index 0f6fadfb2657f..d9d8c5da38858 100644 --- a/src/pytorch_lightning/utilities/auto_restart.py +++ b/src/pytorch_lightning/utilities/auto_restart.py @@ -29,11 +29,11 @@ from typing_extensions import TypedDict import pytorch_lightning as pl +from lightning_lite.utilities.types import _Stateful from pytorch_lightning.utilities.distributed import _collect_states_on_rank_zero from pytorch_lightning.utilities.enums import _FaultTolerantMode, AutoRestartBatchKeys from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.seed import _collect_rng_states, _set_rng_states -from pytorch_lightning.utilities.types import _Stateful class _IteratorStateDict(TypedDict): diff --git a/src/pytorch_lightning/utilities/cloud_io.py b/src/pytorch_lightning/utilities/cloud_io.py index 735b2e95ed1dc..4993b8d3d0600 100644 --- a/src/pytorch_lightning/utilities/cloud_io.py +++ b/src/pytorch_lightning/utilities/cloud_io.py @@ -18,7 +18,7 @@ from lightning_lite.utilities.cloud_io import atomic_save as new_atomic_save from lightning_lite.utilities.cloud_io import get_filesystem as new_get_filesystem from lightning_lite.utilities.cloud_io import load as new_load -from pytorch_lightning.utilities import rank_zero_deprecation # TODO(lite): change to lightning_lite.utilities +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation def atomic_save(*args: Any, **kwargs: Any) -> Any: diff --git a/src/pytorch_lightning/utilities/data.py b/src/pytorch_lightning/utilities/data.py index 59068b1a1523b..cf07949461f05 100644 --- a/src/pytorch_lightning/utilities/data.py +++ b/src/pytorch_lightning/utilities/data.py @@ -11,18 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import functools import inspect -import os -from collections import OrderedDict -from contextlib import contextmanager from dataclasses import fields -from functools import partial -from typing import Any, Callable, Dict, Generator, Iterable, Mapping, Optional, Tuple, Type, Union +from typing import Any, Dict, Generator, Iterable, Mapping, Optional, Tuple, Union import torch from lightning_utilities.core.apply_func import is_dataclass_instance -from lightning_utilities.core.inheritance import get_all_subclasses from lightning_utilities.core.rank_zero import WarningCache from torch import Tensor from torch.utils.data import ( @@ -36,13 +30,16 @@ ) import pytorch_lightning as pl +from lightning_lite.utilities import LightningEnum +from lightning_lite.utilities.data import _reinstantiate_wrapped_cls, _replace_value_in_saved_args +from lightning_lite.utilities.data import has_iterable_dataset as new_has_iterable_dataset +from lightning_lite.utilities.data import has_len as new_has_len from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities.auto_restart import CaptureIterableDataset, CaptureMapDataset, FastForwardSampler -from pytorch_lightning.utilities.enums import _FaultTolerantMode, LightningEnum +from pytorch_lightning.utilities.enums import _FaultTolerantMode from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.rank_zero import rank_zero_warn -from pytorch_lightning.utilities.seed import pl_worker_init_function +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_warn BType = Union[Tensor, str, Mapping[Any, "BType"], Iterable["BType"]] @@ -110,33 +107,6 @@ def extract_batch_size(batch: BType) -> int: return batch_size -def has_iterable_dataset(dataloader: DataLoader) -> bool: - return hasattr(dataloader, "dataset") and isinstance(dataloader.dataset, IterableDataset) - - -def has_len(dataloader: Union[DataLoader, Iterable]) -> bool: - """Checks if a given Dataloader has ``__len__`` method implemented i.e. if it is a finite dataloader or - infinite dataloader.""" - try: - # try getting the length - if len(dataloader) == 0: - rank_zero_warn( - f"`{dataloader.__class__.__name__}` returned 0 length. Please make sure this was your intention." - ) - has_len = True - except (TypeError, NotImplementedError): - has_len = False - - if has_len and has_iterable_dataset(dataloader): - rank_zero_warn( - "Your `IterableDataset` has `__len__` defined." - " In combination with multi-process data loading (when num_workers > 1)," - " `__len__` could be inaccurate if each worker is not configured independently" - " to avoid having duplicate data." - ) - return has_len - - def has_len_all_ranks( dataloader: DataLoader, strategy: "pl.Strategy", @@ -171,7 +141,7 @@ def has_len_all_ranks( except (TypeError, NotImplementedError): has_len = False - if has_len and has_iterable_dataset(dataloader): + if has_len and new_has_iterable_dataset(dataloader): rank_zero_warn( "Your `IterableDataset` has `__len__` defined." " In combination with multi-process data loading (when num_workers > 1)," @@ -187,7 +157,7 @@ def get_len(dataloader: DataLoader) -> Union[int, float]: If ``__len__`` method is not implemented, return float('inf'). 
""" - if has_len(dataloader): + if new_has_len(dataloader): return len(dataloader) return float("inf") @@ -409,171 +379,6 @@ def _dataloader_init_kwargs_resolve_sampler( return {"sampler": sampler, "shuffle": False, "batch_sampler": None} -def _replace_value_in_saved_args( - replace_key: str, - replace_value: Any, - args: Tuple[Any, ...], - kwargs: Dict[str, Any], - default_kwargs: Dict[str, Any], - arg_names: Tuple[str, ...], -) -> Tuple[bool, Tuple[Any, ...], Dict[str, Any]]: - """Tries to replace an argument value in a saved list of args and kwargs. - - Returns a tuple indicating success of the operation and modified saved args and kwargs - """ - - if replace_key in arg_names: - replace_index = arg_names.index(replace_key) - args = args[:replace_index] + (replace_value,) + args[replace_index + 1 :] - return True, args, kwargs - elif replace_key in kwargs or replace_key in default_kwargs: - kwargs[replace_key] = replace_value - return True, args, kwargs - - return False, args, kwargs - - -def _auto_add_worker_init_fn(dataloader: DataLoader, rank: int) -> None: - if int(os.environ.get("PL_SEED_WORKERS", 0)) and dataloader.worker_init_fn is None: - dataloader.worker_init_fn = partial(pl_worker_init_function, rank=rank) - - -def _reinstantiate_wrapped_cls(orig_object: Any, *args: Any, explicit_cls: Optional[Type] = None, **kwargs: Any) -> Any: - constructor = type(orig_object) if explicit_cls is None else explicit_cls - - try: - result = constructor(*args, **kwargs) - except TypeError as e: - # improve exception message due to an incorrect implementation of the `DataLoader` where multiple subclass - # `__init__` arguments map to one `DataLoader.__init__` argument - import re - - match = re.match(r".*__init__\(\) got multiple values .* '(\w+)'", str(e)) - if not match: - # an unexpected `TypeError`, continue failure - raise - argument = match.groups()[0] - message = ( - f"The {constructor.__name__} implementation has an error where more than one `__init__` argument" - f" can be passed to its parent's `{argument}=...` `__init__` argument. This is likely caused by allowing" - f" passing both a custom argument that will map to the `{argument}` argument as well as `**kwargs`." - f" `kwargs` should be filtered to make sure they don't contain the `{argument}` key." - " This argument was automatically passed to your object by PyTorch Lightning." 
- ) - raise MisconfigurationException(message) from e - - attrs_record = getattr(orig_object, "__pl_attrs_record", list()) - for args, fn in attrs_record: - fn(result, *args) - - return result - - -def _wrap_init_method(init: Callable, store_explicit_arg: Optional[str] = None) -> Callable: - """Wraps the ``__init__`` method of classes (currently :class:`~torch.utils.data.DataLoader` and - :class:`~torch.utils.data.BatchSampler`) in order to enable re-instantiation of custom subclasses.""" - - @functools.wraps(init) - def wrapper(obj: Any, *args: Any, **kwargs: Any) -> None: - # We need to inspect `init`, as inspecting `obj.__init__` - # can lead to inspecting the wrong function with multiple inheritance - old_inside_init = getattr(obj, "__pl_inside_init", False) - object.__setattr__(obj, "__pl_inside_init", True) - params = inspect.signature(init).parameters - - parameters_defaults = OrderedDict( - (param.name, param.default) - for param in params.values() - if param.name != "self" and param.kind not in (param.VAR_POSITIONAL, param.VAR_KEYWORD) - ) - - param_names = tuple(parameters_defaults)[: len(args)] - - default_kwargs = { - name: value - for name, value in parameters_defaults.items() - if name not in kwargs and name not in param_names and value != inspect.Parameter.empty - } - - if not hasattr(obj, "__pl_saved_args"): - object.__setattr__(obj, "__pl_saved_args", args) - object.__setattr__(obj, "__pl_saved_kwargs", kwargs) - object.__setattr__(obj, "__pl_saved_arg_names", param_names) - object.__setattr__(obj, "__pl_saved_default_kwargs", default_kwargs) - - # We want to use the latest possible value for explicit argument (i.e. ideally what gets passed to base class) - # so that we can be sure, that it will not get changed anymore. - # That is why we are setting this in every `__init__` - if store_explicit_arg is not None: - if store_explicit_arg in param_names: - object.__setattr__(obj, f"__{store_explicit_arg}", args[param_names.index(store_explicit_arg)]) - elif store_explicit_arg in kwargs: - object.__setattr__(obj, f"__{store_explicit_arg}", kwargs[store_explicit_arg]) - - init(obj, *args, **kwargs) - object.__setattr__(obj, "__pl_inside_init", old_inside_init) - - return wrapper - - -def _wrap_attr_method(method: Callable, tag: _WrapAttrTag) -> Callable: - """Wraps the ``__setattr__`` or ``__delattr__`` method of classes (currently :class:`~torch.utils.data.DataLoader` and - :class:`~torch.utils.data.BatchSampler`) in order to enable re-instantiation of custom subclasses.""" - - @functools.wraps(method) - def wrapper(obj: Any, *args: Any): - # First, let's find out if we're the first in inheritance chain calling the patched method. 
- name, *_ = args - prev_call_name, prev_call_method = getattr(obj, "__pl_current_call", (None, "method")) - first_call = not (prev_call_name == name and prev_call_method == tag) - - # Then mark the current called method - object.__setattr__(obj, "__pl_current_call", (name, tag)) - - # call original method - method(obj, *args) - if first_call and not getattr(obj, "__pl_inside_init", True): - # and save the value it was called with to the internal list, - # if we're outside of __init__ and the original call did not fail and we're the first call - attrs_record = getattr(obj, "__pl_attrs_record", list()) - attrs_record.append((args, tag)) - object.__setattr__(obj, "__pl_attrs_record", attrs_record) - object.__setattr__(obj, "__pl_current_call", (prev_call_name, prev_call_method)) - - return wrapper - - -@contextmanager -def _replace_dunder_methods(base_cls: Type, store_explicit_arg: Optional[str] = None) -> Generator[None, None, None]: - """This context manager is used to add support for re-instantiation of custom (subclasses) of `base_cls`. - - It patches the ``__init__``, ``__setattr__`` and ``__delattr__`` methods. - """ - classes = get_all_subclasses(base_cls) | {base_cls} - for cls in classes: - # Check that __init__ belongs to the class - # https://stackoverflow.com/a/5253424 - if "__init__" in cls.__dict__: - cls.__old__init__ = cls.__init__ - cls.__init__ = _wrap_init_method(cls.__init__, store_explicit_arg) - - # we want at least one setattr/delattr in the chain to be patched and it can happen, that none of the subclasses - # implement `__setattr__`/`__delattr__`. Therefore, we are always patching the `base_cls` - for patch_fn_name, tag in (("__setattr__", _WrapAttrTag.SET), ("__delattr__", _WrapAttrTag.DEL)): - if patch_fn_name in cls.__dict__ or cls is base_cls: - saved_name = f"__old{patch_fn_name}" - setattr(cls, saved_name, getattr(cls, patch_fn_name)) - setattr(cls, patch_fn_name, _wrap_attr_method(getattr(cls, patch_fn_name), tag)) - yield - for cls in classes: - for patched_name in ("__setattr__", "__delattr__", "__init__"): - # Check that __old__{init,setattr,delattr} belongs to the class - # https://stackoverflow.com/a/5253424 - if f"__old{patched_name}" in cls.__dict__: - setattr(cls, patched_name, getattr(cls, f"__old{patched_name}")) - delattr(cls, f"__old{patched_name}") - - def _wrap_with_capture_dataset(dataset: Dataset) -> Dataset: if isinstance(dataset, IterableDataset): # wrap the `IterableDataset` into a `CaptureIterableDataset` to record sampler states. @@ -627,3 +432,19 @@ def _is_dataloader_shuffled(dataloader: object) -> bool: if isinstance(sampler, SequentialSampler): return False return isinstance(sampler, RandomSampler) + + +def has_iterable_dataset(*args: Any, **kwargs: Any) -> Any: + rank_zero_deprecation( + "`pytorch_lightning.utilities.data.has_iterable_dataset` has been deprecated in v1.8.0 and will be" + " removed in v1.10.0. Please use `lightning_lite.utilities.data.has_iterable_dataset` instead." + ) + return new_has_iterable_dataset(*args, **kwargs) + + +def has_len(*args: Any, **kwargs: Any) -> Any: + rank_zero_deprecation( + "`pytorch_lightning.utilities.data.has_len` has been deprecated in v1.8.0 and will be" + " removed in v1.10.0. Please use `lightning_lite.utilities.data.has_len` instead." 
+ ) + return new_has_len(*args, **kwargs) diff --git a/src/pytorch_lightning/utilities/deepspeed.py b/src/pytorch_lightning/utilities/deepspeed.py index cfa4e6a2f4d2b..3d5b77e7746c3 100644 --- a/src/pytorch_lightning/utilities/deepspeed.py +++ b/src/pytorch_lightning/utilities/deepspeed.py @@ -19,8 +19,8 @@ import torch +from lightning_lite.utilities.types import _PATH from pytorch_lightning.strategies.deepspeed import _DEEPSPEED_AVAILABLE -from pytorch_lightning.utilities.types import _PATH if _DEEPSPEED_AVAILABLE: from deepspeed.utils.zero_to_fp32 import ( diff --git a/src/pytorch_lightning/utilities/device_parser.py b/src/pytorch_lightning/utilities/device_parser.py index 32f370b5b246e..b1337c25547b6 100644 --- a/src/pytorch_lightning/utilities/device_parser.py +++ b/src/pytorch_lightning/utilities/device_parser.py @@ -11,291 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import multiprocessing -from typing import Any, List, MutableSequence, Optional, Tuple, Union - -import torch -import torch.cuda - -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.strategies.launchers.multiprocessing import _is_forking_disabled +from typing import Any, List, Optional, Union + +from lightning_lite.utilities.device_parser import determine_root_gpu_device as new_determine_root_gpu_device +from lightning_lite.utilities.device_parser import is_cuda_available as new_is_cuda_available +from lightning_lite.utilities.device_parser import num_cuda_devices as new_num_cuda_devices +from lightning_lite.utilities.device_parser import parse_cpu_cores as new_parse_cpu_cores +from lightning_lite.utilities.device_parser import parse_gpu_ids as new_parse_gpu_ids +from lightning_lite.utilities.device_parser import parse_tpu_cores as new_parse_tpu_cores from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.types import _DEVICE - - -def determine_root_gpu_device(gpus: List[_DEVICE]) -> Optional[_DEVICE]: - """ - Args: - gpus: non-empty list of ints representing which gpus to use - - Returns: - designated root GPU device id - - Raises: - TypeError: - If ``gpus`` is not a list - AssertionError: - If GPU list is empty - """ - if gpus is None: - return None - - if not isinstance(gpus, list): - raise TypeError("gpus should be a list") - - assert len(gpus) > 0, "gpus should be a non empty list" - - # set root gpu - root_gpu = gpus[0] - - return root_gpu - - -def parse_gpu_ids( - gpus: Optional[Union[int, str, List[int]]], - include_cuda: bool = False, - include_mps: bool = False, -) -> Optional[List[int]]: - """ - Parses the GPU ids given in the format as accepted by the - :class:`~pytorch_lightning.trainer.Trainer`. - - Args: - gpus: An int -1 or string '-1' indicate that all available GPUs should be used. - A list of unique ints or a string containing list of comma separated unique integers - indicates specific GPUs to use. - An int 0 means that no GPUs should be used. - Any int N > 0 indicates that GPUs [0..N) should be used. - include_cuda: A boolean indicating whether to include cuda devices for gpu parsing. - include_mps: A boolean indicating whether to include mps devices for gpu parsing. 
- - Returns: - a list of gpus to be used or ``None`` if no GPUs were requested - - Raises: - MisconfigurationException: - If no GPUs are available but the value of gpus variable indicates request for GPUs - - .. note:: - ``include_cuda`` and ``include_mps`` default to ``False`` so that you only - have to specify which device type to use and not disabling all the others. - """ - # Check that gpus param is None, Int, String or Sequence of Ints - _check_data_type(gpus) - - # Handle the case when no gpus are requested - if gpus is None or (isinstance(gpus, int) and gpus == 0) or str(gpus).strip() in ("0", "[]"): - return None - - # We know user requested GPUs therefore if some of the - # requested GPUs are not available an exception is thrown. - gpus = _normalize_parse_gpu_string_input(gpus) - gpus = _normalize_parse_gpu_input_to_list(gpus, include_cuda=include_cuda, include_mps=include_mps) - if not gpus: - raise MisconfigurationException("GPUs requested but none are available.") - - if ( - TorchElasticEnvironment.detect() - and len(gpus) != 1 - and len(_get_all_available_gpus(include_cuda=include_cuda, include_mps=include_mps)) == 1 - ): - # omit sanity check on torchelastic as by default shows one visible GPU per process - return gpus - - # Check that gpus are unique. Duplicate gpus are not supported by the backend. - _check_unique(gpus) - - return _sanitize_gpu_ids(gpus, include_cuda=include_cuda, include_mps=include_mps) - - -def parse_tpu_cores(tpu_cores: Optional[Union[int, str, List[int]]]) -> Optional[Union[int, List[int]]]: - """ - Parses the tpu_cores given in the format as accepted by the - :class:`~pytorch_lightning.trainer.Trainer`. - - Args: - tpu_cores: An int 1 or string '1' indicate that 1 core with multi-processing should be used - An int 8 or string '8' indicate that all 8 cores with multi-processing should be used - A list of int or a string containing list of comma separated integer - indicates specific TPU core to use. - - Returns: - a list of tpu_cores to be used or ``None`` if no TPU cores were requested - - Raises: - MisconfigurationException: - If TPU cores aren't 1, 8 or [<1-8>] - """ - _check_data_type(tpu_cores) - - if isinstance(tpu_cores, str): - tpu_cores = _parse_tpu_cores_str(tpu_cores.strip()) - - if not _tpu_cores_valid(tpu_cores): - raise MisconfigurationException("`tpu_cores` can only be 1, 8 or [<1-8>]") - - return tpu_cores - - -def parse_cpu_cores(cpu_cores: Union[int, str, List[int]]) -> int: - """Parses the cpu_cores given in the format as accepted by the ``devices`` argument in the - :class:`~pytorch_lightning.trainer.Trainer`. - - Args: - cpu_cores: An int > 0. - - Returns: - an int representing the number of processes - - Raises: - MisconfigurationException: - If cpu_cores is not an int > 0 - """ - if isinstance(cpu_cores, str) and cpu_cores.strip().isdigit(): - cpu_cores = int(cpu_cores) - - if not isinstance(cpu_cores, int) or cpu_cores <= 0: - raise MisconfigurationException("`devices` selected with `CPUAccelerator` should be an int > 0.") - - return cpu_cores - - -def _normalize_parse_gpu_string_input(s: Union[int, str, List[int]]) -> Union[int, List[int]]: - if not isinstance(s, str): - return s - if s == "-1": - return -1 - if "," in s: - return [int(x.strip()) for x in s.split(",") if len(x) > 0] - return int(s.strip()) - - -def _sanitize_gpu_ids(gpus: List[int], include_cuda: bool = False, include_mps: bool = False) -> List[int]: - """Checks that each of the GPUs in the list is actually available. 
Raises a MisconfigurationException if any of - the GPUs is not available. - - Args: - gpus: list of ints corresponding to GPU indices - - Returns: - unmodified gpus variable - - Raises: - MisconfigurationException: - If machine has fewer available GPUs than requested. - """ - if sum((include_cuda, include_mps)) == 0: - raise ValueError("At least one gpu type should be specified!") - all_available_gpus = _get_all_available_gpus(include_cuda=include_cuda, include_mps=include_mps) - for gpu in gpus: - if gpu not in all_available_gpus: - raise MisconfigurationException( - f"You requested gpu: {gpus}\n But your machine only has: {all_available_gpus}" - ) - return gpus - - -def _normalize_parse_gpu_input_to_list( - gpus: Union[int, List[int], Tuple[int, ...]], include_cuda: bool, include_mps: bool -) -> Optional[List[int]]: - assert gpus is not None - if isinstance(gpus, (MutableSequence, tuple)): - return list(gpus) - - # must be an int - if not gpus: # gpus==0 - return None - if gpus == -1: - return _get_all_available_gpus(include_cuda=include_cuda, include_mps=include_mps) - - return list(range(gpus)) - - -def _get_all_available_gpus(include_cuda: bool = False, include_mps: bool = False) -> List[int]: - """ - Returns: - a list of all available gpus - """ - cuda_gpus = _get_all_available_cuda_gpus() if include_cuda else [] - mps_gpus = _get_all_available_mps_gpus() if include_mps else [] - return cuda_gpus + mps_gpus - - -def _get_all_available_mps_gpus() -> List[int]: - """ - Returns: - a list of all available MPS gpus - """ - # lazy import to avoid circular dependencies - from pytorch_lightning.accelerators.mps import _MPS_AVAILABLE - - return [0] if _MPS_AVAILABLE else [] - - -def _get_all_available_cuda_gpus() -> List[int]: - """ - Returns: - a list of all available CUDA gpus - """ - return list(range(num_cuda_devices())) - - -def _check_unique(device_ids: List[int]) -> None: - """Checks that the device_ids are unique. - - Args: - device_ids: list of ints corresponding to gpus indices - - Raises: - MisconfigurationException: - If ``device_ids`` of GPUs aren't unique - """ - if len(device_ids) != len(set(device_ids)): - raise MisconfigurationException("Device ID's (GPU) must be unique.") - - -def _check_data_type(device_ids: Any) -> None: - """Checks that the device_ids argument is one of None, int, string, or sequence of integers. 
- - Args: - device_ids: gpus/tpu_cores parameter as passed to the Trainer - - Raises: - MisconfigurationException: - If ``device_ids`` of GPU/TPUs aren't ``int``, ``str``, sequence of ``int`` or ``None`` - """ - msg = "Device IDs (GPU/TPU) must be an int, a string, a sequence of ints or None, but you passed" - - if device_ids is None: - return - elif isinstance(device_ids, (MutableSequence, tuple)): - for id_ in device_ids: - if type(id_) is not int: - raise MisconfigurationException(f"{msg} a sequence of {type(id_).__name__}.") - elif type(device_ids) not in (int, str): - raise MisconfigurationException(f"{msg} {type(device_ids).__name__}.") - - -def _tpu_cores_valid(tpu_cores: Any) -> bool: - # allow 1 or 8 cores - if tpu_cores in (1, 8, None): - return True - - # allow picking 1 of 8 indexes - if isinstance(tpu_cores, (list, tuple, set)): - has_1_tpu_idx = len(tpu_cores) == 1 - is_valid_tpu_idx = 1 <= list(tpu_cores)[0] <= 8 - - is_valid_tpu_core_choice = has_1_tpu_idx and is_valid_tpu_idx - return is_valid_tpu_core_choice - - return False - - -def _parse_tpu_cores_str(tpu_cores: str) -> Union[int, List[int]]: - if tpu_cores in ("1", "8"): - return int(tpu_cores) - return [int(x.strip()) for x in tpu_cores.split(",") if len(x) > 0] +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation def parse_hpus(devices: Optional[Union[int, str, List[int]]]) -> Optional[int]: @@ -319,25 +44,49 @@ def parse_hpus(devices: Optional[Union[int, str, List[int]]]) -> Optional[int]: return int(devices) if isinstance(devices, str) else devices -def num_cuda_devices() -> int: - """Returns the number of GPUs available. - - Unlike :func:`torch.cuda.device_count`, this function will do its best not to create a CUDA context for fork - support, if the platform allows it. - """ - if "fork" not in torch.multiprocessing.get_all_start_methods() or _is_forking_disabled(): - return torch.cuda.device_count() - with multiprocessing.get_context("fork").Pool(1) as pool: - return pool.apply(torch.cuda.device_count) +def determine_root_gpu_device(*args: Any, **kwargs: Any) -> Any: + rank_zero_deprecation( + "`pytorch_lightning.utilities.device_parser.determine_root_gpu_device` has been deprecated in v1.8.0 and will" + " be removed in v1.10.0. Please use `lightning_lite.utilities.device_parser.determine_root_gpu_device` instead." + ) + return new_determine_root_gpu_device(*args, **kwargs) def is_cuda_available() -> bool: - """Returns a bool indicating if CUDA is currently available. + rank_zero_deprecation( + "`pytorch_lightning.utilities.device_parser.is_cuda_available` has been deprecated in v1.8.0 and will" + " be removed in v1.10.0. Please use `lightning_lite.utilities.device_parser.is_cuda_available` instead." + ) + return new_is_cuda_available() - Unlike :func:`torch.cuda.is_available`, this function will do its best not to create a CUDA context for fork - support, if the platform allows it. - """ - if "fork" not in torch.multiprocessing.get_all_start_methods() or _is_forking_disabled(): - return torch.cuda.is_available() - with multiprocessing.get_context("fork").Pool(1) as pool: - return pool.apply(torch.cuda.is_available) + +def num_cuda_devices() -> int: + rank_zero_deprecation( + "`pytorch_lightning.utilities.device_parser.num_cuda_devices` has been deprecated in v1.8.0 and will" + " be removed in v1.10.0. Please use `lightning_lite.utilities.device_parser.num_cuda_devices` instead." 
+ ) + return new_num_cuda_devices() + + +def parse_cpu_cores(*args: Any, **kwargs: Any) -> Any: + rank_zero_deprecation( + "`pytorch_lightning.utilities.device_parser.parse_cpu_cores` has been deprecated in v1.8.0 and will" + " be removed in v1.10.0. Please use `lightning_lite.utilities.device_parser.parse_cpu_cores` instead." + ) + return new_parse_cpu_cores(*args, **kwargs) + + +def parse_gpu_ids(*args: Any, **kwargs: Any) -> Any: + rank_zero_deprecation( + "`pytorch_lightning.utilities.device_parser.parse_gpu_ids` has been deprecated in v1.8.0 and will" + " be removed in v1.10.0. Please use `lightning_lite.utilities.device_parser.parse_gpu_ids` instead." + ) + return new_parse_gpu_ids(*args, **kwargs) + + +def parse_tpu_cores(*args: Any, **kwargs: Any) -> Any: + rank_zero_deprecation( + "`pytorch_lightning.utilities.device_parser.parse_tpu_cores` has been deprecated in v1.8.0 and will" + " be removed in v1.10.0. Please use `lightning_lite.utilities.device_parser.parse_tpu_cores` instead." + ) + return new_parse_tpu_cores(*args, **kwargs) diff --git a/src/pytorch_lightning/utilities/distributed.py b/src/pytorch_lightning/utilities/distributed.py index 7b33cb38b6709..6f01a1a5b447e 100644 --- a/src/pytorch_lightning/utilities/distributed.py +++ b/src/pytorch_lightning/utilities/distributed.py @@ -12,211 +12,23 @@ # limitations under the License. """Utilities that can be used with distributed training.""" -import logging -import os -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, Optional import torch -import torch.nn.functional as F -from torch import Tensor from torch.nn.parallel.distributed import DistributedDataParallel -import pytorch_lightning as pl -from pytorch_lightning.utilities.imports import _HPU_AVAILABLE, _TPU_AVAILABLE -from pytorch_lightning.utilities.rank_zero import rank_zero_only # noqa: F401 +from lightning_lite.utilities.distributed import all_gather_ddp_if_available as new_all_gather_ddp_if_available +from lightning_lite.utilities.distributed import distributed_available as new_distributed_available +from lightning_lite.utilities.distributed import gather_all_tensors as new_gather_all_tensors +from lightning_lite.utilities.distributed import ( + get_default_process_group_backend_for_device as new_get_default_process_group_backend_for_device, +) +from lightning_lite.utilities.distributed import init_dist_connection as new_init_dist_connection +from lightning_lite.utilities.distributed import sync_ddp as new_sync_ddp +from lightning_lite.utilities.distributed import sync_ddp_if_available as new_sync_ddp_if_available +from lightning_lite.utilities.distributed import tpu_distributed as new_tpu_distributed from pytorch_lightning.utilities.rank_zero import rank_zero_debug, rank_zero_deprecation, rank_zero_info -if _TPU_AVAILABLE: - import torch_xla.core.xla_model as xm - -if torch.distributed.is_available(): - from torch.distributed import group, ReduceOp - -else: - - class ReduceOp: # type: ignore # (see https://github.com/python/mypy/issues/1153) - SUM = None - - class group: # type: ignore - WORLD = None - - -log = logging.getLogger(__name__) - - -def gather_all_tensors(result: Tensor, group: Optional[Any] = None) -> List[Tensor]: - """Function to gather all tensors from several ddp processes onto a list that is broadcasted to all processes. - - Works on tensors that have the same number of dimensions, but where each dimension may differ. 
In this case - tensors are padded, gathered and then trimmed to secure equal workload for all processes. - - Args: - result: the value to sync - group: the process group to gather results from. Defaults to all processes (world) - - Return: - gathered_result: list with size equal to the process group where - gathered_result[i] corresponds to result tensor from process i - """ - if group is None: - group = torch.distributed.group.WORLD - - # convert tensors to contiguous format - result = result.contiguous() - - world_size = torch.distributed.get_world_size(group) - torch.distributed.barrier(group=group) - - # if the tensor is scalar, things are easy - if result.ndim == 0: - return _simple_gather_all_tensors(result, group, world_size) - - # 1. Gather sizes of all tensors - local_size = torch.tensor(result.shape, device=result.device) - local_sizes = [torch.zeros_like(local_size) for _ in range(world_size)] - torch.distributed.all_gather(local_sizes, local_size, group=group) - max_size = torch.stack(local_sizes).max(dim=0).values - all_sizes_equal = all(all(ls == max_size) for ls in local_sizes) - - # 2. If shapes are all the same, then do a simple gather: - if all_sizes_equal: - return _simple_gather_all_tensors(result, group, world_size) - - # 3. If not, we need to pad each local tensor to maximum size, gather and then truncate - pad_dims = [] - pad_by = (max_size - local_size).detach().cpu() - for val in reversed(pad_by): - pad_dims.append(0) - pad_dims.append(val.item()) - result_padded = F.pad(result, pad_dims) - gathered_result = [torch.zeros_like(result_padded) for _ in range(world_size)] - torch.distributed.all_gather(gathered_result, result_padded, group) - for idx, item_size in enumerate(local_sizes): - slice_param = [slice(dim_size) for dim_size in item_size] - gathered_result[idx] = gathered_result[idx][slice_param] - return gathered_result - - -def _simple_gather_all_tensors(result: Tensor, group: Any, world_size: int) -> List[Tensor]: - gathered_result = [torch.zeros_like(result) for _ in range(world_size)] - torch.distributed.all_gather(gathered_result, result, group) - return gathered_result - - -def distributed_available() -> bool: - return torch.distributed.is_available() and torch.distributed.is_initialized() or tpu_distributed() - - -def sync_ddp_if_available( - result: Tensor, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None -) -> Tensor: - """Function to reduce a tensor across worker processes during distributed training. - - Args: - result: the value to sync and reduce (typically tensor or number) - group: the process group to gather results from. Defaults to all processes (world) - reduce_op: the reduction operation. Defaults to sum. - Can also be a string of 'avg', 'mean' to calculate the mean during reduction. - - Return: - reduced value - """ - if distributed_available(): - return sync_ddp(result, group=group, reduce_op=reduce_op) - return result - - -def sync_ddp(result: Tensor, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None) -> Tensor: - """Function to reduce the tensors from several ddp processes to one main process. - - Args: - result: the value to sync and reduce (typically tensor or number) - group: the process group to gather results from. Defaults to all processes (world) - reduce_op: the reduction operation. Defaults to sum. - Can also be a string of 'avg', 'mean' to calculate the mean during reduction. 
- - Return: - reduced value - """ - divide_by_world_size = False - - if group is None: - group = torch.distributed.group.WORLD - - op: Optional[ReduceOp] - if isinstance(reduce_op, str): - if reduce_op.lower() in ("avg", "mean"): - op = ReduceOp.SUM - divide_by_world_size = True - else: - op = getattr(ReduceOp, reduce_op.upper()) - else: - op = reduce_op - - # WA for HPU. HPU doesn't support Long types, forcefully set it to float - if _HPU_AVAILABLE: - is_hpu_backend = os.environ.get("HCCL_DISTRIBUTED_BACKEND") == "1" - if is_hpu_backend: - if (result.type() == "torch.LongTensor") or (result.type() == "torch.hpu.LongTensor"): - rank_zero_info("Long tensor unsupported on HPU, casting to float") - result = result.float() - - # sync all processes before reduction - torch.distributed.barrier(group=group) - torch.distributed.all_reduce(result, op=op, group=group, async_op=False) - - if divide_by_world_size: - result = result / torch.distributed.get_world_size(group) - - return result - - -class AllGatherGrad(torch.autograd.Function): - @staticmethod - def forward( # type: ignore[override] - ctx: Any, - tensor: Tensor, - group: Optional["torch.distributed.ProcessGroup"] = group.WORLD, - ) -> Tensor: - ctx.group = group - - gathered_tensor = [torch.zeros_like(tensor) for _ in range(torch.distributed.get_world_size())] - - torch.distributed.all_gather(gathered_tensor, tensor, group=group) - gathered_tensor = torch.stack(gathered_tensor, dim=0) - - return gathered_tensor - - @staticmethod - def backward(ctx: Any, *grad_output: Tensor) -> Tuple[Tensor, None]: - grad_output = torch.cat(grad_output) - - torch.distributed.all_reduce(grad_output, op=torch.distributed.ReduceOp.SUM, async_op=False, group=ctx.group) - - return grad_output[torch.distributed.get_rank()], None - - -def all_gather_ddp_if_available( - tensor: Tensor, group: Optional["torch.distributed.ProcessGroup"] = None, sync_grads: bool = False -) -> Tensor: - """Function to gather a tensor from several distributed processes. - - Args: - tensor: tensor of shape (batch, ...) - group: the process group to gather results from. Defaults to all processes (world) - sync_grads: flag that allows users to synchronize gradients for all_gather op - - Return: - A tensor of shape (world_size, batch, ...) - """ - group = group if group is not None else torch.distributed.group.WORLD - if distributed_available(): - if sync_grads: - return AllGatherGrad.apply(tensor, group) - with torch.no_grad(): - return AllGatherGrad.apply(tensor, group) - return tensor - def register_ddp_comm_hook( model: DistributedDataParallel, @@ -319,67 +131,6 @@ def register_ddp_comm_hook( model.register_comm_hook(state=ddp_comm_state, hook=ddp_comm_hook) # type: ignore[operator] -def tpu_distributed() -> bool: - return _TPU_AVAILABLE and xm.xrt_world_size() > 1 - - -def get_default_process_group_backend_for_device(device: torch.device) -> str: - return "nccl" if device.type == "cuda" else "gloo" - - -def _get_process_group_backend_from_env() -> Optional[str]: - torch_backend = os.getenv("PL_TORCH_DISTRIBUTED_BACKEND") - if torch_backend is not None: - rank_zero_deprecation( - "Environment variable `PL_TORCH_DISTRIBUTED_BACKEND`" - " was deprecated in v1.6 and will be removed in v1.8." - " Specify `process_group_backend` directly on the strategy constructor." 
- ) - return torch_backend - - -def init_dist_connection( - cluster_environment: "pl.plugins.environments.ClusterEnvironment", - torch_distributed_backend: str, - global_rank: Optional[int] = None, - world_size: Optional[int] = None, - **kwargs: Any, -) -> None: - """Utility function to initialize distributed connection by setting env variables and initializing the - distributed process group. - - Args: - cluster_environment: ``ClusterEnvironment`` instance - torch_distributed_backend: backend to use (includes `nccl` and `gloo`) - global_rank: rank of the current process - world_size: number of processes in the group - kwargs: kwargs for ``init_process_group`` - - Raises: - RuntimeError: - If ``torch.distributed`` is not available - """ - if not torch.distributed.is_available(): - raise RuntimeError("torch.distributed is not available. Cannot initialize distributed process group") - if torch.distributed.is_initialized(): - log.debug("torch.distributed is already initialized. Exiting early") - return - global_rank = global_rank if global_rank is not None else cluster_environment.global_rank() - world_size = world_size if world_size is not None else cluster_environment.world_size() - os.environ["MASTER_ADDR"] = cluster_environment.main_address - os.environ["MASTER_PORT"] = str(cluster_environment.main_port) - log.info(f"Initializing distributed: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") - torch.distributed.init_process_group(torch_distributed_backend, rank=global_rank, world_size=world_size, **kwargs) - - # on rank=0 let everyone know training is starting - rank_zero_info( - f"{'-' * 100}\n" - f"distributed_backend={torch_distributed_backend}\n" - f"All distributed processes registered. Starting with {world_size} processes\n" - f"{'-' * 100}\n" - ) - - def _broadcast_object_list(obj: Any, rank: int) -> Any: objects = [obj if torch.distributed.get_rank() == rank else None] torch.distributed.broadcast_object_list(objects, src=rank) @@ -397,6 +148,71 @@ def _collect_states_on_rank_zero(state: Dict[str, Any]) -> Dict[int, Any]: states: On global rank 0, a dictionary where the primary keys are the process rank and the values their associated states. Otherwise, returns None. """ - if not distributed_available(): + if not new_distributed_available(): return {0: state} return {rank: _broadcast_object_list(state, rank) for rank in range(torch.distributed.get_world_size())} + + +def all_gather_ddp_if_available(*args: Any, **kwargs: Any) -> Any: + rank_zero_deprecation( + "`pytorch_lightning.utilities.distributed.all_gather_ddp_if_available` has been deprecated in v1.8.0 and will" + " be removed in v1.10.0. Please use `lightning_lite.utilities.distributed.all_gather_ddp_if_available` instead." + ) + return new_all_gather_ddp_if_available(*args, **kwargs) + + +def distributed_available() -> Any: + rank_zero_deprecation( + "`pytorch_lightning.utilities.distributed.distributed_available` has been deprecated in v1.8.0 and will" + " be removed in v1.10.0. Please use `lightning_lite.utilities.distributed.distributed_available` instead." + ) + return new_distributed_available() + + +def gather_all_tensors(*args: Any, **kwargs: Any) -> Any: + rank_zero_deprecation( + "`pytorch_lightning.utilities.distributed.gather_all_tensors` has been deprecated in v1.8.0 and will" + " be removed in v1.10.0. Please use `lightning_lite.utilities.distributed.gather_all_tensors` instead." 
+ ) + return new_gather_all_tensors(*args, **kwargs) + + +def get_default_process_group_backend_for_device(*args: Any, **kwargs: Any) -> Any: + rank_zero_deprecation( + "`pytorch_lightning.utilities.distributed.get_default_process_group_backend_for_device` has been deprecated" + " in v1.8.0 and will be removed in v1.10.0. Please use" + " `lightning_lite.utilities.distributed.get_default_process_group_backend_for_device` instead." + ) + return new_get_default_process_group_backend_for_device(*args, **kwargs) + + +def init_dist_connection(*args: Any, **kwargs: Any) -> Any: + rank_zero_deprecation( + "`pytorch_lightning.utilities.distributed.init_dist_connection` has been deprecated in v1.8.0 and will" + " be removed in v1.10.0. Please use `lightning_lite.utilities.distributed.init_dist_connection` instead." + ) + return new_init_dist_connection(*args, **kwargs) + + +def sync_ddp(*args: Any, **kwargs: Any) -> Any: + rank_zero_deprecation( + "`pytorch_lightning.utilities.distributed.sync_ddp` has been deprecated in v1.8.0 and will" + " be removed in v1.10.0. Please use `lightning_lite.utilities.distributed.sync_ddp` instead." + ) + return new_sync_ddp(*args, **kwargs) + + +def sync_ddp_if_available(*args: Any, **kwargs: Any) -> Any: + rank_zero_deprecation( + "`pytorch_lightning.utilities.distributed.sync_ddp_if_available` has been deprecated in v1.8.0 and will" + " be removed in v1.10.0. Please use `lightning_lite.utilities.distributed.sync_ddp_if_available` instead." + ) + return new_sync_ddp_if_available(*args, **kwargs) + + +def tpu_distributed() -> bool: + rank_zero_deprecation( + "`pytorch_lightning.utilities.distributed.tpu_distributed` has been deprecated in v1.8.0 and will" + " be removed in v1.10.0. Please use `lightning_lite.utilities.distributed.tpu_distributed` instead." + ) + return new_tpu_distributed() diff --git a/src/pytorch_lightning/utilities/enums.py b/src/pytorch_lightning/utilities/enums.py index 03d9b8782e5ab..8a5fe0e35d6b2 100644 --- a/src/pytorch_lightning/utilities/enums.py +++ b/src/pytorch_lightning/utilities/enums.py @@ -15,47 +15,10 @@ from __future__ import annotations import os -from typing import TYPE_CHECKING - -from lightning_utilities.core.enums import StrEnum +from lightning_lite.utilities.enums import AMPType, LightningEnum, PrecisionType # noqa: F401 from pytorch_lightning.utilities.exceptions import MisconfigurationException -if TYPE_CHECKING: - from enum import Enum - - # re-defined because `mypy` infers `StrEnum` as `Any` - class LightningEnum(StrEnum, Enum): - ... - -else: - LightningEnum = StrEnum - - -class AMPType(LightningEnum): - """Type of Automatic Mixed Precission used for training.""" - - APEX = "apex" - NATIVE = "native" - - -class PrecisionType(LightningEnum): - """Type of precision used.""" - - HALF = "16" - FLOAT = "32" - FULL = "64" - BFLOAT = "bf16" - MIXED = "mixed" - - @staticmethod - def supported_type(precision: str | int) -> bool: - return any(x == precision for x in PrecisionType) - - @staticmethod - def supported_types() -> list[str]: - return [x.value for x in PrecisionType] - class GradClipAlgorithmType(LightningEnum): """Define gradient_clip_algorithm types - training-tricks. 
@@ -85,47 +48,6 @@ class AutoRestartBatchKeys(LightningEnum): PL_RESTART_META = "__pl_restart_meta" -class _StrategyType(LightningEnum): - """Define type of training strategy.""" - - DP = "dp" - DDP = "ddp" - DDP_SPAWN = "ddp_spawn" - DDP_FORK = "ddp_fork" - TPU_SPAWN = "tpu_spawn" - DEEPSPEED = "deepspeed" - HOROVOD = "horovod" - DDP_SHARDED = "ddp_sharded" - DDP_SHARDED_SPAWN = "ddp_sharded_spawn" - DDP_FULLY_SHARDED = "ddp_fully_sharded" - BAGUA = "bagua" - HPU_PARALLEL = "hpu_parallel" - - @staticmethod - def interactive_compatible_types() -> list[_StrategyType]: - """Returns a list containing interactive compatible _StrategyTypes.""" - return [ - _StrategyType.DP, - _StrategyType.TPU_SPAWN, - _StrategyType.DDP_FORK, - ] - - def is_interactive_compatible(self) -> bool: - """Returns whether self is interactive compatible.""" - return self in _StrategyType.interactive_compatible_types() - - -class _AcceleratorType(LightningEnum): - """Define Accelerator type by its nature.""" - - CPU = "CPU" - CUDA = "CUDA" - IPU = "IPU" - TPU = "TPU" - HPU = "HPU" - MPS = "MPS" - - class _FaultTolerantMode(LightningEnum): DISABLED = "disabled" diff --git a/src/pytorch_lightning/utilities/exceptions.py b/src/pytorch_lightning/utilities/exceptions.py index ece4629819b33..7a3e20034e6a3 100644 --- a/src/pytorch_lightning/utilities/exceptions.py +++ b/src/pytorch_lightning/utilities/exceptions.py @@ -12,9 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. - -class MisconfigurationException(Exception): - """Exception used to inform users of misuse with PyTorch Lightning.""" +from lightning_lite.utilities.exceptions import MisconfigurationException # noqa: F401 class DeadlockDetectedException(Exception): diff --git a/src/pytorch_lightning/utilities/fetching.py b/src/pytorch_lightning/utilities/fetching.py index ba44e2132a0e0..5dd068af5340d 100644 --- a/src/pytorch_lightning/utilities/fetching.py +++ b/src/pytorch_lightning/utilities/fetching.py @@ -20,6 +20,7 @@ from lightning_utilities.core.apply_func import apply_to_collection, apply_to_collections from torch.utils.data.dataloader import DataLoader +from lightning_lite.utilities.data import has_len from pytorch_lightning.trainer.supporters import CombinedLoader, CycleIterator from pytorch_lightning.utilities.auto_restart import ( _add_capture_metadata_collate, @@ -29,7 +30,6 @@ MergedIteratorState, patch_dataloader_iterator, ) -from pytorch_lightning.utilities.data import has_len from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _fault_tolerant_training diff --git a/src/pytorch_lightning/utilities/meta.py b/src/pytorch_lightning/utilities/meta.py index b1359df852606..6670dc7a63f6f 100644 --- a/src/pytorch_lightning/utilities/meta.py +++ b/src/pytorch_lightning/utilities/meta.py @@ -18,7 +18,7 @@ from torch import Tensor from torch.nn import Module, Parameter -from pytorch_lightning.utilities import rank_zero_deprecation +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation def is_meta_init() -> bool: diff --git a/src/pytorch_lightning/utilities/optimizer.py b/src/pytorch_lightning/utilities/optimizer.py index b13baf25522c1..9b5fe9273f12e 100644 --- a/src/pytorch_lightning/utilities/optimizer.py +++ b/src/pytorch_lightning/utilities/optimizer.py @@ -12,23 +12,24 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Iterable +from typing import Any -from lightning_utilities.core.apply_func import apply_to_collection -from torch import Tensor -from torch.optim import Optimizer +from lightning_lite.utilities.optimizer import optimizer_to_device as new_optimizer_to_device +from lightning_lite.utilities.optimizer import optimizers_to_device as new_optimizers_to_device +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation -from lightning_lite.utilities.apply_func import move_data_to_device -from pytorch_lightning.utilities.types import _DEVICE +def optimizers_to_device(*args: Any, **kwargs: Any) -> None: + rank_zero_deprecation( + "`pytorch_lightning.utilities.optimizer.optimizers_to_device` has been deprecated in v1.8.0 and will be" + " removed in v1.10.0. Please use `lightning_lite.utilities.optimizer.optimizers_to_device` instead." + ) + return new_optimizers_to_device(*args, **kwargs) -def optimizers_to_device(optimizers: Iterable[Optimizer], device: _DEVICE) -> None: - """Moves optimizer states for a sequence of optimizers to the device.""" - for opt in optimizers: - optimizer_to_device(opt, device) - -def optimizer_to_device(optimizer: Optimizer, device: _DEVICE) -> None: - """Moves the state of a single optimizer to the device.""" - for p, v in optimizer.state.items(): - optimizer.state[p] = apply_to_collection(v, Tensor, move_data_to_device, device) +def optimizer_to_device(*args: Any, **kwargs: Any) -> None: + rank_zero_deprecation( + "`pytorch_lightning.utilities.optimizer.optimizer_to_device` has been deprecated in v1.8.0 and will be" + " removed in v1.10.0. Please use `lightning_lite.utilities.optimizer.optimizer_to_device` instead." + ) + return new_optimizer_to_device(*args, **kwargs) diff --git a/src/pytorch_lightning/utilities/rank_zero.py b/src/pytorch_lightning/utilities/rank_zero.py index 156c7c98c5996..70550e43a43bf 100644 --- a/src/pytorch_lightning/utilities/rank_zero.py +++ b/src/pytorch_lightning/utilities/rank_zero.py @@ -11,48 +11,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- """Utilities that can be used for calling functions on a particular rank.""" import logging -import os -from typing import Optional - -import lightning_utilities.core.rank_zero as rank_zero_module -# note: we want to keep these indirections so the `rank_zero_only.rank` is set (on import) for PL users -from lightning_utilities.core.rank_zero import ( # noqa: F401 +# note: we want to keep these indirections so the `rank_zero_module.log` is set (on import) for PL users +# backwards compatibility +from lightning_lite.utilities.rank_zero import LightningDeprecationWarning # noqa: F401 +from lightning_lite.utilities.rank_zero import ( # noqa: F401 rank_zero_debug, rank_zero_deprecation, rank_zero_info, + rank_zero_module, rank_zero_only, rank_zero_warn, ) -import pytorch_lightning as pl - rank_zero_module.log = logging.getLogger(__name__) - - -def _get_rank(trainer: Optional["pl.Trainer"] = None) -> Optional[int]: - if trainer is not None: - return trainer.global_rank - # SLURM_PROCID can be set even if SLURM is not managing the multiprocessing, - # therefore LOCAL_RANK needs to be checked first - rank_keys = ("RANK", "LOCAL_RANK", "SLURM_PROCID", "JSM_NAMESPACE_RANK") - for key in rank_keys: - rank = os.environ.get(key) - if rank is not None: - return int(rank) - # None to differentiate whether an environment variable was set at all - return None - - -# add the attribute to the function but don't overwrite in case Trainer has already set it -rank_zero_only.rank = getattr(rank_zero_only, "rank", _get_rank() or 0) - - -class LightningDeprecationWarning(DeprecationWarning): - """Deprecation warnings raised by PyTorch Lightning.""" - - -rank_zero_module.rank_zero_deprecation_category = LightningDeprecationWarning diff --git a/src/pytorch_lightning/utilities/seed.py b/src/pytorch_lightning/utilities/seed.py index 5c33214cf405a..221ed9d114260 100644 --- a/src/pytorch_lightning/utilities/seed.py +++ b/src/pytorch_lightning/utilities/seed.py @@ -12,135 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. """Utilities to help with reproducibility of models.""" - -import logging -import os -import random from contextlib import contextmanager -from random import getstate as python_get_rng_state -from random import setstate as python_set_rng_state -from typing import Any, Dict, Generator, Optional +from typing import Any, Generator -import numpy as np import torch -from lightning_utilities.core.rank_zero import rank_prefixed_message - -from pytorch_lightning.utilities.rank_zero import _get_rank, rank_zero_only, rank_zero_warn - -log = logging.getLogger(__name__) - -max_seed_value = np.iinfo(np.uint32).max -min_seed_value = np.iinfo(np.uint32).min - - -def seed_everything(seed: Optional[int] = None, workers: bool = False) -> int: - """Function that sets seed for pseudo-random number generators in: pytorch, numpy, python.random In addition, - sets the following environment variables: - - - `PL_GLOBAL_SEED`: will be passed to spawned subprocesses (e.g. ddp_spawn backend). - - `PL_SEED_WORKERS`: (optional) is set to 1 if ``workers=True``. - - Args: - seed: the integer value seed for global random state in Lightning. - If `None`, will read seed from `PL_GLOBAL_SEED` env variable - or select it randomly. - workers: if set to ``True``, will properly configure all dataloaders passed to the - Trainer with a ``worker_init_fn``. If the user already provides such a function - for their dataloaders, setting this argument will have no influence. 
See also: - :func:`~pytorch_lightning.utilities.seed.pl_worker_init_function`. - """ - if seed is None: - env_seed = os.environ.get("PL_GLOBAL_SEED") - if env_seed is None: - seed = _select_seed_randomly(min_seed_value, max_seed_value) - rank_zero_warn(f"No seed found, seed set to {seed}") - else: - try: - seed = int(env_seed) - except ValueError: - seed = _select_seed_randomly(min_seed_value, max_seed_value) - rank_zero_warn(f"Invalid seed found: {repr(env_seed)}, seed set to {seed}") - elif not isinstance(seed, int): - seed = int(seed) - - if not (min_seed_value <= seed <= max_seed_value): - rank_zero_warn(f"{seed} is not in bounds, numpy accepts from {min_seed_value} to {max_seed_value}") - seed = _select_seed_randomly(min_seed_value, max_seed_value) - - log.info(rank_prefixed_message(f"Global seed set to {seed}", _get_rank())) - os.environ["PL_GLOBAL_SEED"] = str(seed) - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - torch.cuda.manual_seed_all(seed) - - os.environ["PL_SEED_WORKERS"] = f"{int(workers)}" - - return seed - - -def _select_seed_randomly(min_seed_value: int = min_seed_value, max_seed_value: int = max_seed_value) -> int: - return random.randint(min_seed_value, max_seed_value) - - -def reset_seed() -> None: - """Reset the seed to the value that :func:`pytorch_lightning.utilities.seed.seed_everything` previously set. - - If :func:`pytorch_lightning.utilities.seed.seed_everything` is unused, this function will do nothing. - """ - seed = os.environ.get("PL_GLOBAL_SEED", None) - if seed is None: - return - workers = os.environ.get("PL_SEED_WORKERS", "0") - seed_everything(int(seed), workers=bool(int(workers))) - - -def pl_worker_init_function(worker_id: int, rank: Optional[int] = None) -> None: # pragma: no cover - """The worker_init_fn that Lightning automatically adds to your dataloader if you previously set the seed with - ``seed_everything(seed, workers=True)``. - - See also the PyTorch documentation on - `randomness in DataLoaders `_. - """ - # implementation notes: https://github.com/pytorch/pytorch/issues/5059#issuecomment-817392562 - global_rank = rank if rank is not None else rank_zero_only.rank - process_seed = torch.initial_seed() - # back out the base seed so we can use all the bits - base_seed = process_seed - worker_id - log.debug( - f"Initializing random number generators of process {global_rank} worker {worker_id} with base seed {base_seed}" - ) - ss = np.random.SeedSequence([base_seed, worker_id, global_rank]) - # use 128 bits (4 x 32-bit words) - np.random.seed(ss.generate_state(4)) - # Spawn distinct SeedSequences for the PyTorch PRNG and the stdlib random module - torch_ss, stdlib_ss = ss.spawn(2) - torch.manual_seed(torch_ss.generate_state(1, dtype=np.uint64)[0]) - # use 128 bits expressed as an integer - stdlib_seed = (stdlib_ss.generate_state(2, dtype=np.uint64).astype(object) * [1 << 64, 1]).sum() - random.seed(stdlib_seed) - - -def _collect_rng_states() -> Dict[str, Any]: - """Collect the global random state of :mod:`torch`, :mod:`torch.cuda`, :mod:`numpy` and Python.""" - return { - "torch": torch.get_rng_state(), - "torch.cuda": torch.cuda.get_rng_state_all(), - "numpy": np.random.get_state(), - "python": python_get_rng_state(), - } - -def _set_rng_states(rng_state_dict: Dict[str, Any]) -> None: - """Set the global random state of :mod:`torch`, :mod:`torch.cuda`, :mod:`numpy` and Python in the current - process.""" - torch.set_rng_state(rng_state_dict["torch"]) - # torch.cuda rng_state is only included since v1.8. 
- if "torch.cuda" in rng_state_dict: - torch.cuda.set_rng_state_all(rng_state_dict["torch.cuda"]) - np.random.set_state(rng_state_dict["numpy"]) - version, state, gauss = rng_state_dict["python"] - python_set_rng_state((version, tuple(state), gauss)) +from lightning_lite.utilities.seed import _collect_rng_states, _set_rng_states +from lightning_lite.utilities.seed import pl_worker_init_function as new_pl_worker_init_function +from lightning_lite.utilities.seed import reset_seed as new_reset_seed +from lightning_lite.utilities.seed import seed_everything as new_seed_everything +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation @contextmanager @@ -161,3 +42,27 @@ def isolate_rng() -> Generator[None, None, None]: states = _collect_rng_states() yield _set_rng_states(states) + + +def seed_everything(*args: Any, **kwargs: Any) -> Any: + rank_zero_deprecation( + "`pytorch_lightning.utilities.seed.seed_everything` has been deprecated in v1.8.0 and will be" + " removed in v1.10.0. Please use `lightning_lite.utilities.seed.seed_everything` instead." + ) + return new_seed_everything(*args, **kwargs) + + +def reset_seed() -> None: + rank_zero_deprecation( + "`pytorch_lightning.utilities.seed.reset_seed` has been deprecated in v1.8.0 and will be" + " removed in v1.10.0. Please use `lightning_lite.utilities.seed.reset_seed` instead." + ) + return new_reset_seed() + + +def pl_worker_init_function(*args: Any, **kwargs: Any) -> None: + rank_zero_deprecation( + "`pytorch_lightning.utilities.seed.pl_worker_init_function` has been deprecated in v1.8.0 and will be" + " removed in v1.10.0. Please use `lightning_lite.utilities.seed.pl_worker_init_function` instead." + ) + return new_pl_worker_init_function(*args, **kwargs) diff --git a/src/pytorch_lightning/utilities/types.py b/src/pytorch_lightning/utilities/types.py index 39b50748099dd..06dea2eebb32b 100644 --- a/src/pytorch_lightning/utilities/types.py +++ b/src/pytorch_lightning/utilities/types.py @@ -19,16 +19,16 @@ from argparse import _ArgumentGroup, ArgumentParser from contextlib import contextmanager from dataclasses import dataclass -from pathlib import Path -from typing import Any, Callable, Dict, Generator, Iterator, List, Mapping, Optional, Sequence, Type, TypeVar, Union +from typing import Any, Dict, Generator, List, Mapping, Optional, Sequence, Type, Union import torch from torch import Tensor -from torch.optim import Optimizer from torch.utils.data import DataLoader from torchmetrics import Metric from typing_extensions import Protocol, runtime_checkable +from lightning_lite.utilities.types import _LRScheduler, ReduceLROnPlateau + if torch.distributed.is_available(): from torch._C._distributed_c10d import ProcessGroup else: @@ -41,8 +41,6 @@ EPOCH_OUTPUT = List[STEP_OUTPUT] _EVALUATE_OUTPUT = List[Dict[str, float]] # 1 dict per DataLoader _PREDICT_OUTPUT = Union[List[Any], List[List[Any]]] -_PARAMETERS = Iterator[torch.nn.Parameter] -_PATH = Union[str, Path] TRAIN_DATALOADERS = Union[ DataLoader, Sequence[DataLoader], @@ -53,8 +51,6 @@ Dict[str, Sequence[DataLoader]], ] EVAL_DATALOADERS = Union[DataLoader, Sequence[DataLoader]] -_DEVICE = Union[torch.device, str, int] -_MAP_LOCATION_TYPE = Optional[Union[_DEVICE, Callable[[_DEVICE], _DEVICE], Dict[_DEVICE, _DEVICE]]] _ADD_ARGPARSE_RETURN = Union[_ArgumentGroup, ArgumentParser] @@ -94,60 +90,6 @@ def predict_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: ... 
-_DictKey = TypeVar("_DictKey") - - -@runtime_checkable -class _Stateful(Protocol[_DictKey]): - """This class is used to detect if an object is stateful using `isinstance(obj, _Stateful)`.""" - - def state_dict(self) -> Dict[_DictKey, Any]: - ... - - def load_state_dict(self, state_dict: Dict[_DictKey, Any]) -> None: - ... - - -# Inferred from `torch.optim.lr_scheduler.pyi` -# Missing attributes were added to improve typing -@runtime_checkable -class _LRScheduler(_Stateful[str], Protocol): - optimizer: Optimizer - base_lrs: List[float] - - def __init__(self, optimizer: Optimizer, *args: Any, **kwargs: Any) -> None: - ... - - def step(self, epoch: Optional[int] = None) -> None: - ... - - -# Inferred from `torch.optim.lr_scheduler.pyi` -# Missing attributes were added to improve typing -@runtime_checkable -class ReduceLROnPlateau(_Stateful[str], Protocol): - in_cooldown: bool - optimizer: Optimizer - - def __init__( - self, - optimizer: Optimizer, - mode: str = ..., - factor: float = ..., - patience: int = ..., - verbose: bool = ..., - threshold: float = ..., - threshold_mode: str = ..., - cooldown: int = ..., - min_lr: float = ..., - eps: float = ..., - ) -> None: - ... - - def step(self, metrics: Union[float, int, Tensor], epoch: Optional[int] = None) -> None: - ... - - # Inferred from `torch.nn.parallel.distributed.pyi` # Missing attributes were added to improve typing @runtime_checkable diff --git a/src/pytorch_lightning/utilities/upgrade_checkpoint.py b/src/pytorch_lightning/utilities/upgrade_checkpoint.py index 804714d82ca2d..6f4dd5ca938dd 100644 --- a/src/pytorch_lightning/utilities/upgrade_checkpoint.py +++ b/src/pytorch_lightning/utilities/upgrade_checkpoint.py @@ -17,9 +17,9 @@ import torch +from lightning_lite.utilities.types import _PATH from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint from pytorch_lightning.utilities.migration import pl_legacy_patch -from pytorch_lightning.utilities.types import _PATH KEYS_MAPPING = { "checkpoint_callback_best_model_score": (ModelCheckpoint, "best_model_score"), diff --git a/src/pytorch_lightning/utilities/warnings.py b/src/pytorch_lightning/utilities/warnings.py index ae608bdbccce7..57b56ba0685c8 100644 --- a/src/pytorch_lightning/utilities/warnings.py +++ b/src/pytorch_lightning/utilities/warnings.py @@ -12,13 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Warning-related utilities.""" -import warnings - -from pytorch_lightning.utilities.rank_zero import LightningDeprecationWarning - -# enable our warnings -warnings.simplefilter("default", category=LightningDeprecationWarning) - - -class PossibleUserWarning(UserWarning): - """Warnings that could be false positives.""" +# backwards compatibility +from lightning_lite.utilities.warnings import PossibleUserWarning # noqa: F401 diff --git a/src/pytorch_lightning/utilities/xla_device.py b/src/pytorch_lightning/utilities/xla_device.py index 1d6347c6e6a25..a515058a63c1f 100644 --- a/src/pytorch_lightning/utilities/xla_device.py +++ b/src/pytorch_lightning/utilities/xla_device.py @@ -18,7 +18,7 @@ from lightning_lite.utilities.xla_device import inner_f as new_inner_f from lightning_lite.utilities.xla_device import pl_multi_process as new_pl_multi_process from lightning_lite.utilities.xla_device import XLADeviceUtils as NewXLADeviceUtils -from pytorch_lightning.utilities import rank_zero_deprecation # TODO(lite): update to lightning_lite.utilities +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation def inner_f(queue: Queue, func: Callable, *args: Any, **kwargs: Any) -> None: # pragma: no cover diff --git a/tests/tests_lite/conftest.py b/tests/tests_lite/conftest.py index fab4ff7e17901..209d6869a18db 100644 --- a/tests/tests_lite/conftest.py +++ b/tests/tests_lite/conftest.py @@ -1,7 +1,94 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import os from typing import List import pytest +import torch.distributed + + +@pytest.fixture(scope="function", autouse=True) +def preserve_global_rank_variable(): + """Ensures that the rank_zero_only.rank global variable gets reset in each test.""" + from lightning_lite.utilities.rank_zero import rank_zero_only + + rank = getattr(rank_zero_only, "rank", None) + yield + if rank is not None: + setattr(rank_zero_only, "rank", rank) + + +@pytest.fixture(scope="function", autouse=True) +def restore_env_variables(): + """Ensures that environment variables set during the test do not leak out.""" + env_backup = os.environ.copy() + yield + leaked_vars = os.environ.keys() - env_backup.keys() + # restore environment as it was before running the test + os.environ.clear() + os.environ.update(env_backup) + # these are currently known leakers - ideally these would not be allowed + # TODO(lite): this list can be trimmed, maybe PL's too after moving tests + allowlist = { + "CUDA_DEVICE_ORDER", + "LOCAL_RANK", + "NODE_RANK", + "WORLD_SIZE", + "MASTER_ADDR", + "MASTER_PORT", + "PL_GLOBAL_SEED", + "PL_SEED_WORKERS", + "HOROVOD_FUSION_THRESHOLD", + "RANK", # set by DeepSpeed + "POPLAR_ENGINE_OPTIONS", # set by IPUStrategy + # set by XLA + "TF2_BEHAVIOR", + "XRT_MESH_SERVICE_ADDRESS", + "XRT_TORCH_DIST_ROOT", + "XRT_MULTI_PROCESSING_DEVICE", + "XRT_SHARD_WORLD_SIZE", + "XRT_LOCAL_WORKER", + "XRT_HOST_WORLD_SIZE", + "XRT_SHARD_ORDINAL", + "XRT_SHARD_LOCAL_ORDINAL", + "TF_CPP_MIN_LOG_LEVEL", + } + leaked_vars.difference_update(allowlist) + assert not leaked_vars, f"test is leaking environment variable(s): {set(leaked_vars)}" + + +@pytest.fixture(scope="function", autouse=True) +def teardown_process_group(): + """Ensures that the distributed process group gets closed before the next test runs.""" + yield + if torch.distributed.is_available() and torch.distributed.is_initialized(): + torch.distributed.destroy_process_group() + + +@pytest.fixture +def caplog(caplog): + """Workaround for https://github.com/pytest-dev/pytest/issues/3697. + + Setting ``filterwarnings`` with pytest breaks ``caplog`` when ``not logger.propagate``. + """ + import logging + + lightning_logger = logging.getLogger("lightning_lite") + propagate = lightning_logger.propagate + lightning_logger.propagate = True + yield caplog + lightning_logger.propagate = propagate def pytest_collection_modifyitems(items: List[pytest.Function], config: pytest.Config) -> None: diff --git a/tests/tests_lite/helpers/runif.py b/tests/tests_lite/helpers/runif.py index fcdca0f9a6d22..7dd9aaf72962c 100644 --- a/tests/tests_lite/helpers/runif.py +++ b/tests/tests_lite/helpers/runif.py @@ -20,8 +20,9 @@ from packaging.version import Version from pkg_resources import get_distribution +from lightning_lite.utilities.imports import _FAIRSCALE_AVAILABLE, _PSUTIL_AVAILABLE, _TPU_AVAILABLE + -# TODO(lite): Add all RunIf conditions once the relevant utilities have moved to lite source dir class RunIf: """RunIf wrapper for simple marking specific cases, fully compatible with pytest.mark:: @@ -38,8 +39,11 @@ def __new__( min_torch: Optional[str] = None, max_torch: Optional[str] = None, min_python: Optional[str] = None, + tpu: bool = False, skip_windows: bool = False, standalone: bool = False, + fairscale: bool = False, + psutil: bool = False, **kwargs, ): """ @@ -49,9 +53,12 @@ def __new__( min_torch: Require that PyTorch is greater or equal than this version. max_torch: Require that PyTorch is less than this version. 
min_python: Require that Python is greater or equal than this version. + tpu: Require that TPU is available. skip_windows: Skip for Windows platform. standalone: Mark the test as standalone, our CI will run it in a separate process. This requires that the ``PL_RUN_STANDALONE_TESTS=1`` environment variable is set. + fairscale: Require that facebookresearch/fairscale is installed. + psutil: Require that psutil is installed. **kwargs: Any :class:`pytest.mark.skipif` keyword arguments. """ conditions = [] @@ -82,6 +89,12 @@ def __new__( conditions.append(sys.platform == "win32") reasons.append("unimplemented on Windows") + if tpu: + conditions.append(not _TPU_AVAILABLE) + reasons.append("TPU") + # used in conftest.py::pytest_collection_modifyitems + kwargs["tpu"] = True + if standalone: env_flag = os.getenv("PL_RUN_STANDALONE_TESTS", "0") conditions.append(env_flag != "1") @@ -89,6 +102,18 @@ def __new__( # used in conftest.py::pytest_collection_modifyitems kwargs["standalone"] = True + if fairscale: + if skip_windows: + raise ValueError( + "`skip_windows` is not necessary when `fairscale` is set as it does not support Windows." + ) + conditions.append(not _FAIRSCALE_AVAILABLE) + reasons.append("Fairscale") + + if psutil: + conditions.append(not _PSUTIL_AVAILABLE) + reasons.append("psutil") + reasons = [rs for cond, rs in zip(conditions, reasons) if cond] return pytest.mark.skipif( *args, condition=any(conditions), reason=f"Requires: [{' + '.join(reasons)}]", **kwargs diff --git a/tests/tests_lite/helpers/utils.py b/tests/tests_lite/helpers/utils.py new file mode 100644 index 0000000000000..2a8294d4d8dfe --- /dev/null +++ b/tests/tests_lite/helpers/utils.py @@ -0,0 +1,31 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
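+# Shared helpers for the tests_lite suite: deterministic re-seeding and assignment of a MASTER_PORT
+# for distributed tests, drawn from a pre-generated pool of random ports.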
+import os
+
+import numpy as np
+
+from lightning_lite.utilities.seed import seed_everything
+
+# generate a list of random ports for each test
+RANDOM_PORTS = list(np.random.randint(12000, 19000, 1000))
+
+
+def reset_seed(seed=0):
+    seed_everything(seed)
+
+
+def set_random_main_port():
+    reset_seed()
+    port = RANDOM_PORTS.pop()
+    os.environ["MASTER_PORT"] = str(port)
diff --git a/tests/tests_lite/utilities/test_data.py b/tests/tests_lite/utilities/test_data.py
new file mode 100644
index 0000000000000..8946ab5001754
--- /dev/null
+++ b/tests/tests_lite/utilities/test_data.py
@@ -0,0 +1,509 @@
+import random
+
+import pytest
+import torch
+from torch.utils.data import BatchSampler, DataLoader, RandomSampler, SequentialSampler
+
+from lightning_lite.utilities.data import (
+    _dataloader_init_kwargs_resolve_sampler,
+    _get_dataloader_init_args_and_kwargs,
+    _replace_dunder_methods,
+    _replace_value_in_saved_args,
+    _update_dataloader,
+    _WrapAttrTag,
+    has_iterable_dataset,
+    has_len,
+)
+from lightning_lite.utilities.exceptions import MisconfigurationException
+
+# TODO(lite): provide boring classes in Lite
+from pytorch_lightning.demos.boring_classes import RandomDataset, RandomIterableDataset
+
+
+def test_has_iterable_dataset():
+    assert has_iterable_dataset(DataLoader(RandomIterableDataset(1, 1)))
+
+    assert not has_iterable_dataset(DataLoader(RandomDataset(1, 1)))
+
+    class MockDatasetWithoutIterableDataset(RandomDataset):
+        def __iter__(self):
+            yield 1
+            return self
+
+    assert not has_iterable_dataset(DataLoader(MockDatasetWithoutIterableDataset(1, 1)))
+
+
+def test_has_len():
+    assert has_len(DataLoader(RandomDataset(1, 1)))
+
+    with pytest.warns(UserWarning, match="`DataLoader` returned 0 length."):
+        assert has_len(DataLoader(RandomDataset(0, 0)))
+
+    assert not has_len(DataLoader(RandomIterableDataset(1, 1)))
+
+
+def test_replace_dunder_methods_multiple_loaders_without_init():
+    """For a class that inherits from a class we are patching but does not define its own `__init__` method (the
+    one we wrap), `hasattr(cls, "__old__init__")` can be True because of the parent class, yet the attribute cannot
+    be deleted on the child because it is owned by the parent. Furthermore, the error occurred only sometimes,
+    because it depends on the order in which we iterate over the set of classes being patched.
+
+    This test simulates the behavior by generating a sufficient number of dummy classes which do not define
+    `__init__` and are children of `DataLoader`. We test that a) the `_replace_dunder_methods` context manager exits cleanly, and
+    b) the mechanism checking for the presence of `__old__init__` works as expected.
+ """ + classes = [DataLoader] + for i in range(100): + classes.append(type(f"DataLoader_{i}", (random.choice(classes),), {})) + + before = {cls: cls.__init__ for cls in classes} + + with _replace_dunder_methods(DataLoader, "dataset"): + for cls in classes[1:]: # First one is `DataLoader` + assert "__old__init__" not in cls.__dict__ + assert hasattr(cls, "__old__init__") + + assert "__old__init__" in DataLoader.__dict__ + assert hasattr(DataLoader, "__old__init__") + + for cls in classes: + assert before[cls] == cls.__init__ + + +class MyBaseDataLoader(DataLoader): + pass + + +class DataLoaderSubclass1(DataLoader): + def __init__(self, attribute1, *args, **kwargs): + self.at1 = attribute1 + super().__init__(*args, **kwargs) + + +class DataLoaderSubclass2(DataLoaderSubclass1): + def __init__(self, attribute2, *args, **kwargs): + self.at2 = attribute2 + super().__init__(attribute2 + "-2", *args, **kwargs) + + +class MyDataLoader(MyBaseDataLoader): + def __init__(self, data: torch.Tensor, *args, **kwargs): + self.data = data + super().__init__(range(data.size(0)), *args, **kwargs) + + +test3_data = torch.randn((10, 20)) + + +class PoptorchDataLoader(DataLoader): + def __init__(self, options, *args, **kwargs): + super().__init__(*args, **kwargs) + self._options = options + + @property + def options(self): + return self._options + + +class IncompleteDataLoader(DataLoader): + def __init__(self, dataset, batch_size, **kwargs): + batch_size = max(batch_size - 5, 0) + super().__init__(dataset, batch_size=batch_size, **kwargs) + + +class WeirdDataLoader1(DataLoader): + def __init__(self, arg1, arg2, **kwargs): + self.arg1 = arg1 + super().__init__(arg2, **kwargs) + + +class WeirdDataLoader2(DataLoader): + def __init__(self, data_part1, data_part2, **kwargs): + data = list(data_part1) + list(data_part2) + super().__init__(data, **kwargs) + + +class NoneDataLoader(DataLoader): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + +class ChangingDataLoader(DataLoader): + def __init__(self, dataset, **kwargs): + super().__init__(list(dataset) + list(range(5, 10)), **kwargs) + + +@pytest.mark.parametrize( + ["cls", "args", "kwargs", "arg_names", "dataset", "checked_values"], + [ + pytest.param( + DataLoaderSubclass1, + ("attribute1",), + dict(dataset=range(4), batch_size=2), + ("attribute1",), + range(4), + dict(batch_size=2, at1="attribute1"), + id="test1", + ), + pytest.param( + DataLoaderSubclass2, + ("attribute2",), + dict(dataset=range(4), batch_size=2), + ("attribute2",), + range(4), + dict(batch_size=2, at1="attribute2-2", at2="attribute2"), + id="test2", + ), + pytest.param( + MyDataLoader, + (test3_data,), + dict(batch_size=2), + ("data",), + range(10), + dict(batch_size=2, data=test3_data), + id="test3", + ), + pytest.param(PoptorchDataLoader, (123, [1]), dict(), ("options",), [1], dict(options=123), id="test4"), + pytest.param( + IncompleteDataLoader, + (range(10),), + dict(batch_size=10), + ("dataset",), + range(10), + dict(batch_size=5), + id="test5", + ), + pytest.param( + WeirdDataLoader1, + (10, range(10)), + dict(batch_size=10), + ("arg1", "arg2"), + range(10), + dict(arg1=10, batch_size=10), + id="test6", + ), + pytest.param( + WeirdDataLoader2, + (range(10), range(10, 20)), + dict(batch_size=10), + ("data_part1", "data_part2"), + list(range(20)), + dict(batch_size=10), + id="test7", + ), + pytest.param(NoneDataLoader, (None,), dict(), (), None, dict(), id="test8"), + pytest.param(ChangingDataLoader, (range(5),), dict(), ("dataset",), list(range(10)), dict(), 
id="test9"), + ], +) +def test_replace_dunder_methods_dataloader(cls, args, kwargs, arg_names, dataset, checked_values): + with _replace_dunder_methods(DataLoader, "dataset"): + dataloader = cls(*args, **kwargs) + + assert dataloader.__pl_saved_args == args + assert dataloader.__pl_saved_kwargs == kwargs + assert dataloader.__pl_saved_arg_names == arg_names + assert dataloader.__pl_saved_default_kwargs == {} + assert dataloader.__dataset == dataset + + assert dataloader.dataset == dataset + + for key, value in checked_values.items(): + dataloader_value = getattr(dataloader, key) + if isinstance(dataloader_value, torch.Tensor): + assert dataloader_value is value + else: + assert dataloader_value == value + + dataloader = _update_dataloader(dataloader, dataloader.sampler) + + assert isinstance(dataloader, cls) + assert not hasattr(dataloader, "__pl_saved_kwargs") + assert not hasattr(dataloader, "__pl_saved_arg_names") + assert not hasattr(dataloader, "__pl_saved_args") + assert not hasattr(dataloader, "__pl_saved_default_kwargs") + assert not hasattr(dataloader, "__dataset") + + assert dataloader.dataset == dataset + + for key, value in checked_values.items(): + dataloader_value = getattr(dataloader, key) + if isinstance(dataloader_value, torch.Tensor): + assert dataloader_value is value + else: + assert dataloader_value == value + + +def test_replace_dunder_methods_extra_kwargs(): + class LoaderSubclass(DataLoader): + def __init__(self, dataset, *args, batch_size=10, **kwargs): + super().__init__(dataset, *args, batch_size=batch_size, **kwargs) + + with _replace_dunder_methods(DataLoader, "dataset"): + dataloader = LoaderSubclass(range(10)) + + assert dataloader.__pl_saved_args == (range(10),) + assert dataloader.__pl_saved_kwargs == {} + assert dataloader.__pl_saved_arg_names == ("dataset",) + assert dataloader.__pl_saved_default_kwargs == {"batch_size": 10} + assert dataloader.__dataset == range(10) + + +def test_replace_dunder_methods_attrs(): + """This test checks, that all the calls from setting and deleting attributes within `_replace_dunder_methods` + are correctly preserved even after reinstantiation. 
+
+    It also includes a custom `__setattr__`.
+    """
+
+    class Loader(DataLoader):
+        def __setattr__(self, attr, val):
+            if attr == "custom_arg":
+                val = val + 2
+            super().__setattr__(attr, val)
+
+    with _replace_dunder_methods(DataLoader, "dataset"):
+        dataloader = Loader(range(10))
+        dataloader.custom_arg = 5
+        dataloader.my_arg = 10
+        dataloader.another_arg = 100
+        del dataloader.dataset
+        try:
+            del dataloader.abc_arg
+        except AttributeError:
+            pass
+
+    assert dataloader.__pl_saved_args == (range(10),)
+    assert dataloader.__pl_saved_kwargs == {}
+    assert dataloader.__pl_saved_arg_names == ("dataset",)
+    assert dataloader.__dataset == range(10)
+    assert dataloader.custom_arg == 7
+    assert dataloader.my_arg == 10
+    assert dataloader.another_arg == 100
+    assert not hasattr(dataloader, "dataset")
+    assert dataloader.__pl_attrs_record == [
+        (("custom_arg", 5), _WrapAttrTag.SET),
+        (("my_arg", 10), _WrapAttrTag.SET),
+        (("another_arg", 100), _WrapAttrTag.SET),
+        (("dataset",), _WrapAttrTag.DEL),
+    ]
+
+    dataloader = _update_dataloader(dataloader, dataloader.sampler)
+    assert dataloader.custom_arg == 7
+    assert dataloader.my_arg == 10
+    assert dataloader.another_arg == 100
+    assert not hasattr(dataloader, "dataset")
+
+
+def test_replace_dunder_methods_restore_methods():
+    """This test checks whether all dunder methods are restored to their original versions."""
+
+    class Init(DataLoader):
+        def __init__(self, *args, **kwargs):
+            super().__init__(*args, **kwargs)
+
+    class SetAttr(DataLoader):
+        def __setattr__(self, *args):
+            return super().__setattr__(*args)
+
+    class DelAttr(DataLoader):
+        def __delattr__(self, *args):
+            return super().__delattr__(*args)
+
+    class InitAndSetAttr(Init, SetAttr):
+        pass
+
+    class InitAndDelAttr(Init, DelAttr):
+        pass
+
+    class SetAttrAndDelAttr(SetAttr, DelAttr):
+        pass
+
+    class AllDunder(Init, SetAttr, DelAttr):
+        pass
+
+    before = dict()
+    for cls in (Init, SetAttr, DelAttr, InitAndSetAttr, InitAndDelAttr, SetAttrAndDelAttr, AllDunder):
+        before[cls] = {"init": cls.__init__, "setattr": cls.__setattr__, "delattr": cls.__delattr__}
+
+    with _replace_dunder_methods(DataLoader, "dataset"):
+        pass
+
+    for cls in (Init, SetAttr, DelAttr, InitAndSetAttr, InitAndDelAttr, SetAttrAndDelAttr, AllDunder):
+        assert before[cls] == {"init": cls.__init__, "setattr": cls.__setattr__, "delattr": cls.__delattr__}
+
+
+@pytest.mark.parametrize(
+    [
+        "args",
+        "kwargs",
+        "default_kwargs",
+        "arg_names",
+        "replace_key",
+        "replace_value",
+        "expected_status",
+        "expected_args",
+        "expected_kwargs",
+    ],
+    [
+        pytest.param((), {}, {}, [], "a", 1, False, (), {}, id="empty"),
+        pytest.param((1,), {}, {}, ["a"], "a", 2, True, (2,), {}, id="simple1"),
+        pytest.param((1, 2, 3), {}, {}, ["a", "b", "c"], "b", False, True, (1, False, 3), {}, id="simple2"),
+        pytest.param((1, 2, 3), {"a": 1}, {}, ["b", "c", "d"], "a", 2, True, (1, 2, 3), {"a": 2}, id="simple_kwargs"),
+        pytest.param(
+            (1, 2, 3),
+            {"a": 1},
+            {"e": 5},
+            ["b", "c", "d"],
+            "e",
+            2,
+            True,
+            (1, 2, 3),
+            {"a": 1, "e": 2},
+            id="default_kwargs",
+        ),
+    ],
+)
+def test_replace_value_in_args(
+    args, kwargs, default_kwargs, arg_names, replace_key, replace_value, expected_status, expected_args, expected_kwargs
+):
+    assert _replace_value_in_saved_args(replace_key, replace_value, args, kwargs, default_kwargs, arg_names) == (
+        expected_status,
+        expected_args,
+        expected_kwargs,
+    )
+
+
+def test_update_dataloader_typerror_custom_exception():
+    class BadStandaloneGoodHookImpl(DataLoader):
+        def __init__(self, foo, *args, **kwargs):
+            self.foo = foo
+            # positional conflict with `dataset`
+            super().__init__(foo, *args, **kwargs)
+
+    dataloader = BadStandaloneGoodHookImpl([1, 2, 3])
+    with pytest.raises(MisconfigurationException, match="implementation has an error.*`dataset`"):
+        _update_dataloader(dataloader, dataloader.sampler)
+
+    with _replace_dunder_methods(DataLoader, "dataset"):
+        dataloader = BadStandaloneGoodHookImpl([1, 2, 3])
+    new_dataloader = _update_dataloader(dataloader, dataloader.sampler)
+    assert isinstance(new_dataloader, BadStandaloneGoodHookImpl)
+
+    class BadImpl(DataLoader):
+        def __init__(self, randomize, *args, **kwargs):
+            self.randomize = randomize
+            # keyword conflict with `shuffle`
+            super().__init__(*args, shuffle=randomize, **kwargs)
+
+    dataloader = BadImpl(False, [])
+    with pytest.raises(MisconfigurationException, match="implementation has an error.*`shuffle`"):
+        _update_dataloader(dataloader, dataloader.sampler)
+
+    class GoodImpl(DataLoader):
+        def __init__(self, randomize, *args, **kwargs):
+            # fixed implementation, kwargs are filtered
+            self.randomize = randomize or kwargs.pop("shuffle", False)
+            super().__init__(*args, shuffle=randomize, **kwargs)
+
+    dataloader = GoodImpl(False, [])
+    new_dataloader = _update_dataloader(dataloader, dataloader.sampler)
+    assert isinstance(new_dataloader, GoodImpl)
+
+
+def test_custom_batch_sampler():
+    """This test asserts that a custom `BatchSampler` is reinstantiated properly, with all the arguments that are
+    required in order to recreate the class.
+
+    It also asserts that during the reinstantiation the `__init__` wrapper is no longer in place, and therefore the
+    `__pl_saved_{args,arg_names,kwargs}` attributes are not set again.
+    """
+
+    class MyBatchSampler(BatchSampler):
+        # Custom batch sampler with an extra argument and a default value
+        def __init__(self, sampler, extra_arg, drop_last=True):
+            self.extra_arg = extra_arg
+            super().__init__(sampler, 10, drop_last)
+
+    sampler = RandomSampler(range(10))
+    with _replace_dunder_methods(BatchSampler):
+        # instantiate within the `_replace_dunder_methods` context manager, simulating the `*_dataloader` hooks
+        batch_sampler = MyBatchSampler(sampler, "random_str")
+
+    dataloader = DataLoader(range(10), batch_sampler=batch_sampler)
+
+    # assert that the passed information got saved
+    assert dataloader.batch_sampler.__pl_saved_args == (sampler, "random_str")
+    assert dataloader.batch_sampler.__pl_saved_kwargs == {}
+    assert dataloader.batch_sampler.__pl_saved_arg_names == ("sampler", "extra_arg")
+    assert dataloader.batch_sampler.__pl_saved_default_kwargs == {"drop_last": True}
+
+    # Update the dataloader, which is what happens when the dataloaders are accessed.
+    # This should not fail, and it would have failed before support for custom args was added.
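+    # `_update_dataloader` re-instantiates the dataloader and its batch sampler from the captured
+    # `__pl_saved_*` state, so `extra_arg` must survive while the bookkeeping attributes are dropped.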
+ dataloader = _update_dataloader(dataloader, dataloader.sampler) + + # Assert the `__init__` method is not replaced anymore and everything is instantiated to correct types + batch_sampler = dataloader.batch_sampler + + assert isinstance(batch_sampler, MyBatchSampler) + + assert batch_sampler.extra_arg == "random_str" + assert not hasattr(batch_sampler, "__pl_saved_kwargs") + assert not hasattr(batch_sampler, "__pl_saved_arg_names") + assert not hasattr(batch_sampler, "__pl_saved_args") + assert not hasattr(batch_sampler, "__pl_saved_default_kwargs") + + +def test_custom_batch_sampler_no_sampler(): + """Tests whether appropriate error is raised when the custom `BatchSampler` does not support sampler + argument.""" + + class MyBatchSampler(BatchSampler): + # Custom batch sampler, without sampler argument. + def __init__(self, extra_arg): + self.extra_arg = extra_arg + super().__init__(RandomSampler(range(10)), 10, False) + + with _replace_dunder_methods(BatchSampler): + # instantiate within `_replace_dunder_method` context manager, simulating `*_dataloader` hooks + batch_sampler = MyBatchSampler("random_str") + dataloader = DataLoader(range(10), batch_sampler=batch_sampler) + + # assert that passed information got saved + assert dataloader.batch_sampler.__pl_saved_args == ("random_str",) + assert dataloader.batch_sampler.__pl_saved_kwargs == {} + assert dataloader.batch_sampler.__pl_saved_arg_names == ("extra_arg",) + assert dataloader.batch_sampler.__pl_saved_default_kwargs == {} + + # Assert that error is raised + with pytest.raises(TypeError, match="sampler into the batch sampler"): + dataloader = _update_dataloader(dataloader, dataloader.sampler) + + +def test_dataloader_disallow_batch_sampler(): + dataset = RandomDataset(5, 100) + dataloader = DataLoader(dataset, batch_size=10) + + # This should not raise + _dataloader_init_kwargs_resolve_sampler(dataloader, dataloader.sampler, disallow_batch_sampler=True) + + dataset = RandomDataset(5, 100) + sampler = SequentialSampler(dataset) + batch_sampler = BatchSampler(sampler, batch_size=10, drop_last=False) + dataloader = DataLoader(dataset, batch_sampler=batch_sampler) + + # this should raise - using batch sampler, that was not automatically instantiated by DataLoader + with pytest.raises(MisconfigurationException, match="when running on multiple IPU devices"): + _dataloader_init_kwargs_resolve_sampler(dataloader, dataloader.sampler, disallow_batch_sampler=True) + + +def test_dataloader_kwargs_replacement_with_iterable_dataset(): + """Test that DataLoader kwargs are not replaced when using Iterable Dataset.""" + dataset = RandomIterableDataset(7, 100) + dataloader = DataLoader(dataset, batch_size=32) + _, dl_kwargs = _get_dataloader_init_args_and_kwargs(dataloader, dataloader.sampler) + assert dl_kwargs["sampler"] is None + assert dl_kwargs["batch_sampler"] is None + assert dl_kwargs["batch_size"] is dataloader.batch_size + assert dl_kwargs["dataset"] is dataloader.dataset + assert dl_kwargs["collate_fn"] is dataloader.collate_fn diff --git a/tests/tests_lite/utilities/test_device_parser.py b/tests/tests_lite/utilities/test_device_parser.py new file mode 100644 index 0000000000000..bb6e1665efde7 --- /dev/null +++ b/tests/tests_lite/utilities/test_device_parser.py @@ -0,0 +1,31 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from unittest import mock + +import pytest +import torch + +import lightning_lite.utilities.device_parser + + +@pytest.mark.skipif( + "fork" in torch.multiprocessing.get_all_start_methods(), reason="Requires platform without forking support" +) +@mock.patch("torch.cuda.is_available", return_value=True) +@mock.patch("torch.cuda.device_count", return_value=2) +def test_num_cuda_devices_without_forking(*_): + """This merely tests that on platforms without fork support our helper functions fall back to the default + implementation for determining cuda availability.""" + assert lightning_lite.utilities.device_parser.is_cuda_available() + assert lightning_lite.utilities.device_parser.num_cuda_devices() == 2 diff --git a/tests/tests_lite/utilities/test_distributed.py b/tests/tests_lite/utilities/test_distributed.py new file mode 100644 index 0000000000000..b09c0487bd62e --- /dev/null +++ b/tests/tests_lite/utilities/test_distributed.py @@ -0,0 +1,63 @@ +import os + +import pytest +import tests_lite.helpers.utils as tutils +import torch +from tests_lite.helpers.runif import RunIf +from torch import multiprocessing as mp + +from lightning_lite.utilities.distributed import gather_all_tensors + + +def _test_all_gather_uneven_tensors(rank, world_size, backend): + os.environ["MASTER_ADDR"] = "localhost" + + if backend == "nccl": + device = torch.device("cuda", rank) + torch.cuda.set_device(device) + else: + device = torch.device("cpu") + + # initialize the process group + torch.distributed.init_process_group(backend, rank=rank, world_size=world_size) + + tensor = torch.ones(rank, device=device) + result = gather_all_tensors(tensor) + assert len(result) == world_size + for idx in range(world_size): + assert len(result[idx]) == idx + assert (result[idx] == torch.ones_like(result[idx])).all() + + +def _test_all_gather_uneven_tensors_multidim(rank, world_size, backend): + os.environ["MASTER_ADDR"] = "localhost" + + if backend == "nccl": + device = torch.device("cuda", rank) + torch.cuda.set_device(device) + else: + device = torch.device("cpu") + + # initialize the process group + torch.distributed.init_process_group(backend, rank=rank, world_size=world_size) + tensor = torch.ones(rank + 1, 2 - rank, device=device) + result = gather_all_tensors(tensor) + assert len(result) == world_size + for idx in range(world_size): + val = result[idx] + assert val.shape == (idx + 1, 2 - idx) + assert (val == torch.ones_like(val)).all() + + +@RunIf(min_torch="1.10", skip_windows=True) +@pytest.mark.parametrize( + "process", + [ + _test_all_gather_uneven_tensors_multidim, + _test_all_gather_uneven_tensors, + ], +) +@pytest.mark.parametrize("backend", [pytest.param("nccl", marks=RunIf(min_cuda_gpus=2)), "gloo"]) +def test_gather_all_tensors(backend, process): + tutils.set_random_main_port() + mp.spawn(process, args=(2, backend), nprocs=2) diff --git a/tests/tests_lite/utilities/test_enums.py b/tests/tests_lite/utilities/test_enums.py new file mode 100644 index 0000000000000..38a556e5dc2f2 --- /dev/null +++ b/tests/tests_lite/utilities/test_enums.py @@ -0,0 +1,9 @@ +from lightning_lite.utilities.enums import 
PrecisionType + + +def test_precision_supported_types(): + assert PrecisionType.supported_types() == ["16", "32", "64", "bf16", "mixed"] + assert PrecisionType.supported_type(16) + assert PrecisionType.supported_type("16") + assert not PrecisionType.supported_type(1) + assert not PrecisionType.supported_type("invalid") diff --git a/tests/tests_lite/utilities/test_imports.py b/tests/tests_lite/utilities/test_imports.py new file mode 100644 index 0000000000000..3a8444ef728ff --- /dev/null +++ b/tests/tests_lite/utilities/test_imports.py @@ -0,0 +1,81 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from lightning_lite.utilities.imports import ( + _APEX_AVAILABLE, + _FAIRSCALE_AVAILABLE, + _HOROVOD_AVAILABLE, + _OMEGACONF_AVAILABLE, + _POPTORCH_AVAILABLE, +) + + +def test_imports(): + try: + import apex # noqa + except ModuleNotFoundError: + assert not _APEX_AVAILABLE + else: + assert _APEX_AVAILABLE + + # TODO(lite): re-enable these once deepspeed strategy has moved + # try: + # import deepspeed + # except ModuleNotFoundError: + # assert not _DEEPSPEED_AVAILABLE + # else: + # assert _DEEPSPEED_AVAILABLE + + try: + import fairscale.nn # noqa + except ModuleNotFoundError: + assert not _FAIRSCALE_AVAILABLE + else: + assert _FAIRSCALE_AVAILABLE + + try: + import horovod.torch # noqa + except ModuleNotFoundError: + assert not _HOROVOD_AVAILABLE + else: + assert _HOROVOD_AVAILABLE + + try: + import omegaconf # noqa + except ModuleNotFoundError: + assert not _OMEGACONF_AVAILABLE + else: + assert _OMEGACONF_AVAILABLE + + try: + import poptorch # noqa + except ModuleNotFoundError: + assert not _POPTORCH_AVAILABLE + else: + assert _POPTORCH_AVAILABLE diff --git a/tests/tests_pytorch/utilities/test_optimizer.py b/tests/tests_lite/utilities/test_optimizer.py similarity index 93% rename from tests/tests_pytorch/utilities/test_optimizer.py rename to tests/tests_lite/utilities/test_optimizer.py index 6d4c0ec54e1c1..09a37a6403afb 100644 --- a/tests/tests_pytorch/utilities/test_optimizer.py +++ b/tests/tests_lite/utilities/test_optimizer.py @@ -2,7 +2,7 @@ import torch -from pytorch_lightning.utilities.optimizer import optimizer_to_device +from lightning_lite.utilities.optimizer import optimizer_to_device def test_optimizer_to_device(): diff --git a/tests/tests_pytorch/utilities/test_rank_zero.py b/tests/tests_lite/utilities/test_rank_zero.py similarity index 65% rename from 
tests/tests_pytorch/utilities/test_rank_zero.py rename to tests/tests_lite/utilities/test_rank_zero.py index c4c15b28e5b73..edf85d7b342ca 100644 --- a/tests/tests_pytorch/utilities/test_rank_zero.py +++ b/tests/tests_lite/utilities/test_rank_zero.py @@ -1,23 +1,10 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. import os import sys from unittest import mock import pytest -from pytorch_lightning.utilities.rank_zero import _get_rank +from lightning_lite.utilities.rank_zero import _get_rank @pytest.mark.parametrize( @@ -39,8 +26,8 @@ def test_rank_zero_known_environment_variables(env_vars, expected): with mock.patch.dict(os.environ, env_vars): # force module reload to re-trigger the rank_zero_only.rank global computation sys.modules.pop("lightning_utilities.core.rank_zero", None) - sys.modules.pop("pytorch_lightning.utilities.rank_zero", None) - from pytorch_lightning.utilities.rank_zero import rank_zero_only + sys.modules.pop("lightning_lite.utilities.rank_zero", None) + from lightning_lite.utilities.rank_zero import rank_zero_only @rank_zero_only def foo(): diff --git a/tests/tests_lite/utilities/test_seed.py b/tests/tests_lite/utilities/test_seed.py new file mode 100644 index 0000000000000..b03aa6d049890 --- /dev/null +++ b/tests/tests_lite/utilities/test_seed.py @@ -0,0 +1,84 @@ +import os +from unittest import mock + +import pytest +import torch + +import lightning_lite.utilities +from lightning_lite.utilities import seed as seed_utils +from lightning_lite.utilities.seed import _collect_rng_states, _set_rng_states + + +@mock.patch.dict(os.environ, {}, clear=True) +def test_seed_stays_same_with_multiple_seed_everything_calls(): + """Ensure that after the initial seed everything, the seed stays the same for the same run.""" + with pytest.warns(UserWarning, match="No seed found"): + lightning_lite.utilities.seed.seed_everything() + initial_seed = os.environ.get("PL_GLOBAL_SEED") + + with pytest.warns(None) as record: + lightning_lite.utilities.seed.seed_everything() + assert not record # does not warn + seed = os.environ.get("PL_GLOBAL_SEED") + + assert initial_seed == seed + + +@mock.patch.dict(os.environ, {"PL_GLOBAL_SEED": "2020"}, clear=True) +def test_correct_seed_with_environment_variable(): + """Ensure that the PL_GLOBAL_SEED environment is read.""" + assert lightning_lite.utilities.seed.seed_everything() == 2020 + + +@mock.patch.dict(os.environ, {"PL_GLOBAL_SEED": "invalid"}, clear=True) +@mock.patch.object(seed_utils, attribute="_select_seed_randomly", new=lambda *_: 123) +def test_invalid_seed(): + """Ensure that we still fix the seed even if an invalid seed is given.""" + with pytest.warns(UserWarning, match="Invalid seed found"): + seed = lightning_lite.utilities.seed.seed_everything() + assert seed == 123 + + +@mock.patch.dict(os.environ, {}, clear=True) +@mock.patch.object(seed_utils, attribute="_select_seed_randomly", new=lambda *_: 123) +@pytest.mark.parametrize("seed", (10e9, -10e9)) +def test_out_of_bounds_seed(seed): + """Ensure 
that we still fix the seed even if an out-of-bounds seed is given.""" + with pytest.warns(UserWarning, match="is not in bounds"): + actual = lightning_lite.utilities.seed.seed_everything(seed) + assert actual == 123 + + +def test_reset_seed_no_op(): + """Test that the reset_seed function is a no-op when seed_everything() was not used.""" + assert "PL_GLOBAL_SEED" not in os.environ + seed_before = torch.initial_seed() + lightning_lite.utilities.seed.reset_seed() + assert torch.initial_seed() == seed_before + assert "PL_GLOBAL_SEED" not in os.environ + + +@pytest.mark.parametrize("workers", (True, False)) +def test_reset_seed_everything(workers): + """Test that we can reset the seed to the initial value set by seed_everything()""" + assert "PL_GLOBAL_SEED" not in os.environ + assert "PL_SEED_WORKERS" not in os.environ + + lightning_lite.utilities.seed.seed_everything(123, workers) + before = torch.rand(1) + assert os.environ["PL_GLOBAL_SEED"] == "123" + assert os.environ["PL_SEED_WORKERS"] == str(int(workers)) + + lightning_lite.utilities.seed.reset_seed() + after = torch.rand(1) + assert os.environ["PL_GLOBAL_SEED"] == "123" + assert os.environ["PL_SEED_WORKERS"] == str(int(workers)) + assert torch.allclose(before, after) + + +def test_backward_compatibility_rng_states_dict(): + """Test that an older rng_states_dict without the "torch.cuda" key does not crash.""" + states = _collect_rng_states() + assert "torch.cuda" in states + states.pop("torch.cuda") + _set_rng_states(states) diff --git a/tests/tests_lite/utilities/test_warnings.py b/tests/tests_lite/utilities/test_warnings.py new file mode 100644 index 0000000000000..e951ff53ea632 --- /dev/null +++ b/tests/tests_lite/utilities/test_warnings.py @@ -0,0 +1,78 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Test that the warnings actually appear and they have the correct `stacklevel` + +Needs to be run outside of `pytest` as it captures all the warnings. 
+""" +from contextlib import redirect_stderr +from io import StringIO + +from lightning_utilities.core.rank_zero import _warn, WarningCache + +from lightning_lite.utilities.rank_zero import rank_zero_deprecation, rank_zero_warn + +if __name__ == "__main__": + stderr = StringIO() + # recording + with redirect_stderr(stderr): + _warn("test1") + _warn("test2", category=DeprecationWarning) + + rank_zero_warn("test3") + rank_zero_warn("test4", category=DeprecationWarning) + + rank_zero_deprecation("test5") + + cache = WarningCache() + cache.warn("test6") + cache.deprecation("test7") + + output = stderr.getvalue() + assert "test_warnings.py:29: UserWarning: test1" in output + assert "test_warnings.py:30: DeprecationWarning: test2" in output + + assert "test_warnings.py:32: UserWarning: test3" in output + assert "test_warnings.py:33: DeprecationWarning: test4" in output + + assert "test_warnings.py:35: LightningDeprecationWarning: test5" in output + + assert "test_warnings.py:38: UserWarning: test6" in output + assert "test_warnings.py:39: LightningDeprecationWarning: test7" in output + + # check that logging is properly configured + import logging + + root_logger = logging.getLogger() + lightning_logger = logging.getLogger("lightning_lite") + # should have a `StreamHandler` + assert lightning_logger.hasHandlers() and len(lightning_logger.handlers) == 1 + # set our own stream for testing + handler = lightning_logger.handlers[0] + assert isinstance(handler, logging.StreamHandler) + stderr = StringIO() + # necessary with `propagate = False` + lightning_logger.handlers[0].stream = stderr + + # necessary with `propagate = True` + with redirect_stderr(stderr): + # Lightning should not configure the root `logging` logger by default + logging.info("test1") + root_logger.info("test1") + # but our logger instance + lightning_logger.info("test2") + # level is set to INFO + lightning_logger.debug("test3") + + output = stderr.getvalue() + assert output == "test2\n", repr(output) diff --git a/tests/tests_lite/utilities/test_xla_device_utils.py b/tests/tests_lite/utilities/test_xla_device_utils.py index d8f6003c6a55a..87c92b772c520 100644 --- a/tests/tests_lite/utilities/test_xla_device_utils.py +++ b/tests/tests_lite/utilities/test_xla_device_utils.py @@ -15,10 +15,10 @@ from unittest.mock import patch import pytest +from tests_lite.helpers.runif import RunIf import lightning_lite.utilities.xla_device as xla_utils -from pytorch_lightning.utilities import _XLA_AVAILABLE -from tests_pytorch.helpers.runif import RunIf +from lightning_lite.utilities.imports import _XLA_AVAILABLE @pytest.mark.skipif(_XLA_AVAILABLE, reason="test requires torch_xla to be absent") @@ -38,7 +38,7 @@ def sleep_fn(sleep_time: float) -> bool: return True -@patch("pytorch_lightning.utilities.xla_device.TPU_CHECK_TIMEOUT", 3) +@patch("lightning_lite.utilities.xla_device.TPU_CHECK_TIMEOUT", 3) @pytest.mark.skipif(not _XLA_AVAILABLE, reason="test requires torch_xla to be present") def test_result_returns_within_timeout_seconds(): """Check that pl_multi_process returns within 3 seconds.""" diff --git a/tests/tests_pytorch/accelerators/test_common.py b/tests/tests_pytorch/accelerators/test_common.py index 8c4ac8f3fd4ae..05fb76f1cc572 100644 --- a/tests/tests_pytorch/accelerators/test_common.py +++ b/tests/tests_pytorch/accelerators/test_common.py @@ -18,7 +18,7 @@ from pytorch_lightning.strategies import DDPStrategy -@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2) 
+@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) def test_auto_device_count(_): assert CPUAccelerator.auto_device_count() == 1 assert CUDAAccelerator.auto_device_count() == 2 diff --git a/tests/tests_pytorch/core/test_metric_result_integration.py b/tests/tests_pytorch/core/test_metric_result_integration.py index 9672bb75b51f1..8eb4abca00e12 100644 --- a/tests/tests_pytorch/core/test_metric_result_integration.py +++ b/tests/tests_pytorch/core/test_metric_result_integration.py @@ -27,6 +27,7 @@ import pytorch_lightning as pl import tests_pytorch.helpers.utils as tutils +from lightning_lite.utilities.warnings import PossibleUserWarning from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.demos.boring_classes import BoringModel @@ -36,7 +37,6 @@ _ResultMetric, _Sync, ) -from pytorch_lightning.utilities.warnings import PossibleUserWarning from tests_pytorch.helpers.runif import RunIf from tests_pytorch.helpers.utils import no_warning_call diff --git a/tests/tests_pytorch/core/test_results.py b/tests/tests_pytorch/core/test_results.py index dc4c2ac065f2d..543437c28f169 100644 --- a/tests/tests_pytorch/core/test_results.py +++ b/tests/tests_pytorch/core/test_results.py @@ -16,8 +16,8 @@ import torch.multiprocessing as mp import tests_pytorch.helpers.utils as tutils +from lightning_lite.utilities.distributed import sync_ddp_if_available from pytorch_lightning.trainer.connectors.logger_connector.result import _Sync -from pytorch_lightning.utilities.distributed import sync_ddp_if_available from tests_pytorch.helpers.runif import RunIf diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-10.py b/tests/tests_pytorch/deprecated_api/test_remove_1-10.py index 2193085255fb9..a48c6a7884083 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-10.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-10.py @@ -17,13 +17,15 @@ import numpy import pytest import torch +from torch.utils.data import DataLoader from pytorch_lightning import Trainer from pytorch_lightning.core.mixins.device_dtype_mixin import DeviceDtypeModuleMixin -from pytorch_lightning.demos.boring_classes import BoringModel +from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset from pytorch_lightning.overrides import LightningDistributedModule, LightningParallelModule from pytorch_lightning.overrides.base import unwrap_lightning_module from pytorch_lightning.overrides.fairscale import LightningShardedDataParallel, unwrap_lightning_module_sharded +from pytorch_lightning.plugins.environments import LightningEnvironment from pytorch_lightning.strategies.bagua import LightningBaguaModule from pytorch_lightning.strategies.deepspeed import LightningDeepSpeedModule from pytorch_lightning.strategies.ipu import LightningIPUModule @@ -38,6 +40,27 @@ TransferableDataType, ) from pytorch_lightning.utilities.cloud_io import atomic_save, get_filesystem, load +from pytorch_lightning.utilities.data import has_iterable_dataset, has_len +from pytorch_lightning.utilities.device_parser import ( + determine_root_gpu_device, + is_cuda_available, + num_cuda_devices, + parse_cpu_cores, + parse_gpu_ids, + parse_tpu_cores, +) +from pytorch_lightning.utilities.distributed import ( + all_gather_ddp_if_available, + distributed_available, + gather_all_tensors, + get_default_process_group_backend_for_device, + init_dist_connection, + sync_ddp, + sync_ddp_if_available, + tpu_distributed, +) +from 
pytorch_lightning.utilities.optimizer import optimizer_to_device, optimizers_to_device +from pytorch_lightning.utilities.seed import pl_worker_init_function, reset_seed, seed_everything from pytorch_lightning.utilities.xla_device import inner_f, pl_multi_process, XLADeviceUtils from tests_pytorch.helpers.runif import RunIf from tests_pytorch.helpers.utils import no_warning_call @@ -112,17 +135,6 @@ def test_v1_10_deprecated_xla_device_utilities(): XLADeviceUtils.tpu_device_exists() -def test_v1_10_deprecated_cloud_io_utilities(tmpdir): - with pytest.deprecated_call(match="cloud_io.atomic_save` has been deprecated in v1.8.0"): - atomic_save({}, tmpdir / "atomic_save.ckpt") - - with pytest.deprecated_call(match="cloud_io.get_filesystem` has been deprecated in v1.8.0"): - get_filesystem(tmpdir) - - with pytest.deprecated_call(match="cloud_io.load` has been deprecated in v1.8.0"): - load(str(tmpdir / "atomic_save.ckpt")) - - def test_v1_10_deprecated_apply_func_utilities(): with pytest.deprecated_call(match="apply_func.apply_to_collection` has been deprecated in v1.8.0"): apply_to_collection([], dtype=object, function=(lambda x: x)) @@ -147,3 +159,94 @@ class MyModule(TransferableDataType): with pytest.deprecated_call(match="apply_func.TransferableDataType` has been deprecated in v1.8.0"): MyModule() + + +def test_v1_10_deprecated_cloud_io_utilities(tmpdir): + with pytest.deprecated_call(match="cloud_io.atomic_save` has been deprecated in v1.8.0"): + atomic_save({}, tmpdir / "atomic_save.ckpt") + + with pytest.deprecated_call(match="cloud_io.get_filesystem` has been deprecated in v1.8.0"): + get_filesystem(tmpdir) + + with pytest.deprecated_call(match="cloud_io.load` has been deprecated in v1.8.0"): + load(str(tmpdir / "atomic_save.ckpt")) + + +def test_v1_10_deprecated_data_utilities(): + with pytest.deprecated_call(match="data.has_iterable_dataset` has been deprecated in v1.8.0"): + has_iterable_dataset(DataLoader(RandomDataset(2, 4))) + + with pytest.deprecated_call(match="data.has_len` has been deprecated in v1.8.0"): + has_len(DataLoader(RandomDataset(2, 4))) + + +def test_v1_10_deprecated_device_parser_utilities(): + with pytest.deprecated_call(match="device_parser.determine_root_gpu_device` has been deprecated in v1.8.0"): + determine_root_gpu_device(None) + + with pytest.deprecated_call(match="device_parser.is_cuda_available` has been deprecated in v1.8.0"): + is_cuda_available() + + with pytest.deprecated_call(match="device_parser.num_cuda_devices` has been deprecated in v1.8.0"): + num_cuda_devices() + + with pytest.deprecated_call(match="device_parser.parse_cpu_cores` has been deprecated in v1.8.0"): + parse_cpu_cores(1) + + with pytest.deprecated_call(match="device_parser.parse_gpu_ids` has been deprecated in v1.8.0"): + parse_gpu_ids(None) + + with pytest.deprecated_call(match="device_parser.parse_tpu_cores` has been deprecated in v1.8.0"): + parse_tpu_cores(None) + + +def test_v1_10_deprecated_distributed_utilities(): + with pytest.deprecated_call(match="distributed.all_gather_ddp_if_available` has been deprecated in v1.8.0"): + all_gather_ddp_if_available(torch.tensor(1)) + + with pytest.deprecated_call(match="distributed.distributed_available` has been deprecated in v1.8.0"): + distributed_available() + + with mock.patch("torch.distributed.get_world_size", return_value=2), mock.patch( + "torch.distributed.barrier" + ), mock.patch("torch.distributed.all_gather"): + with pytest.deprecated_call(match="distributed.gather_all_tensors` has been deprecated in v1.8.0"): + 
gather_all_tensors(torch.tensor(1)) + + with pytest.deprecated_call( + match="distributed.get_default_process_group_backend_for_device` has been deprecated in v1.8.0" + ): + get_default_process_group_backend_for_device(torch.device("cpu")) + + with mock.patch("torch.distributed.is_initialized", return_value=True): + with pytest.deprecated_call(match="distributed.init_dist_connection` has been deprecated in v1.8.0"): + init_dist_connection(LightningEnvironment(), "gloo") + + with pytest.deprecated_call(match="distributed.sync_ddp_if_available` has been deprecated in v1.8.0"): + sync_ddp_if_available(torch.tensor(1)) + + with mock.patch("torch.distributed.barrier"), mock.patch("torch.distributed.all_reduce"): + with pytest.deprecated_call(match="distributed.sync_ddp` has been deprecated in v1.8.0"): + sync_ddp(torch.tensor(1)) + + with pytest.deprecated_call(match="distributed.tpu_distributed` has been deprecated in v1.8.0"): + tpu_distributed() + + +def test_v1_10_deprecated_optimizer_utilities(): + with pytest.deprecated_call(match="optimizer.optimizers_to_device` has been deprecated in v1.8.0"): + optimizers_to_device([torch.optim.Adam(torch.nn.Linear(1, 1).parameters())], "cpu") + + with pytest.deprecated_call(match="optimizer.optimizer_to_device` has been deprecated in v1.8.0"): + optimizer_to_device(torch.optim.Adam(torch.nn.Linear(1, 1).parameters()), "cpu") + + +def test_v1_10_deprecated_seed_utilities(): + with pytest.deprecated_call(match="seed.seed_everything` has been deprecated in v1.8.0"): + seed_everything(1) + + with pytest.deprecated_call(match="seed.reset_seed` has been deprecated in v1.8.0"): + reset_seed() + + with pytest.deprecated_call(match="seed.pl_worker_init_function` has been deprecated in v1.8.0"): + pl_worker_init_function(0) diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py index 489ef38f0c00f..b9e36df94d669 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py @@ -22,6 +22,7 @@ import torch import pytorch_lightning +from lightning_lite.utilities import device_parser from pytorch_lightning import Callback, Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.demos.boring_classes import BoringDataModule, BoringModel @@ -33,7 +34,6 @@ from pytorch_lightning.strategies.ipu import LightningIPUModule from pytorch_lightning.trainer.configuration_validator import _check_datamodule_checkpoint_hooks from pytorch_lightning.trainer.states import RunningStage -from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.rank_zero import rank_zero_only from tests_pytorch.helpers.runif import RunIf @@ -547,7 +547,7 @@ def test_v1_8_0_lightning_module_use_amp(): @mock.patch.dict(os.environ, {"PL_TORCH_DISTRIBUTED_BACKEND": "foo"}) def test_v1_8_0_torch_distributed_backend_env(): - from pytorch_lightning.utilities.distributed import _get_process_group_backend_from_env + from lightning_lite.utilities.distributed import _get_process_group_backend_from_env with pytest.deprecated_call( match="Environment variable `PL_TORCH_DISTRIBUTED_BACKEND`" diff --git a/tests/tests_pytorch/deprecated_api/test_remove_2-0.py b/tests/tests_pytorch/deprecated_api/test_remove_2-0.py index b39c6dafc1696..bd359cc3234f2 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_2-0.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_2-0.py @@ -28,8 +28,8 @@ def 
test_v2_0_0_deprecated_num_processes(): _ = Trainer(num_processes=2) -@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) -@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2) +@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) def test_v2_0_0_deprecated_gpus(*_): with pytest.deprecated_call(match=r"is deprecated in v1.7 and will be removed in v2.0."): _ = Trainer(gpus=0) diff --git a/tests/tests_pytorch/lite/test_lite.py b/tests/tests_pytorch/lite/test_lite.py index d45046f249d54..e7b5c61a67727 100644 --- a/tests/tests_pytorch/lite/test_lite.py +++ b/tests/tests_pytorch/lite/test_lite.py @@ -23,13 +23,13 @@ from torch import nn from torch.utils.data import DataLoader, DistributedSampler, Sampler +from lightning_lite.utilities import _StrategyType +from lightning_lite.utilities.seed import pl_worker_init_function from pytorch_lightning.lite import LightningLite from pytorch_lightning.lite.wrappers import _LiteDataLoader, _LiteModule, _LiteOptimizer from pytorch_lightning.plugins import PrecisionPlugin from pytorch_lightning.strategies import DeepSpeedStrategy, Strategy -from pytorch_lightning.utilities import _StrategyType from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.seed import pl_worker_init_function from tests_pytorch.helpers.runif import RunIf diff --git a/tests/tests_pytorch/models/test_gpu.py b/tests/tests_pytorch/models/test_gpu.py index 5eded60d2084f..1f15f2a5969aa 100644 --- a/tests/tests_pytorch/models/test_gpu.py +++ b/tests/tests_pytorch/models/test_gpu.py @@ -21,11 +21,11 @@ import tests_pytorch.helpers.pipelines as tpipes import tests_pytorch.helpers.utils as tutils +from lightning_lite.utilities import device_parser from pytorch_lightning import Trainer from pytorch_lightning.accelerators import CPUAccelerator, CUDAAccelerator from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests_pytorch.helpers.datamodules import ClassifDataModule from tests_pytorch.helpers.runif import RunIf @@ -181,8 +181,8 @@ def test_parse_gpu_returns_none_when_no_devices_are_available(mocked_device_coun "TORCHELASTIC_RUN_ID": "1", }, ) -@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=1) -@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1) +@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) @pytest.mark.parametrize("gpus", [[0, 1, 2], 2, "0", [0, 2]]) def test_torchelastic_gpu_parsing(mocked_device_count, mocked_is_available, gpus): """Ensure when using torchelastic and nproc_per_node is set to the default of 1 per GPU device That we omit diff --git a/tests/tests_pytorch/models/test_tpu.py b/tests/tests_pytorch/models/test_tpu.py index a41ba7429c0e9..1265f6c90f1f8 100644 --- a/tests/tests_pytorch/models/test_tpu.py +++ b/tests/tests_pytorch/models/test_tpu.py @@ -21,6 +21,7 @@ import tests_pytorch.helpers.pipelines as tpipes import tests_pytorch.helpers.utils as tutils +from 
lightning_lite.utilities.distributed import ReduceOp from pytorch_lightning import Trainer from pytorch_lightning.accelerators import TPUAccelerator from pytorch_lightning.callbacks import EarlyStopping @@ -29,7 +30,6 @@ from pytorch_lightning.strategies.launchers.xla import _save_spawn from pytorch_lightning.trainer.connectors.logger_connector.result import _Sync from pytorch_lightning.utilities import _TPU_AVAILABLE -from pytorch_lightning.utilities.distributed import ReduceOp from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests_pytorch.helpers.runif import RunIf diff --git a/tests/tests_pytorch/overrides/test_distributed.py b/tests/tests_pytorch/overrides/test_distributed.py index 77c3eb40bfb10..bae31e8fc08f9 100644 --- a/tests/tests_pytorch/overrides/test_distributed.py +++ b/tests/tests_pytorch/overrides/test_distributed.py @@ -16,9 +16,9 @@ import pytest from torch.utils.data import BatchSampler, SequentialSampler +from lightning_lite.utilities.data import has_len from pytorch_lightning import seed_everything from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper, UnrepeatedDistributedSampler -from pytorch_lightning.utilities.data import has_len @pytest.mark.parametrize("shuffle", [False, True]) diff --git a/tests/tests_pytorch/plugins/test_amp_plugins.py b/tests/tests_pytorch/plugins/test_amp_plugins.py index 974964e5b9101..a7efe0ec75fdd 100644 --- a/tests/tests_pytorch/plugins/test_amp_plugins.py +++ b/tests/tests_pytorch/plugins/test_amp_plugins.py @@ -51,8 +51,8 @@ class MyApexPlugin(ApexMixedPrecisionPlugin): "SLURM_LOCALID": "0", }, ) -@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) -@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2) +@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) @pytest.mark.parametrize("strategy,devices", [("ddp", 2), ("ddp_spawn", 2)]) @pytest.mark.parametrize( "amp,custom_plugin,plugin_cls", @@ -278,16 +278,16 @@ def test_precision_selection_raises(monkeypatch): with pytest.raises(MisconfigurationException, match=r"amp_type='apex', precision='bf16'\)` but it's not supported"): Trainer(amp_backend="apex", precision="bf16") - with mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=1), pytest.raises( + with mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1), pytest.raises( MisconfigurationException, match="Sharded plugins are not supported with apex" ): - with mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True): + with mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True): Trainer(amp_backend="apex", precision=16, accelerator="gpu", devices=1, strategy="ddp_fully_sharded") import pytorch_lightning.plugins.precision.apex_amp as apex monkeypatch.setattr(apex, "_APEX_AVAILABLE", False) - with mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=1), mock.patch( - "pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True + with mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1), mock.patch( + "lightning_lite.utilities.device_parser.is_cuda_available", return_value=True ), pytest.raises(MisconfigurationException, match="asked for Apex AMP but `apex` is not 
installed"): Trainer(amp_backend="apex", precision=16, accelerator="gpu", devices=1) diff --git a/tests/tests_pytorch/plugins/test_checkpoint_io_plugin.py b/tests/tests_pytorch/plugins/test_checkpoint_io_plugin.py index ae618ffa333dc..21a94d33bbb9d 100644 --- a/tests/tests_pytorch/plugins/test_checkpoint_io_plugin.py +++ b/tests/tests_pytorch/plugins/test_checkpoint_io_plugin.py @@ -18,6 +18,7 @@ import torch +from lightning_lite.utilities.types import _PATH from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.demos.boring_classes import BoringModel @@ -25,7 +26,6 @@ from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO from pytorch_lightning.strategies import SingleDeviceStrategy -from pytorch_lightning.utilities.types import _PATH class CustomCheckpointIO(CheckpointIO): diff --git a/tests/tests_pytorch/plugins/test_cluster_integration.py b/tests/tests_pytorch/plugins/test_cluster_integration.py index b9f39336d11f7..be8f87d643f9c 100644 --- a/tests/tests_pytorch/plugins/test_cluster_integration.py +++ b/tests/tests_pytorch/plugins/test_cluster_integration.py @@ -85,8 +85,8 @@ def test_ranks_available_manual_strategy_selection(mock_gpu_acc_available, strat dict(strategy="ddp_spawn", accelerator="gpu", devices=[1, 2]), ], ) -@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) -@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=4) +@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=4) def test_ranks_available_automatic_strategy_selection(mock0, mock1, trainer_kwargs): """Test that the rank information is readily available after Trainer initialization.""" num_nodes = 2 diff --git a/tests/tests_pytorch/strategies/test_bagua_strategy.py b/tests/tests_pytorch/strategies/test_bagua_strategy.py index 79ec701964f8f..3e9aba79dd5ea 100644 --- a/tests/tests_pytorch/strategies/test_bagua_strategy.py +++ b/tests/tests_pytorch/strategies/test_bagua_strategy.py @@ -118,6 +118,6 @@ def test_bagua_not_available(monkeypatch): import pytorch_lightning.strategies.bagua as imports monkeypatch.setattr(imports, "_BAGUA_AVAILABLE", False) - with mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=1): + with mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1): with pytest.raises(MisconfigurationException, match="you must have `Bagua` installed"): Trainer(strategy="bagua", accelerator="gpu", devices=1) diff --git a/tests/tests_pytorch/strategies/test_common.py b/tests/tests_pytorch/strategies/test_common.py index 479b222e25a9d..d696ce81184b1 100644 --- a/tests/tests_pytorch/strategies/test_common.py +++ b/tests/tests_pytorch/strategies/test_common.py @@ -15,11 +15,11 @@ import torch import tests_pytorch.helpers.utils as tutils +from lightning_lite.utilities.seed import seed_everything from pytorch_lightning import Trainer from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.strategies import DDPStrategy from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12 -from pytorch_lightning.utilities.seed import seed_everything from tests_pytorch.helpers.datamodules import ClassifDataModule from tests_pytorch.helpers.runif import RunIf from 
tests_pytorch.strategies.test_dp import CustomClassificationModelDP diff --git a/tests/tests_pytorch/strategies/test_ddp.py b/tests/tests_pytorch/strategies/test_ddp.py index dbde198b6eb6e..19317bfe300a6 100644 --- a/tests/tests_pytorch/strategies/test_ddp.py +++ b/tests/tests_pytorch/strategies/test_ddp.py @@ -60,12 +60,12 @@ def test_multi_gpu_model_ddp_fit_test(tmpdir): @RunIf(skip_windows=True) @pytest.mark.skipif(torch.cuda.is_available(), reason="test doesn't require a GPU machine") -@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) def test_torch_distributed_backend_env_variables(tmpdir): """This test sets `undefined` as the torch backend and should raise a `Backend.UNDEFINED` ValueError.""" _environ = {"PL_TORCH_DISTRIBUTED_BACKEND": "undefined", "CUDA_VISIBLE_DEVICES": "0,1", "WORLD_SIZE": "2"} with patch.dict(os.environ, _environ), patch( - "pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2 + "lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2 ): with pytest.deprecated_call(match="Environment variable `PL_TORCH_DISTRIBUTED_BACKEND` was deprecated in v1.6"): with pytest.raises(ValueError, match="Invalid backend: 'undefined'"): @@ -83,8 +83,8 @@ def test_torch_distributed_backend_env_variables(tmpdir): @RunIf(skip_windows=True) @mock.patch("torch.cuda.set_device") -@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) -@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=1) +@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1) @mock.patch("pytorch_lightning.accelerators.gpu.CUDAAccelerator.is_available", return_value=True) @mock.patch.dict(os.environ, {"PL_TORCH_DISTRIBUTED_BACKEND": "gloo"}, clear=True) def test_ddp_torch_dist_is_available_in_setup( diff --git a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py index 88a07a78efecf..bb3b63ea578c6 100644 --- a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py +++ b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py @@ -29,8 +29,8 @@ def test_invalid_on_cpu(tmpdir): @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"}) -@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=1) -@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1) +@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) @RunIf(fairscale=True) def test_fsdp_with_sharded_amp(device_count_mock, mock_cuda_available, tmpdir): """Test to ensure that the native amp plugin is correctly chosen when using sharded.""" diff --git a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py index 857abaa8dfbb4..70af274e2f788 100644 --- a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py +++ b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py @@ -169,7 +169,7 @@ def test_deepspeed_strategy_env(tmpdir, monkeypatch, deepspeed_config): @RunIf(deepspeed=True)
-@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=1) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1) @pytest.mark.parametrize( "amp_backend", ["native", pytest.param("apex", marks=RunIf(amp_apex=True))], diff --git a/tests/tests_pytorch/test_cli.py b/tests/tests_pytorch/test_cli.py index e37f799888f7e..46fc7e9b6217f 100644 --- a/tests/tests_pytorch/test_cli.py +++ b/tests/tests_pytorch/test_cli.py @@ -197,8 +197,8 @@ def test_parse_args_parsing_complex_types(cli_args, expected, instantiate): ) def test_parse_args_parsing_gpus(monkeypatch, cli_args, expected_gpu): """Test parsing of gpus and instantiation of Trainer.""" - monkeypatch.setattr("pytorch_lightning.utilities.device_parser.num_cuda_devices", lambda: 2) - monkeypatch.setattr("pytorch_lightning.utilities.device_parser.is_cuda_available", lambda: True) + monkeypatch.setattr("lightning_lite.utilities.device_parser.num_cuda_devices", lambda: 2) + monkeypatch.setattr("lightning_lite.utilities.device_parser.is_cuda_available", lambda: True) cli_args = cli_args.split(" ") if cli_args else [] with mock.patch("sys.argv", ["any.py"] + cli_args): parser = LightningArgumentParser(add_help=False, parse_as_dict=False) diff --git a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py index 434607cab78a2..6625f191c3190 100644 --- a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py @@ -98,7 +98,7 @@ def _test_strategy_choice_ddp_and_cpu(tmpdir, ddp_strategy_class): "SLURM_LOCALID": "0", }, ) -@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=0) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=0) def test_custom_cluster_environment_in_slurm_environment(_, tmpdir): """Test that we choose the custom cluster even when SLURM or TE flags are around.""" @@ -135,7 +135,7 @@ def creates_processes_externally(self) -> bool: "SLURM_LOCALID": "0", }, ) -@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=0) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=0) @mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True) def test_custom_accelerator(device_count_mock, setup_distributed_mock): class Accel(Accelerator): @@ -194,7 +194,7 @@ class Strat(DDPStrategy): "SLURM_LOCALID": "0", }, ) -@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=0) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=0) @mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True) def test_dist_backend_accelerator_mapping(*_): trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", accelerator="cpu", devices=2) @@ -203,7 +203,7 @@ def test_dist_backend_accelerator_mapping(*_): assert trainer.strategy.local_rank == 0 -@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) def test_ipython_incompatible_backend_error(_, monkeypatch): monkeypatch.setattr(pytorch_lightning.utilities, "_IS_INTERACTIVE", True) with pytest.raises(MisconfigurationException, match=r"strategy='ddp'\)`.*is not compatible"): @@ -220,7 +220,7 @@ def 
test_ipython_incompatible_backend_error(_, monkeypatch): Trainer(strategy="dp") -@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) def test_ipython_compatible_dp_strategy_gpu(_, monkeypatch): monkeypatch.setattr(pytorch_lightning.utilities, "_IS_INTERACTIVE", True) trainer = Trainer(strategy="dp", accelerator="gpu") @@ -253,8 +253,8 @@ def test_ipython_compatible_strategy_ddp_fork(monkeypatch): ], ) @pytest.mark.parametrize("devices", [1, 2]) -@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) -@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2) +@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) def test_accelerator_choice_multi_node_gpu( mock_is_available, mock_device_count, tmpdir, strategy, strategy_class, devices ): @@ -284,8 +284,8 @@ def test_accelerator_cpu(_): Trainer(accelerator="cpu", gpus=1) -@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2) -@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) +@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) @pytest.mark.parametrize("device_count", (["0"], [0, "1"], ["GPU"], [["0", "1"], [0, 1]], [False])) def test_accelererator_invalid_type_devices(mock_is_available, mock_device_count, device_count): with pytest.raises( @@ -449,8 +449,8 @@ def test_strategy_choice_ddp_fork_cpu(): @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}) -@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2) -@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) +@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) def test_strategy_choice_ddp(*_): trainer = Trainer(fast_dev_run=True, strategy="ddp", accelerator="gpu", devices=1) assert isinstance(trainer.accelerator, CUDAAccelerator) @@ -459,8 +459,8 @@ def test_strategy_choice_ddp(*_): @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}) -@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2) -@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) +@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) def test_strategy_choice_ddp_spawn(cuda_available_mock, device_count_mock): trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", accelerator="gpu", devices=1) assert isinstance(trainer.accelerator, CUDAAccelerator) @@ -505,10 +505,10 @@ def test_strategy_choice_ddp_slurm(setup_distributed_mock, strategy): }, ) @mock.patch("torch.cuda.set_device") -@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2) -@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) 
+@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) @mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True) -@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) def test_strategy_choice_ddp_te(*_): trainer = Trainer(fast_dev_run=True, strategy="ddp", accelerator="gpu", devices=2) assert isinstance(trainer.accelerator, CUDAAccelerator) @@ -529,7 +529,7 @@ def test_strategy_choice_ddp_te(*_): "TORCHELASTIC_RUN_ID": "1", }, ) -@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=0) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=0) @mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True) def test_strategy_choice_ddp_cpu_te(*_): trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", accelerator="cpu", devices=2) @@ -552,8 +552,8 @@ def test_strategy_choice_ddp_cpu_te(*_): }, ) @mock.patch("torch.cuda.set_device") -@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=1) -@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1) +@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) @mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True) def test_strategy_choice_ddp_kubeflow(*_): trainer = Trainer(fast_dev_run=True, strategy="ddp", accelerator="gpu", devices=1) @@ -574,7 +574,7 @@ def test_strategy_choice_ddp_kubeflow(*_): "RANK": "1", }, ) -@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=0) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=0) @mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True) def test_strategy_choice_ddp_cpu_kubeflow(*_): trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", accelerator="cpu", devices=2) @@ -596,7 +596,7 @@ def test_strategy_choice_ddp_cpu_kubeflow(*_): "SLURM_LOCALID": "0", }, ) -@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=0) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=0) @mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True) @pytest.mark.parametrize("strategy", ["ddp", DDPStrategy()]) def test_strategy_choice_ddp_cpu_slurm(device_count_mock, setup_distributed_mock, strategy): @@ -646,7 +646,7 @@ def test_unsupported_ipu_choice(mock_ipu_acc_avail, monkeypatch): Trainer(accelerator="ipu", precision=64) -@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=0) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=0) @mock.patch("pytorch_lightning.utilities.imports._TPU_AVAILABLE", return_value=False) @mock.patch("pytorch_lightning.utilities.imports._IPU_AVAILABLE", return_value=False) @mock.patch("pytorch_lightning.utilities.imports._HPU_AVAILABLE", return_value=False) @@ -655,8 +655,8 @@ def test_devices_auto_choice_cpu(*_): assert trainer.num_devices == 1 -@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) -@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", 
return_value=2) +@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) @RunIf(mps=False) def test_devices_auto_choice_gpu(is_gpu_available_mock, device_count_mock): trainer = Trainer(accelerator="auto", devices="auto") @@ -769,7 +769,7 @@ def test_gpu_accelerator_backend_choice(expected_accelerator_flag, expected_acce assert isinstance(trainer.accelerator, expected_accelerator_class) -@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=1) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1) def test_gpu_accelerator_backend_choice_cuda(_): trainer = Trainer(accelerator="gpu") @@ -777,6 +777,8 @@ def test_gpu_accelerator_backend_choice_cuda(_): assert isinstance(trainer.accelerator, CUDAAccelerator) +# TODO(lite): remove skip once MPS utils have moved +@pytest.mark.skip(reason="Utils in Lite rely on MPS accelerator file, but refactor is not yet finished") @mock.patch("pytorch_lightning.accelerators.mps._MPS_AVAILABLE", return_value=True) @mock.patch("torch.device", return_value="mps") # necessary because torch doesn't allow creation of mps devices def test_gpu_accelerator_backend_choice_mps(*_): diff --git a/tests/tests_pytorch/trainer/connectors/test_data_connector.py b/tests/tests_pytorch/trainer/connectors/test_data_connector.py index 379a3248a1535..703ce8f053590 100644 --- a/tests/tests_pytorch/trainer/connectors/test_data_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_data_connector.py @@ -21,6 +21,7 @@ from torch import Tensor from torch.utils.data import BatchSampler, DataLoader, DistributedSampler, Sampler, SequentialSampler +from lightning_lite.utilities.warnings import PossibleUserWarning from pytorch_lightning import Trainer from pytorch_lightning.demos.boring_classes import BoringDataModule, BoringModel, RandomDataset from pytorch_lightning.overrides.distributed import DistributedSamplerWrapper @@ -30,7 +31,6 @@ from pytorch_lightning.trainer.supporters import CombinedLoader from pytorch_lightning.utilities.data import _update_dataloader from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.warnings import PossibleUserWarning from tests_pytorch.helpers.runif import RunIf from tests_pytorch.helpers.utils import no_warning_call diff --git a/tests/tests_pytorch/trainer/flags/test_env_vars.py b/tests/tests_pytorch/trainer/flags/test_env_vars.py index cfac06c8d7711..a6415d5e907d2 100644 --- a/tests/tests_pytorch/trainer/flags/test_env_vars.py +++ b/tests/tests_pytorch/trainer/flags/test_env_vars.py @@ -49,8 +49,8 @@ def test_passing_env_variables_defaults(): @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1", "PL_TRAINER_DEVICES": "2"}) -@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2) -@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) +@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) def test_passing_env_variables_devices(cuda_available_mock, device_count_mock): """Testing overwriting trainer arguments.""" trainer = Trainer() diff --git a/tests/tests_pytorch/trainer/flags/test_min_max_epochs.py b/tests/tests_pytorch/trainer/flags/test_min_max_epochs.py index 25f2dfdab279f..ed3c9952b8a98 100644 --- 
a/tests/tests_pytorch/trainer/flags/test_min_max_epochs.py +++ b/tests/tests_pytorch/trainer/flags/test_min_max_epochs.py @@ -1,8 +1,8 @@ import pytest +from lightning_lite.utilities.warnings import PossibleUserWarning from pytorch_lightning import Trainer from pytorch_lightning.demos.boring_classes import BoringModel -from pytorch_lightning.utilities.warnings import PossibleUserWarning from tests_pytorch.helpers.utils import no_warning_call diff --git a/tests/tests_pytorch/trainer/properties/test_auto_gpu_select.py b/tests/tests_pytorch/trainer/properties/test_auto_gpu_select.py index aa9f15bc43c18..05ee9d2ab3170 100644 --- a/tests/tests_pytorch/trainer/properties/test_auto_gpu_select.py +++ b/tests/tests_pytorch/trainer/properties/test_auto_gpu_select.py @@ -42,13 +42,13 @@ def test_pick_multiple_gpus(nb, expected_gpu_idxs, expected_error): assert expected_gpu_idxs == pick_multiple_gpus(nb) -@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=1) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1) def test_pick_multiple_gpus_more_than_available(*_): with pytest.raises(MisconfigurationException, match="You requested 3 GPUs but your machine only has 1 GPUs"): pick_multiple_gpus(3) -@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) @mock.patch("pytorch_lightning.trainer.connectors.accelerator_connector.pick_multiple_gpus", return_value=[1]) def test_auto_select_gpus(*_): diff --git a/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py b/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py index 72c07ec0790c2..0cd31008ea8ee 100644 --- a/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py +++ b/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py @@ -20,11 +20,11 @@ import torch from torch.utils.data import DataLoader +from lightning_lite.utilities import device_parser from pytorch_lightning import Trainer from pytorch_lightning.callbacks.gradient_accumulation_scheduler import GradientAccumulationScheduler from pytorch_lightning.demos.boring_classes import BoringModel, RandomIterableDataset from pytorch_lightning.strategies.ipu import IPUStrategy -from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests_pytorch.helpers.runif import RunIf diff --git a/tests/tests_pytorch/trainer/test_config_validator.py b/tests/tests_pytorch/trainer/test_config_validator.py index f6508c181ebbb..7cc742eea845f 100644 --- a/tests/tests_pytorch/trainer/test_config_validator.py +++ b/tests/tests_pytorch/trainer/test_config_validator.py @@ -15,11 +15,11 @@ import torch import pytorch_lightning as pl +from lightning_lite.utilities import device_parser +from lightning_lite.utilities.warnings import PossibleUserWarning from pytorch_lightning import LightningDataModule, LightningModule, Trainer from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset -from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.warnings import PossibleUserWarning def test_wrong_train_setting(tmpdir): diff --git a/tests/tests_pytorch/trainer/test_dataloaders.py b/tests/tests_pytorch/trainer/test_dataloaders.py index 317a35af3d1ab..08e81e5915351 100644 
--- a/tests/tests_pytorch/trainer/test_dataloaders.py +++ b/tests/tests_pytorch/trainer/test_dataloaders.py @@ -23,6 +23,7 @@ from torch.utils.data.distributed import DistributedSampler from torch.utils.data.sampler import SequentialSampler +from lightning_lite.utilities.data import _auto_add_worker_init_fn, has_iterable_dataset from pytorch_lightning import Callback, seed_everything, Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.demos.boring_classes import ( @@ -32,7 +33,7 @@ RandomIterableDatasetWithLen, ) from pytorch_lightning.trainer.states import RunningStage -from pytorch_lightning.utilities.data import _auto_add_worker_init_fn, has_iterable_dataset, has_len_all_ranks +from pytorch_lightning.utilities.data import has_len_all_ranks from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests_pytorch.helpers.dataloaders import CustomInfDataloader, CustomNotImplementedErrorDataloader from tests_pytorch.helpers.runif import RunIf diff --git a/tests/tests_pytorch/trainer/test_supporters.py b/tests/tests_pytorch/trainer/test_supporters.py index fec8466748ab1..d9beabda43dd9 100644 --- a/tests/tests_pytorch/trainer/test_supporters.py +++ b/tests/tests_pytorch/trainer/test_supporters.py @@ -314,8 +314,8 @@ def test_nested_calc_num_data(input_data, compute_func, expected_length): @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}) -@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2) -@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) +@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) @pytest.mark.parametrize("use_fault_tolerant", [False, True]) @pytest.mark.parametrize("replace_sampler_ddp", [False, True]) def test_combined_data_loader_validation_test( diff --git a/tests/tests_pytorch/trainer/test_trainer.py b/tests/tests_pytorch/trainer/test_trainer.py index d1b1ef6cf9e68..da6aedebbedf6 100644 --- a/tests/tests_pytorch/trainer/test_trainer.py +++ b/tests/tests_pytorch/trainer/test_trainer.py @@ -35,7 +35,9 @@ import pytorch_lightning import tests_pytorch.helpers.utils as tutils +from lightning_lite.utilities import device_parser from lightning_lite.utilities.cloud_io import load as pl_load +from lightning_lite.utilities.seed import seed_everything from pytorch_lightning import Callback, LightningDataModule, LightningModule, Trainer from pytorch_lightning.accelerators import CPUAccelerator, CUDAAccelerator from pytorch_lightning.callbacks import EarlyStopping, GradientAccumulationScheduler, ModelCheckpoint, Timer @@ -60,10 +62,8 @@ SingleDeviceStrategy, ) from pytorch_lightning.trainer.states import RunningStage, TrainerFn -from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.exceptions import DeadlockDetectedException, MisconfigurationException from pytorch_lightning.utilities.imports import _OMEGACONF_AVAILABLE, _TORCH_GREATER_EQUAL_1_12 -from pytorch_lightning.utilities.seed import seed_everything from tests_pytorch.helpers.datamodules import ClassifDataModule from tests_pytorch.helpers.runif import RunIf from tests_pytorch.helpers.simple_models import ClassificationModel @@ -1258,8 +1258,8 @@ def __init__(self, **kwargs): "trainer_params", [{"max_epochs": 1, "accelerator": "gpu", "devices": 1}, {"max_epochs": 1, "accelerator": "gpu", "devices": [0]}], ) 
-@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) -@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=1) +@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1) def test_trainer_omegaconf(_, __, trainer_params): config = OmegaConf.create(trainer_params) Trainer(**config) diff --git a/tests/tests_pytorch/trainer/test_trainer_cli.py b/tests/tests_pytorch/trainer/test_trainer_cli.py index 468650e234f81..6613f0b1bcf38 100644 --- a/tests/tests_pytorch/trainer/test_trainer_cli.py +++ b/tests/tests_pytorch/trainer/test_trainer_cli.py @@ -19,8 +19,9 @@ import pytest import tests_pytorch.helpers.utils as tutils +from lightning_lite.utilities import device_parser from pytorch_lightning import Trainer -from pytorch_lightning.utilities import argparse, device_parser +from pytorch_lightning.utilities import argparse @mock.patch("argparse.ArgumentParser.parse_args") diff --git a/tests/tests_pytorch/utilities/test_all_gather_grad.py b/tests/tests_pytorch/utilities/test_all_gather_grad.py index 49d86aca9cc92..7e00bc74a5155 100644 --- a/tests/tests_pytorch/utilities/test_all_gather_grad.py +++ b/tests/tests_pytorch/utilities/test_all_gather_grad.py @@ -17,9 +17,10 @@ import numpy as np import torch -from pytorch_lightning import seed_everything, Trainer +from lightning_lite.utilities import AllGatherGrad +from lightning_lite.utilities.seed import seed_everything +from pytorch_lightning import Trainer from pytorch_lightning.demos.boring_classes import BoringModel -from pytorch_lightning.utilities import AllGatherGrad from tests_pytorch.helpers.runif import RunIf diff --git a/tests/tests_pytorch/utilities/test_auto_restart.py b/tests/tests_pytorch/utilities/test_auto_restart.py index a3bf115313bd0..b399ba8b35539 100644 --- a/tests/tests_pytorch/utilities/test_auto_restart.py +++ b/tests/tests_pytorch/utilities/test_auto_restart.py @@ -35,7 +35,8 @@ from torch.utils.data.dataset import Dataset, IterableDataset import tests_pytorch.helpers.utils as tutils -from pytorch_lightning import Callback, LightningModule, seed_everything, Trainer +from lightning_lite.utilities.seed import seed_everything +from pytorch_lightning import Callback, LightningModule, Trainer from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset from pytorch_lightning.trainer.states import RunningStage, TrainerState from pytorch_lightning.trainer.supporters import CombinedLoader diff --git a/tests/tests_pytorch/utilities/test_data.py b/tests/tests_pytorch/utilities/test_data.py index 9e3d04ae65560..28743324c2101 100644 --- a/tests/tests_pytorch/utilities/test_data.py +++ b/tests/tests_pytorch/utilities/test_data.py @@ -1,4 +1,3 @@ -import random from dataclasses import dataclass import pytest @@ -6,6 +5,7 @@ from torch import Tensor from torch.utils.data import BatchSampler, DataLoader, RandomSampler, SequentialSampler +from lightning_lite.utilities.data import _replace_dunder_methods from pytorch_lightning import Trainer from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset, RandomIterableDataset from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper @@ -13,14 +13,9 @@ from pytorch_lightning.utilities.data import ( _dataloader_init_kwargs_resolve_sampler, _get_dataloader_init_args_and_kwargs, - _replace_dunder_methods, - _replace_value_in_saved_args, _update_dataloader, - 
_WrapAttrTag, extract_batch_size, get_len, - has_iterable_dataset, - has_len, has_len_all_ranks, warning_cache, ) @@ -96,28 +91,6 @@ def __init__(self): _check_error_raised(data) - - -def test_has_iterable_dataset(): - assert has_iterable_dataset(DataLoader(RandomIterableDataset(1, 1))) - - assert not has_iterable_dataset(DataLoader(RandomDataset(1, 1))) - - class MockDatasetWithoutIterableDataset(RandomDataset): - def __iter__(self): - yield 1 - return self - - assert not has_iterable_dataset(DataLoader(MockDatasetWithoutIterableDataset(1, 1))) - - -def test_has_len(): - assert has_len(DataLoader(RandomDataset(1, 1))) - - with pytest.warns(UserWarning, match="`DataLoader` returned 0 length."): - assert has_len(DataLoader(RandomDataset(0, 0))) - - assert not has_len(DataLoader(RandomIterableDataset(1, 1))) - - def test_get_len(): assert get_len(DataLoader(RandomDataset(1, 1))) == 1 @@ -174,297 +147,6 @@ def __init__(self, randomize, *args, **kwargs): assert isinstance(new_dataloader, GoodImpl) -def test_replace_dunder_methods_multiple_loaders_without_init(): - """In case of a class that inherits from a class that we are patching, but doesn't define its own `__init__` - method (the one we are wrapping), it can happen that `hasattr(cls, "__old__init__")` is True because of parent - class, but it is impossible to delete, because that method is owned by parent class. Furthermore, the error - occurred only sometimes because it depends on the order in which we are iterating over a set of classes we are - patching. - - This test simulates the behavior by generating a sufficient number of dummy classes, which do not define `__init__` - and are children of `DataLoader`. We are testing that a) context manager `_replace_dunder_methods` exits cleanly, and - b) the mechanism checking for presence of `__old__init__` works as expected.
- """ - classes = [DataLoader] - for i in range(100): - classes.append(type(f"DataLoader_{i}", (random.choice(classes),), {})) - - before = {cls: cls.__init__ for cls in classes} - - with _replace_dunder_methods(DataLoader, "dataset"): - for cls in classes[1:]: # First one is `DataLoader` - assert "__old__init__" not in cls.__dict__ - assert hasattr(cls, "__old__init__") - - assert "__old__init__" in DataLoader.__dict__ - assert hasattr(DataLoader, "__old__init__") - - for cls in classes: - assert before[cls] == cls.__init__ - - -class DataLoaderSubclass1(DataLoader): - def __init__(self, attribute1, *args, **kwargs): - self.at1 = attribute1 - super().__init__(*args, **kwargs) - - -class DataLoaderSubclass2(DataLoaderSubclass1): - def __init__(self, attribute2, *args, **kwargs): - self.at2 = attribute2 - super().__init__(attribute2 + "-2", *args, **kwargs) - - -class MyBaseDataLoader(DataLoader): - pass - - -class MyDataLoader(MyBaseDataLoader): - def __init__(self, data: torch.Tensor, *args, **kwargs): - self.data = data - super().__init__(range(data.size(0)), *args, **kwargs) - - -test3_data = torch.randn((10, 20)) - - -class PoptorchDataLoader(DataLoader): - def __init__(self, options, *args, **kwargs): - super().__init__(*args, **kwargs) - self._options = options - - @property - def options(self): - return self._options - - -class IncompleteDataLoader(DataLoader): - def __init__(self, dataset, batch_size, **kwargs): - batch_size = max(batch_size - 5, 0) - super().__init__(dataset, batch_size=batch_size, **kwargs) - - -class WeirdDataLoader1(DataLoader): - def __init__(self, arg1, arg2, **kwargs): - self.arg1 = arg1 - super().__init__(arg2, **kwargs) - - -class WeirdDataLoader2(DataLoader): - def __init__(self, data_part1, data_part2, **kwargs): - data = list(data_part1) + list(data_part2) - super().__init__(data, **kwargs) - - -class NoneDataLoader(DataLoader): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - -class ChangingDataLoader(DataLoader): - def __init__(self, dataset, **kwargs): - super().__init__(list(dataset) + list(range(5, 10)), **kwargs) - - -@pytest.mark.parametrize( - ["cls", "args", "kwargs", "arg_names", "dataset", "checked_values"], - [ - pytest.param( - DataLoaderSubclass1, - ("attribute1",), - dict(dataset=range(4), batch_size=2), - ("attribute1",), - range(4), - dict(batch_size=2, at1="attribute1"), - id="test1", - ), - pytest.param( - DataLoaderSubclass2, - ("attribute2",), - dict(dataset=range(4), batch_size=2), - ("attribute2",), - range(4), - dict(batch_size=2, at1="attribute2-2", at2="attribute2"), - id="test2", - ), - pytest.param( - MyDataLoader, - (test3_data,), - dict(batch_size=2), - ("data",), - range(10), - dict(batch_size=2, data=test3_data), - id="test3", - ), - pytest.param(PoptorchDataLoader, (123, [1]), dict(), ("options",), [1], dict(options=123), id="test4"), - pytest.param( - IncompleteDataLoader, - (range(10),), - dict(batch_size=10), - ("dataset",), - range(10), - dict(batch_size=5), - id="test5", - ), - pytest.param( - WeirdDataLoader1, - (10, range(10)), - dict(batch_size=10), - ("arg1", "arg2"), - range(10), - dict(arg1=10, batch_size=10), - id="test6", - ), - pytest.param( - WeirdDataLoader2, - (range(10), range(10, 20)), - dict(batch_size=10), - ("data_part1", "data_part2"), - list(range(20)), - dict(batch_size=10), - id="test7", - ), - pytest.param(NoneDataLoader, (None,), dict(), (), None, dict(), id="test8"), - pytest.param(ChangingDataLoader, (range(5),), dict(), ("dataset",), list(range(10)), dict(), 
id="test9"), - ], -) -def test_replace_dunder_methods_dataloader(cls, args, kwargs, arg_names, dataset, checked_values): - with _replace_dunder_methods(DataLoader, "dataset"): - dataloader = cls(*args, **kwargs) - - assert dataloader.__pl_saved_args == args - assert dataloader.__pl_saved_kwargs == kwargs - assert dataloader.__pl_saved_arg_names == arg_names - assert dataloader.__pl_saved_default_kwargs == {} - assert dataloader.__dataset == dataset - - assert dataloader.dataset == dataset - - for key, value in checked_values.items(): - dataloader_value = getattr(dataloader, key) - if isinstance(dataloader_value, torch.Tensor): - assert dataloader_value is value - else: - assert dataloader_value == value - - dataloader = _update_dataloader(dataloader, dataloader.sampler) - - assert isinstance(dataloader, cls) - assert not hasattr(dataloader, "__pl_saved_kwargs") - assert not hasattr(dataloader, "__pl_saved_arg_names") - assert not hasattr(dataloader, "__pl_saved_args") - assert not hasattr(dataloader, "__pl_saved_default_kwargs") - assert not hasattr(dataloader, "__dataset") - - assert dataloader.dataset == dataset - - for key, value in checked_values.items(): - dataloader_value = getattr(dataloader, key) - if isinstance(dataloader_value, torch.Tensor): - assert dataloader_value is value - else: - assert dataloader_value == value - - -def test_replace_dunder_methods_extra_kwargs(): - class LoaderSubclass(DataLoader): - def __init__(self, dataset, *args, batch_size=10, **kwargs): - super().__init__(dataset, *args, batch_size=batch_size, **kwargs) - - with _replace_dunder_methods(DataLoader, "dataset"): - dataloader = LoaderSubclass(range(10)) - - assert dataloader.__pl_saved_args == (range(10),) - assert dataloader.__pl_saved_kwargs == {} - assert dataloader.__pl_saved_arg_names == ("dataset",) - assert dataloader.__pl_saved_default_kwargs == {"batch_size": 10} - assert dataloader.__dataset == range(10) - - -def test_replace_dunder_methods_attrs(): - """This test checks, that all the calls from setting and deleting attributes within `_replace_dunder_methods` - are correctly preserved even after reinstantiation. 
- - It also includes a custom `__setattr__` - """ - - class Loader(DataLoader): - def __setattr__(self, attr, val): - if attr == "custom_arg": - val = val + 2 - super().__setattr__(attr, val) - - with _replace_dunder_methods(DataLoader, "dataset"): - dataloader = Loader(range(10)) - dataloader.custom_arg = 5 - dataloader.my_arg = 10 - dataloader.another_arg = 100 - del dataloader.dataset - try: - del dataloader.abc_arg - except AttributeError: - pass - - assert dataloader.__pl_saved_args == (range(10),) - assert dataloader.__pl_saved_kwargs == {} - assert dataloader.__pl_saved_arg_names == ("dataset",) - assert dataloader.__dataset == range(10) - assert dataloader.custom_arg == 7 - assert dataloader.my_arg == 10 - assert dataloader.another_arg == 100 - assert not hasattr(dataloader, "dataset") - assert dataloader.__pl_attrs_record == [ - (("custom_arg", 5), _WrapAttrTag.SET), - (("my_arg", 10), _WrapAttrTag.SET), - (("another_arg", 100), _WrapAttrTag.SET), - (("dataset",), _WrapAttrTag.DEL), - ] - - dataloader = _update_dataloader(dataloader, dataloader.sampler) - assert dataloader.custom_arg == 7 - assert dataloader.my_arg == 10 - assert dataloader.another_arg == 100 - assert not hasattr(dataloader, "dataset") - - -def test_replace_dunder_methods_restore_methods(): - """This tests checks whether are all dunder methods restored to their original versions.""" - - class Init(DataLoader): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - class SetAttr(DataLoader): - def __setattr__(self, *args): - return super().__setattr__(*args) - - class DelAttr(DataLoader): - def __delattr__(self, *args): - return super().__delattr__(*args) - - class InitAndSetAttr(Init, SetAttr): - pass - - class InitAndDelAttr(Init, DelAttr): - pass - - class SetAttrAndDelAttr(SetAttr, DelAttr): - pass - - class AllDunder(Init, SetAttr, DelAttr): - pass - - before = dict() - for cls in (Init, SetAttr, DelAttr, InitAndSetAttr, InitAndDelAttr, SetAttrAndDelAttr, AllDunder): - before[cls] = {"init": cls.__init__, "setattr": cls.__setattr__, "delattr": cls.__delattr__} - - with _replace_dunder_methods(DataLoader, "dataset"): - pass - - for cls in (Init, SetAttr, DelAttr, InitAndSetAttr, InitAndDelAttr, SetAttrAndDelAttr, AllDunder): - assert before[cls] == {"init": cls.__init__, "setattr": cls.__setattr__, "delattr": cls.__delattr__} - - @pytest.mark.parametrize("predicting", [True, False]) def test_custom_batch_sampler(predicting): """This test asserts, that custom `BatchSampler`, with all the arguments, that are required in order to @@ -570,47 +252,6 @@ def __init__(self, extra_arg): dataloader = _update_dataloader(dataloader, dataloader.sampler, mode=RunningStage.PREDICTING) -@pytest.mark.parametrize( - [ - "args", - "kwargs", - "default_kwargs", - "arg_names", - "replace_key", - "replace_value", - "expected_status", - "expected_args", - "expected_kwargs", - ], - [ - pytest.param((), {}, {}, [], "a", 1, False, (), {}, id="empty"), - pytest.param((1,), {}, {}, ["a"], "a", 2, True, (2,), {}, id="simple1"), - pytest.param((1, 2, 3), {}, {}, ["a", "b", "c"], "b", False, True, (1, False, 3), {}, id="simple2"), - pytest.param((1, 2, 3), {"a": 1}, {}, ["b", "c", "d"], "a", 2, True, (1, 2, 3), {"a": 2}, id="simple_kwargs"), - pytest.param( - (1, 2, 3), - {"a": 1}, - {"e": 5}, - ["b", "c", "d"], - "e", - 2, - True, - (1, 2, 3), - {"a": 1, "e": 2}, - id="default_kwargs", - ), - ], -) -def test_replace_value_in_args( - args, kwargs, default_kwargs, arg_names, replace_key, replace_value, expected_status, 
expected_args, expected_kwargs -): - assert _replace_value_in_saved_args(replace_key, replace_value, args, kwargs, default_kwargs, arg_names) == ( - expected_status, - expected_args, - expected_kwargs, - ) - - def test_dataloader_disallow_batch_sampler(): dataset = RandomDataset(5, 100) dataloader = DataLoader(dataset, batch_size=10) diff --git a/tests/tests_pytorch/utilities/test_device_parser.py b/tests/tests_pytorch/utilities/test_device_parser.py index d496db487f55c..a4a84892a6e8d 100644 --- a/tests/tests_pytorch/utilities/test_device_parser.py +++ b/tests/tests_pytorch/utilities/test_device_parser.py @@ -16,7 +16,7 @@ import pytest import torch -from pytorch_lightning.utilities import device_parser +from lightning_lite.utilities import device_parser @pytest.mark.skipif( diff --git a/tests/tests_pytorch/utilities/test_distributed.py b/tests/tests_pytorch/utilities/test_distributed.py index c3c90b5da6a21..2e2c88dd7a4a5 100644 --- a/tests/tests_pytorch/utilities/test_distributed.py +++ b/tests/tests_pytorch/utilities/test_distributed.py @@ -13,13 +13,12 @@ # limitations under the License. import os -import pytest import torch import torch.distributed import torch.multiprocessing as mp import tests_pytorch.helpers.utils as tutils -from pytorch_lightning.utilities.distributed import _collect_states_on_rank_zero, gather_all_tensors +from pytorch_lightning.utilities.distributed import _collect_states_on_rank_zero from tests_pytorch.helpers.runif import RunIf @@ -44,57 +43,3 @@ def test_collect_states(): """ tutils.set_random_main_port() mp.spawn(_test_collect_states, args=(2,), nprocs=2) - - -def _test_all_gather_uneven_tensors(rank, world_size, backend): - os.environ["MASTER_ADDR"] = "localhost" - - if backend == "nccl": - device = torch.device("cuda", rank) - torch.cuda.set_device(device) - else: - device = torch.device("cpu") - - # initialize the process group - torch.distributed.init_process_group(backend, rank=rank, world_size=world_size) - - tensor = torch.ones(rank, device=device) - result = gather_all_tensors(tensor) - assert len(result) == world_size - for idx in range(world_size): - assert len(result[idx]) == idx - assert (result[idx] == torch.ones_like(result[idx])).all() - - -def _test_all_gather_uneven_tensors_multidim(rank, world_size, backend): - os.environ["MASTER_ADDR"] = "localhost" - - if backend == "nccl": - device = torch.device("cuda", rank) - torch.cuda.set_device(device) - else: - device = torch.device("cpu") - - # initialize the process group - torch.distributed.init_process_group(backend, rank=rank, world_size=world_size) - tensor = torch.ones(rank + 1, 2 - rank, device=device) - result = gather_all_tensors(tensor) - assert len(result) == world_size - for idx in range(world_size): - val = result[idx] - assert val.shape == (idx + 1, 2 - idx) - assert (val == torch.ones_like(val)).all() - - -@RunIf(min_torch="1.10", skip_windows=True) -@pytest.mark.parametrize( - "process", - [ - _test_all_gather_uneven_tensors_multidim, - _test_all_gather_uneven_tensors, - ], -) -@pytest.mark.parametrize("backend", [pytest.param("nccl", marks=RunIf(min_cuda_gpus=2)), "gloo"]) -def test_gather_all_tensors(backend, process): - tutils.set_random_main_port() - mp.spawn(process, args=(2, backend), nprocs=2) diff --git a/tests/tests_pytorch/utilities/test_enums.py b/tests/tests_pytorch/utilities/test_enums.py index 1519e177217bb..83b6c7b116bc9 100644 --- a/tests/tests_pytorch/utilities/test_enums.py +++ b/tests/tests_pytorch/utilities/test_enums.py @@ -11,15 +11,7 @@ # WITHOUT WARRANTIES 
OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from pytorch_lightning.utilities.enums import GradClipAlgorithmType, PrecisionType - - -def test_precision_supported_types(): - assert PrecisionType.supported_types() == ["16", "32", "64", "bf16", "mixed"] - assert PrecisionType.supported_type(16) - assert PrecisionType.supported_type("16") - assert not PrecisionType.supported_type(1) - assert not PrecisionType.supported_type("invalid") +from pytorch_lightning.utilities.enums import GradClipAlgorithmType def test_gradient_clip_algorithms(): diff --git a/tests/tests_pytorch/utilities/test_seed.py b/tests/tests_pytorch/utilities/test_seed.py index 502febcaa9223..ac76725616946 100644 --- a/tests/tests_pytorch/utilities/test_seed.py +++ b/tests/tests_pytorch/utilities/test_seed.py @@ -1,83 +1,13 @@ -import os import random -from unittest import mock import numpy as np import pytest import torch -import pytorch_lightning.utilities.seed as seed_utils -from pytorch_lightning.utilities.seed import _collect_rng_states, _set_rng_states, isolate_rng +from pytorch_lightning.utilities.seed import isolate_rng from tests_pytorch.helpers.runif import RunIf -@mock.patch.dict(os.environ, {}, clear=True) -def test_seed_stays_same_with_multiple_seed_everything_calls(): - """Ensure that after the initial seed everything, the seed stays the same for the same run.""" - with pytest.warns(UserWarning, match="No seed found"): - seed_utils.seed_everything() - initial_seed = os.environ.get("PL_GLOBAL_SEED") - - with pytest.warns(None) as record: - seed_utils.seed_everything() - assert not record # does not warn - seed = os.environ.get("PL_GLOBAL_SEED") - - assert initial_seed == seed - - -@mock.patch.dict(os.environ, {"PL_GLOBAL_SEED": "2020"}, clear=True) -def test_correct_seed_with_environment_variable(): - """Ensure that the PL_GLOBAL_SEED environment is read.""" - assert seed_utils.seed_everything() == 2020 - - -@mock.patch.dict(os.environ, {"PL_GLOBAL_SEED": "invalid"}, clear=True) -@mock.patch.object(seed_utils, attribute="_select_seed_randomly", new=lambda *_: 123) -def test_invalid_seed(): - """Ensure that we still fix the seed even if an invalid seed is given.""" - with pytest.warns(UserWarning, match="Invalid seed found"): - seed = seed_utils.seed_everything() - assert seed == 123 - - -@mock.patch.dict(os.environ, {}, clear=True) -@mock.patch.object(seed_utils, attribute="_select_seed_randomly", new=lambda *_: 123) -@pytest.mark.parametrize("seed", (10e9, -10e9)) -def test_out_of_bounds_seed(seed): - """Ensure that we still fix the seed even if an out-of-bounds seed is given.""" - with pytest.warns(UserWarning, match="is not in bounds"): - actual = seed_utils.seed_everything(seed) - assert actual == 123 - - -def test_reset_seed_no_op(): - """Test that the reset_seed function is a no-op when seed_everything() was not used.""" - assert "PL_GLOBAL_SEED" not in os.environ - seed_before = torch.initial_seed() - seed_utils.reset_seed() - assert torch.initial_seed() == seed_before - assert "PL_GLOBAL_SEED" not in os.environ - - -@pytest.mark.parametrize("workers", (True, False)) -def test_reset_seed_everything(workers): - """Test that we can reset the seed to the initial value set by seed_everything()""" - assert "PL_GLOBAL_SEED" not in os.environ - assert "PL_SEED_WORKERS" not in os.environ - - seed_utils.seed_everything(123, workers) - before = torch.rand(1) - assert os.environ["PL_GLOBAL_SEED"] == "123" - 
assert os.environ["PL_SEED_WORKERS"] == str(int(workers)) - - seed_utils.reset_seed() - after = torch.rand(1) - assert os.environ["PL_GLOBAL_SEED"] == "123" - assert os.environ["PL_SEED_WORKERS"] == str(int(workers)) - assert torch.allclose(before, after) - - @pytest.mark.parametrize("with_torch_cuda", [False, pytest.param(True, marks=RunIf(min_cuda_gpus=1))]) def test_isolate_rng(with_torch_cuda): """Test that the isolate_rng context manager isolates the random state from the outer scope.""" @@ -105,11 +35,3 @@ def test_isolate_rng(with_torch_cuda): with isolate_rng(): generated = [random.random() for _ in range(3)] assert random.random() == generated[0] - - -def test_backward_compatibility_rng_states_dict(): - """Test that an older rng_states_dict without the "torch.cuda" key does not crash.""" - states = _collect_rng_states() - assert "torch.cuda" in states - states.pop("torch.cuda") - _set_rng_states(states) diff --git a/tests/tests_pytorch/utilities/test_types.py b/tests/tests_pytorch/utilities/test_types.py index 5b523a43dc4f2..0782d3bc2e9f3 100644 --- a/tests/tests_pytorch/utilities/test_types.py +++ b/tests/tests_pytorch/utilities/test_types.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from pytorch_lightning.utilities.types import _Stateful +from lightning_lite.utilities.types import _Stateful def test_stateful_protocol(): diff --git a/tests/tests_pytorch/utilities/test_warnings.py b/tests/tests_pytorch/utilities/test_warnings.py index 223cd4e59f75f..e95a3423273c8 100644 --- a/tests/tests_pytorch/utilities/test_warnings.py +++ b/tests/tests_pytorch/utilities/test_warnings.py @@ -18,38 +18,7 @@ from contextlib import redirect_stderr from io import StringIO -from lightning_utilities.core.rank_zero import _warn, WarningCache - -from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_warn - if __name__ == "__main__": - stderr = StringIO() - # recording - with redirect_stderr(stderr): - _warn("test1") - _warn("test2", category=DeprecationWarning) - - rank_zero_warn("test3") - rank_zero_warn("test4", category=DeprecationWarning) - - rank_zero_deprecation("test5") - - cache = WarningCache() - cache.warn("test6") - cache.deprecation("test7") - - output = stderr.getvalue() - assert "test_warnings.py:29: UserWarning: test1" in output - assert "test_warnings.py:30: DeprecationWarning: test2" in output - - assert "test_warnings.py:32: UserWarning: test3" in output - assert "test_warnings.py:33: DeprecationWarning: test4" in output - - assert "test_warnings.py:35: LightningDeprecationWarning: test5" in output - - assert "test_warnings.py:38: UserWarning: test6" in output - assert "test_warnings.py:39: LightningDeprecationWarning: test7" in output - # check that logging is properly configured import logging From 12d4b2a5f52e36ad709c9d6e4f30b7bf2a806ac7 Mon Sep 17 00:00:00 2001 From: Rick Izzo Date: Wed, 7 Sep 2022 11:55:03 -0400 Subject: [PATCH 080/193] Fix(ci) ONC-114: reduce the ci load by only installing lmdb in tests (#14581) reduce the ci load by only installing lmdb in tests --- tests/tests_app_examples/custom_work_dependencies/app.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/tests_app_examples/custom_work_dependencies/app.py b/tests/tests_app_examples/custom_work_dependencies/app.py index a7dbb7ff70062..a821adf3fcffc 100644 --- 
a/tests/tests_app_examples/custom_work_dependencies/app.py +++ b/tests/tests_app_examples/custom_work_dependencies/app.py @@ -10,17 +10,13 @@ def build_commands(self): class WorkWithCustomDeps(LightningWork): def __init__(self, cloud_compute: CloudCompute = CloudCompute(), **kwargs): - build_config = CustomBuildConfig(requirements=["numpy", "pandas", "py"]) + build_config = CustomBuildConfig(requirements=["py"]) super().__init__(parallel=True, **kwargs, cloud_compute=cloud_compute, cloud_build_config=build_config) def run(self): # installed by the build commands and by requirements in the build config import lmdb - import numpy as np - import pandas as pd - print("installed numpy version:", np.__version__) - print("installed pandas version:", pd.__version__) print("installed lmdb version:", lmdb.__version__) From 6cf666c6f16f5ffe550efbca79e3e7f96a086a2d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 7 Sep 2022 16:22:34 +0000 Subject: [PATCH 081/193] Update neptune-client requirement from <0.16.4,>=0.10.0 to >=0.10.0,<0.16.8 in /requirements (#14582) Update neptune-client requirement in /requirements Updates the requirements on [neptune-client](https://github.com/neptune-ai/neptune-client) to permit the latest version. - [Release notes](https://github.com/neptune-ai/neptune-client/releases) - [Changelog](https://github.com/neptune-ai/neptune-client/blob/master/CHANGELOG.md) - [Commits](https://github.com/neptune-ai/neptune-client/compare/0.10.0...0.16.7) --- updated-dependencies: - dependency-name: neptune-client dependency-type: direct:production ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements/pytorch/loggers.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/pytorch/loggers.txt b/requirements/pytorch/loggers.txt index 573daaa541ced..f3a5dcfd5a39d 100644 --- a/requirements/pytorch/loggers.txt +++ b/requirements/pytorch/loggers.txt @@ -3,7 +3,7 @@ # all supported loggers -neptune-client>=0.10.0, <0.16.4 +neptune-client>=0.10.0, <0.16.8 comet-ml>=3.1.12, <3.31.8 mlflow>=1.0.0, <1.29.0 wandb>=0.10.22, <0.13.2 From 4d082e4dac7fec388bdc49578f1d50ae72e498fb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 7 Sep 2022 16:30:46 +0000 Subject: [PATCH 082/193] Update fastapi requirement from <=0.79.0 to <0.83.0 in /requirements (#14576) Updates the requirements on [fastapi](https://github.com/tiangolo/fastapi) to permit the latest version. - [Release notes](https://github.com/tiangolo/fastapi/releases) - [Commits](https://github.com/tiangolo/fastapi/compare/0.1.11...0.82.0) --- updated-dependencies: - dependency-name: fastapi dependency-type: direct:production ... 
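For context, the new constraint is an exclusive upper bound: it admits every fastapi release up to and including 0.82.0 (the latest at the time, per the compare link above) while still excluding 0.83.0. A minimal sketch of how such a specifier evaluates, assuming the `packaging` library is available (not something this commit touches):

    from packaging.specifiers import SpecifierSet
    from packaging.version import Version

    spec = SpecifierSet("<0.83.0")          # the range this commit permits
    assert Version("0.82.0") in spec        # latest release at the time: allowed
    assert Version("0.83.0") not in spec    # next version: still blocked
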
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements/pytorch/test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/pytorch/test.txt b/requirements/pytorch/test.txt index 87f619e64e8c5..d6488574e357d 100644 --- a/requirements/pytorch/test.txt +++ b/requirements/pytorch/test.txt @@ -13,5 +13,5 @@ scikit-learn>0.22.1, <1.1.3 onnxruntime<1.13.0 psutil<=5.9.1 # for `DeviceStatsMonitor` pandas>1.0, <=1.4.3 # needed in benchmarks -fastapi<=0.79.0 +fastapi<0.83.0 uvicorn<=0.18.2 From 06fa9ea3e0549c502a100a60938099163fe6d38b Mon Sep 17 00:00:00 2001 From: Kushashwa Ravi Shrimali Date: Thu, 8 Sep 2022 02:05:05 +0530 Subject: [PATCH 083/193] PL: update changelog post 1.7.5 release (#14570) Co-authored-by: Jirka Borovec --- src/pytorch_lightning/CHANGELOG.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 7fec6260b174d..ecf1ce319aa13 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -4,7 +4,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). -## [1.8.0] - 2022-MM-DD +## [unReleased] - 2022-MM-DD ### Added @@ -158,11 +158,13 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed an issue to keep downscaling the batch size in case there hasn't been even a single successful optimal batch size with `mode="power"` ([#14372](https://github.com/Lightning-AI/lightning/pull/14372)) -- Squeezed tensor values when logging with `LightningModule.log` ([#14489](https://github.com/Lightning-AI/lightning/pull/14489)) +## [1.7.5] - 2022-09-06 +### Fixed +- Squeezed tensor values when logging with `LightningModule.log` ([#14489](https://github.com/Lightning-AI/lightning/pull/14489)) - Fixed `WandbLogger` `save_dir` is not set after creation ([#14326](https://github.com/Lightning-AI/lightning/pull/14326)) - +- Fixed `Trainer.estimated_stepping_batches` when maximum number of epochs is not set ([#14317](https://github.com/Lightning-AI/lightning/pull/14317)) ## [1.7.4] - 2022-08-31 From 59fcabf53a72b23e6efa0a65f58c79cd9bff0b1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 7 Sep 2022 22:56:39 +0200 Subject: [PATCH 084/193] Refactor `_get_rank` utility to take strategy instead of trainer (#14546) --- src/pytorch_lightning/callbacks/early_stopping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch_lightning/callbacks/early_stopping.py b/src/pytorch_lightning/callbacks/early_stopping.py index 6c1a43e1d140c..4acff87dfaa60 100644 --- a/src/pytorch_lightning/callbacks/early_stopping.py +++ b/src/pytorch_lightning/callbacks/early_stopping.py @@ -261,7 +261,7 @@ def _improvement_message(self, current: Tensor) -> str: @staticmethod def _log_info(trainer: Optional["pl.Trainer"], message: str, log_rank_zero_only: bool) -> None: - rank = _get_rank(trainer) + rank = _get_rank(strategy=(trainer.strategy if trainer is not None else None)) if trainer is not None and trainer.world_size <= 1: rank = None message = rank_prefixed_message(message, rank) From 95374440ce9b4f499f83b0e6b8cee0bc715aa18b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 7 Sep 2022 23:22:46 +0200 Subject: [PATCH 085/193] Move device parser tests inside Lite 
(#14586) --- .../utilities/test_device_parser.py | 89 ++++++++++++++++++- tests/tests_pytorch/models/test_gpu.py | 60 ------------- 2 files changed, 86 insertions(+), 63 deletions(-) diff --git a/tests/tests_lite/utilities/test_device_parser.py b/tests/tests_lite/utilities/test_device_parser.py index bb6e1665efde7..0f005d5ce3a37 100644 --- a/tests/tests_lite/utilities/test_device_parser.py +++ b/tests/tests_lite/utilities/test_device_parser.py @@ -16,7 +16,90 @@ import pytest import torch -import lightning_lite.utilities.device_parser +from lightning_lite.utilities import device_parser +from lightning_lite.utilities.exceptions import MisconfigurationException + +_PRETEND_N_OF_GPUS = 16 + + +@pytest.fixture +def mocked_device_count(monkeypatch): + def device_count(): + return _PRETEND_N_OF_GPUS + + def is_available(): + return True + + monkeypatch.setattr(device_parser, "is_cuda_available", is_available) + monkeypatch.setattr(device_parser, "num_cuda_devices", device_count) + + +@pytest.fixture +def mocked_device_count_0(monkeypatch): + def device_count(): + return 0 + + monkeypatch.setattr(device_parser, "num_cuda_devices", device_count) + + +@pytest.mark.parametrize( + ["devices", "expected_root_gpu"], + [ + pytest.param(None, None, id="No gpus, expect gpu root device to be None"), + pytest.param([0], 0, id="Oth gpu, expect gpu root device to be 0."), + pytest.param([1], 1, id="1st gpu, expect gpu root device to be 1."), + pytest.param([3], 3, id="3rd gpu, expect gpu root device to be 3."), + pytest.param([1, 2], 1, id="[1, 2] gpus, expect gpu root device to be 1."), + ], +) +def test_determine_root_gpu_device(devices, expected_root_gpu): + assert device_parser.determine_root_gpu_device(devices) == expected_root_gpu + + +@pytest.mark.parametrize( + ["devices", "expected_gpu_ids"], + [ + (None, None), + (0, None), + ([], None), + (1, [0]), + (3, [0, 1, 2]), + pytest.param(-1, list(range(_PRETEND_N_OF_GPUS)), id="-1 - use all gpus"), + ([0], [0]), + ([1, 3], [1, 3]), + ((1, 3), [1, 3]), + ("0", None), + ("3", [0, 1, 2]), + ("1, 3", [1, 3]), + ("2,", [2]), + pytest.param("-1", list(range(_PRETEND_N_OF_GPUS)), id="'-1' - use all gpus"), + ], +) +def test_parse_gpu_ids(mocked_device_count, devices, expected_gpu_ids): + assert device_parser.parse_gpu_ids(devices, include_cuda=True) == expected_gpu_ids + + +@pytest.mark.parametrize("devices", [0.1, -2, False, [-1], [None], ["0"], [0, 0]]) +def test_parse_gpu_fail_on_unsupported_inputs(mocked_device_count, devices): + with pytest.raises(MisconfigurationException): + device_parser.parse_gpu_ids(devices, include_cuda=True) + + +@pytest.mark.parametrize("devices", [[1, 2, 19], -1, "-1"]) +def test_parse_gpu_fail_on_non_existent_id(mocked_device_count_0, devices): + with pytest.raises(MisconfigurationException): + device_parser.parse_gpu_ids(devices, include_cuda=True) + + +def test_parse_gpu_fail_on_non_existent_id_2(mocked_device_count): + with pytest.raises(MisconfigurationException): + device_parser.parse_gpu_ids([1, 2, 19], include_cuda=True) + + +@pytest.mark.parametrize("devices", [-1, "-1"]) +def test_parse_gpu_returns_none_when_no_devices_are_available(mocked_device_count_0, devices): + with pytest.raises(MisconfigurationException): + device_parser.parse_gpu_ids(devices, include_cuda=True) @pytest.mark.skipif( @@ -27,5 +110,5 @@ def test_num_cuda_devices_without_forking(*_): """This merely tests that on platforms without fork support our helper functions fall back to the default implementation for determining cuda availability.""" - assert 
lightning_lite.utilities.device_parser.is_cuda_available() - assert lightning_lite.utilities.device_parser.num_cuda_devices() == 2 + assert device_parser.is_cuda_available() + assert device_parser.num_cuda_devices() == 2 diff --git a/tests/tests_pytorch/models/test_gpu.py b/tests/tests_pytorch/models/test_gpu.py index 1f15f2a5969aa..3951a500b8481 100644 --- a/tests/tests_pytorch/models/test_gpu.py +++ b/tests/tests_pytorch/models/test_gpu.py @@ -109,66 +109,6 @@ def test_root_gpu_property_0_raising(mocked_device_count_0, devices, expected_ro Trainer(accelerator="gpu", devices=devices, strategy=strategy) -@pytest.mark.parametrize( - ["devices", "expected_root_gpu"], - [ - pytest.param(None, None, id="No gpus, expect gpu root device to be None"), - pytest.param([0], 0, id="Oth gpu, expect gpu root device to be 0."), - pytest.param([1], 1, id="1st gpu, expect gpu root device to be 1."), - pytest.param([3], 3, id="3rd gpu, expect gpu root device to be 3."), - pytest.param([1, 2], 1, id="[1, 2] gpus, expect gpu root device to be 1."), - ], -) -def test_determine_root_gpu_device(devices, expected_root_gpu): - assert device_parser.determine_root_gpu_device(devices) == expected_root_gpu - - -@pytest.mark.parametrize( - ["devices", "expected_gpu_ids"], - [ - (None, None), - (0, None), - ([], None), - (1, [0]), - (3, [0, 1, 2]), - pytest.param(-1, list(range(PRETEND_N_OF_GPUS)), id="-1 - use all gpus"), - ([0], [0]), - ([1, 3], [1, 3]), - ((1, 3), [1, 3]), - ("0", None), - ("3", [0, 1, 2]), - ("1, 3", [1, 3]), - ("2,", [2]), - pytest.param("-1", list(range(PRETEND_N_OF_GPUS)), id="'-1' - use all gpus"), - ], -) -def test_parse_gpu_ids(mocked_device_count, devices, expected_gpu_ids): - assert device_parser.parse_gpu_ids(devices, include_cuda=True) == expected_gpu_ids - - -@pytest.mark.parametrize("devices", [0.1, -2, False, [-1], [None], ["0"], [0, 0]]) -def test_parse_gpu_fail_on_unsupported_inputs(mocked_device_count, devices): - with pytest.raises(MisconfigurationException): - device_parser.parse_gpu_ids(devices, include_cuda=True) - - -@pytest.mark.parametrize("devices", [[1, 2, 19], -1, "-1"]) -def test_parse_gpu_fail_on_non_existent_id(mocked_device_count_0, devices): - with pytest.raises(MisconfigurationException): - device_parser.parse_gpu_ids(devices, include_cuda=True) - - -def test_parse_gpu_fail_on_non_existent_id_2(mocked_device_count): - with pytest.raises(MisconfigurationException): - device_parser.parse_gpu_ids([1, 2, 19], include_cuda=True) - - -@pytest.mark.parametrize("devices", [-1, "-1"]) -def test_parse_gpu_returns_none_when_no_devices_are_available(mocked_device_count_0, devices): - with pytest.raises(MisconfigurationException): - device_parser.parse_gpu_ids(devices, include_cuda=True) - - @mock.patch.dict( os.environ, { From b0683b4eaeb5c630ad6cd57a4ea05bcbb59688f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 8 Sep 2022 15:29:28 +0200 Subject: [PATCH 086/193] Use the `pull_request_target` workflow event (#14603) * Use the `pull_request_target` workflow event * Minor cleanup * ready_for_review --- .github/workflows/ci-app-examples.yml | 3 ++- .github/workflows/ci-app-tests.yml | 3 ++- .github/workflows/ci-circleci.yml | 2 +- .github/workflows/ci-lite-test-full.yml | 4 ++-- .github/workflows/ci-pkg-install.yml | 2 +- .github/workflows/ci-pytorch-test-conda.yml | 2 +- .github/workflows/ci-pytorch-test-full.yml | 4 ++-- .github/workflows/ci-pytorch-test-slow.yml | 4 ++-- .github/workflows/code-checks.yml | 2 +- .github/workflows/docs-checks.yml | 2 
+- .github/workflows/probot-auto-cc.yml | 2 +- .github/workflows/release-pypi.yml | 2 +- 12 files changed, 17 insertions(+), 15 deletions(-) diff --git a/.github/workflows/ci-app-examples.yml b/.github/workflows/ci-app-examples.yml index 0480322d2b39d..511c2264e0848 100644 --- a/.github/workflows/ci-app-examples.yml +++ b/.github/workflows/ci-app-examples.yml @@ -1,11 +1,12 @@ name: Test App - examples # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows -on: # Trigger the workflow on push or pull request, but only for the master branch +on: push: branches: [master, "release/*"] pull_request: branches: [master, "release/*"] + types: [opened, reopened, ready_for_review, synchronize] # add `ready_for_review` since draft is skipped paths: - ".github/workflows/ci-app-examples.yml" - "src/lightning_app/**" diff --git a/.github/workflows/ci-app-tests.yml b/.github/workflows/ci-app-tests.yml index 4565cee2d36d2..422dc746a6fcc 100644 --- a/.github/workflows/ci-app-tests.yml +++ b/.github/workflows/ci-app-tests.yml @@ -1,10 +1,11 @@ name: Test App # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows -on: # Trigger the workflow on push or pull request, but only for the master branch +on: push: branches: [master, "release/*"] pull_request: + branches: [master, "release/*"] paths: - ".github/workflows/ci-app-tests.yml" - "src/lightning_app/**" diff --git a/.github/workflows/ci-circleci.yml b/.github/workflows/ci-circleci.yml index 697fa444f3dc9..751c3f9edc5f2 100644 --- a/.github/workflows/ci-circleci.yml +++ b/.github/workflows/ci-circleci.yml @@ -8,7 +8,7 @@ on: - "src/pytorch_lightning/**" - "tests/tests_pytorch/**" - "setup.cfg" # includes pytest config - pull_request: + pull_request_target: branches: [master, "release/*"] paths: - ".github/workflows/ci-circleci.yml" diff --git a/.github/workflows/ci-lite-test-full.yml b/.github/workflows/ci-lite-test-full.yml index 2830952e2407b..4449fca313f84 100644 --- a/.github/workflows/ci-lite-test-full.yml +++ b/.github/workflows/ci-lite-test-full.yml @@ -1,12 +1,12 @@ name: Test Lite full # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows -on: # Trigger the workflow on push or pull request, but only for the master branch +on: push: branches: [master, "release/*"] pull_request: branches: [master, "release/*"] - types: [opened, reopened, ready_for_review, synchronize] + types: [opened, reopened, ready_for_review, synchronize] # add `ready_for_review` since draft is skipped paths: - "requirements/lite/**" - "src/lightning_lite/**" diff --git a/.github/workflows/ci-pkg-install.yml b/.github/workflows/ci-pkg-install.yml index 7751dce429b3d..7993dd04d5852 100644 --- a/.github/workflows/ci-pkg-install.yml +++ b/.github/workflows/ci-pkg-install.yml @@ -1,7 +1,7 @@ name: Package # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows -on: # Trigger the workflow on push or pull request, but only for the master branch +on: push: branches: [master, "release/*"] pull_request: diff --git a/.github/workflows/ci-pytorch-test-conda.yml b/.github/workflows/ci-pytorch-test-conda.yml index 64d06a22949d8..c49361153eb4d 100644 --- a/.github/workflows/ci-pytorch-test-conda.yml +++ b/.github/workflows/ci-pytorch-test-conda.yml @@ -1,7 +1,7 @@ name: Test PyTorch with Conda # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows -on: # Trigger the workflow on push or pull request, but only for the master branch +on: push: branches: [master, 
"release/*"] pull_request: diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index b50dae0857587..18a1a6c08828a 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -1,12 +1,12 @@ name: Test PyTorch full # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows -on: # Trigger the workflow on push or pull request, but only for the master branch +on: push: branches: [master, "release/*"] pull_request: branches: [master, "release/*"] - types: [opened, reopened, ready_for_review, synchronize] + types: [opened, reopened, ready_for_review, synchronize] # add `ready_for_review` since draft is skipped concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} diff --git a/.github/workflows/ci-pytorch-test-slow.yml b/.github/workflows/ci-pytorch-test-slow.yml index 091c3f606c3ca..c1b2ab2292009 100644 --- a/.github/workflows/ci-pytorch-test-slow.yml +++ b/.github/workflows/ci-pytorch-test-slow.yml @@ -1,12 +1,12 @@ name: Test PyTorch slow # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows -on: # Trigger the workflow on push or pull request, but only for the master branch +on: push: branches: [master, "release/*"] pull_request: branches: [master, "release/*"] - types: [opened, reopened, ready_for_review, synchronize] + types: [opened, reopened, ready_for_review, synchronize] # add `ready_for_review` since draft is skipped paths: - "requirements/pytorch/**" - "src/pytorch_lightning/**" diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 15bd5e9911740..cf74675118fe7 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -1,6 +1,6 @@ name: Code check -on: # Trigger the workflow on push or pull request, but only for the master branch +on: push: branches: [master, "release/*"] pull_request: diff --git a/.github/workflows/docs-checks.yml b/.github/workflows/docs-checks.yml index bea17f946a8ab..c63fe01709fad 100644 --- a/.github/workflows/docs-checks.yml +++ b/.github/workflows/docs-checks.yml @@ -1,7 +1,7 @@ name: Check Docs # https://github.com/marketplace/actions/sphinx-build -on: # Trigger the workflow on push or pull request, but only for the master branch +on: push: branches: [master, "release/*"] pull_request: diff --git a/.github/workflows/probot-auto-cc.yml b/.github/workflows/probot-auto-cc.yml index 5bebcf9667b75..9cc302b299375 100644 --- a/.github/workflows/probot-auto-cc.yml +++ b/.github/workflows/probot-auto-cc.yml @@ -3,7 +3,7 @@ name: Probot on: issues: types: [labeled] - pull_request: + pull_request_target: types: [labeled, ready_for_review] jobs: diff --git a/.github/workflows/release-pypi.yml b/.github/workflows/release-pypi.yml index 2c6f5da240f63..763665f142cfb 100644 --- a/.github/workflows/release-pypi.yml +++ b/.github/workflows/release-pypi.yml @@ -1,7 +1,7 @@ name: PyPI # https://help.github.com/en/actions/reference/events-that-trigger-workflows -on: # Trigger the workflow on push or pull request, but only for the master branch +on: push: branches: [master, "release/*"] release: From b84c03f3a6c5327814c6aaab5634e5b5d46919d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 8 Sep 2022 16:12:58 +0200 Subject: [PATCH 087/193] Update checkgroup config (#14587) --- .github/checkgroup.yml | 91 ++++++++++++++++++++++++++++++++---------- 1 file changed, 71 insertions(+), 20 deletions(-) diff --git 
a/.github/checkgroup.yml b/.github/checkgroup.yml index e8892926f6e55..6acef1517738c 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -3,9 +3,13 @@ custom_service_name: "Lightning CI required checker" # changes made in pull requests from different branches or forks are ignored. This means that changes to this file # will only be used after they are merged. subprojects: + + # SECTION: pytorch_lightning + - id: "CI: CircleCI" paths: - ".circleci/**" + - ".github/workflows/ci-circleci.yml" checks: - "test-on-tpus" @@ -25,7 +29,6 @@ subprojects: - "src/pytorch_lightning/**" - "tests/tests_pytorch/**" - "setup.cfg" # includes pytest config - - ".github/workflows/ci-pytorch*.yml" - ".github/workflows/docs-*.yml" checks: - "pl-conda (3.8, 1.10)" @@ -41,15 +44,6 @@ subprojects: - "pl-cpu (windows-2022, 3.10, latest, stable)" - "pl-cpu (windows-2022, 3.7, latest, stable)" - "pl-cpu (windows-2022, 3.7, oldest, stable)" - - "lite-cpu (macOS-11, 3.10, latest, stable)" - - "lite-cpu (macOS-11, 3.7, latest, stable)" - - "lite-cpu (macOS-11, 3.7, oldest, stable)" - - "lite-cpu (ubuntu-20.04, 3.10, latest, stable)" - - "lite-cpu (ubuntu-20.04, 3.7, latest, stable)" - - "lite-cpu (ubuntu-20.04, 3.7, oldest, stable)" - - "lite-cpu (windows-2022, 3.10, latest, stable)" - - "lite-cpu (windows-2022, 3.7, latest, stable)" - - "lite-cpu (windows-2022, 3.7, oldest, stable)" - "make-doctest (pytorch)" - "make-html (pytorch)" - "mypy" @@ -62,6 +56,37 @@ subprojects: - "pl-slow (windows-2022, 3.7, 1.11)" - "test-on-tpus" + - id: "pytorch_lightning: Conda" + paths: + - ".github/workflows/ci-pytorch-test-conda.yml" + checks: + - "pl-conda (3.8, 1.10)" + - "pl-conda (3.8, 1.9)" + - "pl-conda (3.9, 1.11)" + - "pl-conda (3.9, 1.12)" + + - id: "pytorch_lightning: CPU" + paths: + - ".github/workflows/ci-pytorch-test-full.yml" + checks: + - "pl-cpu (macOS-11, 3.10, latest, stable)" + - "pl-cpu (macOS-11, 3.7, latest, stable)" + - "pl-cpu (macOS-11, 3.7, oldest, stable)" + - "pl-cpu (ubuntu-20.04, 3.10, latest, stable)" + - "pl-cpu (ubuntu-20.04, 3.7, latest, stable)" + - "pl-cpu (ubuntu-20.04, 3.7, oldest, stable)" + - "pl-cpu (windows-2022, 3.10, latest, stable)" + - "pl-cpu (windows-2022, 3.7, latest, stable)" + - "pl-cpu (windows-2022, 3.7, oldest, stable)" + + - id: "pytorch_lightning: Slow" + paths: + - ".github/workflows/ci-pytorch-test-slow.yml" + checks: + - "pl-slow (macOS-11, 3.7, 1.11)" + - "pl-slow (ubuntu-20.04, 3.7, 1.11)" + - "pl-slow (windows-2022, 3.7, 1.11)" + - id: "pytorch_lightning: Azure GPU" paths: - ".azure/gpu-tests.yml" @@ -69,13 +94,6 @@ subprojects: checks: - "pytorch-lightning (GPUs)" - - id: "lightning_lite: Azure GPU" - paths: - - ".azure/gpu-tests-lite.yml" - - "tests/tests_lite/run_standalone_*.sh" - checks: - - "lightning-lite (GPUs)" - - id: "pytorch_lightning: Azure HPU" paths: - ".azure/hpu-tests.yml" @@ -101,6 +119,7 @@ subprojects: - id: "pytorch_lightning: Docker" paths: - "dockers/**" + - ".github/workflows/ci-pytorch-dockers.yml" - "requirements.txt" - "requirements/*.txt" - "requirements/pytorch/*" @@ -126,12 +145,35 @@ subprojects: - "build-pl (3.9, 1.9, 11.1.1)" - "build-xla (3.7, 1.12)" - - id: "pytorch_lightning: mypy" + # SECTION: lightning_lite + + - id: "lightning_lite" paths: - - ".github/workflows/code-checks.yml" - - "pyproject.toml" # includes mypy config + - "requirements/lite/**" + - "src/lightning_lite/**" + - "tests/tests_lite/**" + - "setup.cfg" # includes pytest config checks: + - "lite-cpu (macOS-11, 3.10, latest, stable)" + - "lite-cpu 
(macOS-11, 3.7, latest, stable)" + - "lite-cpu (macOS-11, 3.7, oldest, stable)" + - "lite-cpu (ubuntu-20.04, 3.10, latest, stable)" + - "lite-cpu (ubuntu-20.04, 3.7, latest, stable)" + - "lite-cpu (ubuntu-20.04, 3.7, oldest, stable)" + - "lite-cpu (windows-2022, 3.10, latest, stable)" + - "lite-cpu (windows-2022, 3.7, latest, stable)" + - "lite-cpu (windows-2022, 3.7, oldest, stable)" - "mypy" + # TODO: lite should also require (some?) pl checks. this also requires that the path filters are modified + + - id: "lightning_lite: Azure GPU" + paths: + - ".azure/gpu-tests-lite.yml" + - "tests/tests_lite/run_standalone_*.sh" + checks: + - "lightning-lite (GPUs)" + + # SECTION: lightning_app - id: "lightning_app" paths: @@ -174,6 +216,15 @@ subprojects: - "make-doctest (app)" - "make-html (app)" + # SECTION: common + + - id: "mypy" + paths: + - ".github/workflows/code-checks.yml" + - "pyproject.toml" # includes mypy config + checks: + - "mypy" + - id: "install" paths: - ".actions/setup_tools.py" From 7cbf153332707b1677ac37d300c68659aa2a90c0 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 9 Sep 2022 20:38:03 +0200 Subject: [PATCH 088/193] CI: hotfix last version (#14627) LooseVersion was not correctly evaluation RC and set it as last even though the full release is out... --- .actions/assistant.py | 7 ++++++- .actions/requirements.txt | 3 +++ .azure/hpu-tests.yml | 2 +- .github/workflows/ci-lite-test-full.yml | 2 +- .github/workflows/ci-pkg-install.yml | 4 ++-- .github/workflows/ci-pytorch-test-full.yml | 2 +- .github/workflows/events-nightly.yml | 2 +- .github/workflows/release-pypi.yml | 6 +++--- 8 files changed, 18 insertions(+), 10 deletions(-) create mode 100644 .actions/requirements.txt diff --git a/.actions/assistant.py b/.actions/assistant.py index c67c9ba78b951..85a1b5879e109 100644 --- a/.actions/assistant.py +++ b/.actions/assistant.py @@ -1,6 +1,7 @@ import datetime import glob import json +import logging import os import re import shutil @@ -16,6 +17,7 @@ import fire import pkg_resources +from packaging.version import parse as version_parse REQUIREMENT_FILES = { "pytorch": ( @@ -123,7 +125,9 @@ def download_package(package: str, folder: str = ".", version: Optional[str] = N data = json.load(urlopen(Request(url))) if not version: versions = list(data["releases"].keys()) - version = sorted(versions, key=LooseVersion)[-1] + versions = sorted(versions, key=lambda x: version_parse(x)) + logging.debug(f"Available versions: {versions}") + version = versions[-1] releases = list(filter(lambda r: r["packagetype"] == "sdist", data["releases"][version])) assert releases, f"Missing 'sdist' for this package/version aka {package}/{version}" release = releases[0] @@ -131,6 +135,7 @@ def download_package(package: str, folder: str = ".", version: Optional[str] = N pkg_file = os.path.basename(pkg_url) pkg_path = os.path.join(folder, pkg_file) os.makedirs(folder, exist_ok=True) + print(f"downloading: {pkg_url}") request.urlretrieve(pkg_url, pkg_path) @staticmethod diff --git a/.actions/requirements.txt b/.actions/requirements.txt new file mode 100644 index 0000000000000..3b41e6584e7b4 --- /dev/null +++ b/.actions/requirements.txt @@ -0,0 +1,3 @@ +fire +packaging +requests diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 33206ef5c3e37..c445cc92dff46 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -60,7 +60,7 @@ jobs: set -e pip --version sudo pip uninstall -y lightning pytorch-lightning - pip install fire + pip install -q -r .actions/requirements.txt python 
.actions/assistant.py requirements-prune-pkgs torch,torchvision pip install ".[extra,test]" pip list diff --git a/.github/workflows/ci-lite-test-full.yml b/.github/workflows/ci-lite-test-full.yml index 4449fca313f84..e8091eb76d64a 100644 --- a/.github/workflows/ci-lite-test-full.yml +++ b/.github/workflows/ci-lite-test-full.yml @@ -51,7 +51,7 @@ jobs: - name: basic setup run: | pip --version - pip install -q fire + pip install -q -r .actions/requirements.txt - name: Setup Windows if: runner.os == 'windows' diff --git a/.github/workflows/ci-pkg-install.yml b/.github/workflows/ci-pkg-install.yml index 7993dd04d5852..69c602d2e2913 100644 --- a/.github/workflows/ci-pkg-install.yml +++ b/.github/workflows/ci-pkg-install.yml @@ -114,7 +114,7 @@ jobs: - name: Dowload package # todo: download also lite after it is fist published run: | - pip install -q fire requests + pip install -q -r .actions/requirements.txt for pkg in 'app' 'pytorch' ; do python .actions/assistant.py download-package "$pkg" --folder pypi done @@ -131,7 +131,7 @@ jobs: - name: Miror source run: | - pip install -q fire requests + pip install -q -r .actions/requirements.txt python .actions/assistant.py mirror-pkg2source pypi src ls -R src/ diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 18a1a6c08828a..e4c5ecd9cc0c1 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -71,7 +71,7 @@ jobs: if: ${{ (steps.skip.outputs.continue == '1') }} run: | pip --version - pip install -q fire + pip install -q -r .actions/requirements.txt # Github Actions: Run step on specific OS: https://stackoverflow.com/a/57948488/4521646 - name: Setup macOS diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml index 2576b05e33566..18b6b6fdbfdf8 100644 --- a/.github/workflows/events-nightly.yml +++ b/.github/workflows/events-nightly.yml @@ -26,7 +26,7 @@ jobs: - name: Build packages run: | - pip install -q fire + pip install -q -r .actions/requirements.txt python .actions/assistant.py prepare-nightly-version python setup.py sdist bdist_wheel ls -lh dist/ diff --git a/.github/workflows/release-pypi.yml b/.github/workflows/release-pypi.yml index 763665f142cfb..88b95e5b264a0 100644 --- a/.github/workflows/release-pypi.yml +++ b/.github/workflows/release-pypi.yml @@ -26,7 +26,7 @@ jobs: with: python-version: 3.9 - run: | - pip install -q fire requests + pip install -q -r .actions/requirements.txt mkdir dist && touch dist/.placeholder mkdir pypi && touch pypi/.placeholder - uses: actions/upload-artifact@v3 @@ -105,7 +105,7 @@ jobs: - name: Dowload package run: | - pip install -q fire requests + pip install -q -r .actions/requirements.txt python .actions/assistant.py download-package ${{ matrix.pkg }} --folder pypi - uses: actions/upload-artifact@v3 @@ -147,7 +147,7 @@ jobs: - name: Miror source run: | - pip install -q fire requests + pip install -q -r .actions/requirements.txt python .actions/assistant.py mirror-pkg2source pypi src ls -R src/ From 3a66798a6c612e06bfa06df6fb4f270d8de03e3e Mon Sep 17 00:00:00 2001 From: Laverne Henderson Date: Fri, 9 Sep 2022 11:59:03 -0700 Subject: [PATCH 089/193] Update content for S3 persistent storage (#14060) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update content for S3 persistent storage * Updates based on feedback * Fix unstructured validation issue * Updates based on feedback Co-authored-by: Jirka Borovec Co-authored-by: 
edenlightning <66261195+edenlightning@users.noreply.github.com> Co-authored-by: Luca Antiga Co-authored-by: Adrian Wälchli --- .../glossary/storage/drive_content.rst | 64 +++++++++++++------ 1 file changed, 44 insertions(+), 20 deletions(-) diff --git a/docs/source-app/glossary/storage/drive_content.rst b/docs/source-app/glossary/storage/drive_content.rst index c2c1357d0f6df..813bbeef47812 100644 --- a/docs/source-app/glossary/storage/drive_content.rst +++ b/docs/source-app/glossary/storage/drive_content.rst @@ -1,27 +1,48 @@ +************************** +What are Lightning Drives? +************************** -************ -About Drives -************ - -Lightning Drive storage makes it easy to share files between LightningWorks so you can run your Lightning App both locally and in the cloud without changing the code. +Lightning Drives are shared app storage that allow you to share files between `LightningWork (Work) <../../core_api/lightning_work/index.html>`_ components, so that you distributed components can share files when running on the cloud. Using drives, you can run your Lightning App both locally and in the cloud without changing the code. The Drive object provides a central place for your components to share data. -The Drive acts as an isolate folder and any component can access it by knowing its name. +The Drive acts as an isolated folder and any component can access it by knowing its name. + +We currently support two types of Drives: Lightning-managed (``lit://``) and S3 (``s3://``). -Your components can put, list, get, and delete files from and to the Drive (except LightningFlows). ++-----------------------------------+-------------------------------------------------------------------------------------------------------------------------------+ +| Lightning-managed (``lit://``) | Allows read-write operations and are accessible through the Drive API from a Work. | +| | | +| | They allow your components to put, list, get, and delete files from and to the Drive (except LightningFlows). | ++-----------------------------------+-------------------------------------------------------------------------------------------------------------------------------+ +| S3 (``s3://``) | S3 is AWS S3 storage mounted at a filesystem mount point. S3 is read-only (for now) and its primary purpose is | +| | to give you a permanent location to access your training data. | +| | | +| | They allow your components to list and get files located on the Drive. | ++-----------------------------------+-------------------------------------------------------------------------------------------------------------------------------+ ---- -*********************** -What Drive does for you -*********************** +********************** +What Drives do for you +********************** Think of every instance of the Drive object acting like a Google Drive or like Dropbox. By sharing the Drive between components through the LightningFlow, -several components can have a shared place to read and write files from. +several components can have a shared place to read (S3 Drives) or read and write (Lightning-managed Drives) files from. + +S3 Drive Limitations +^^^^^^^^^^^^^^^^^^^^ + +These limitations only apply to S3 Drives: + +* There is no top level “shareable” S3 drive object. Each S3 Drive is owned by a particular Work. However, it’s possible to create a Drive with the same location across multiple Works. + +* S3 buckets cannot be mounted as Drives once a Work has been instantiated. 
The `Drive` object must be initialized passed to a Work at creation time. + +* Whenever a Drive is mounted to a Work, an indexing process will be done again for the provided S3 bucket. This may lead to performance issues with particularly large S3 buckets. For context, 1M files with 2-3 levels of nesting takes less than 1 second to index. ---- @@ -29,7 +50,9 @@ several components can have a shared place to read and write files from. Create a Drive ************** -In order to create a Drive, you simply need to pass its name with the prefix ``lit://`` as follows: +In order to create a Drive, you simply need to pass its name with the prefix ``lit://`` or ``s3://``. + +.. note:: We do not support mounting single objects for S3 buckets, so there must be a trailing `/` in the s3:// URL. For example: ``s3://foo/bar/``. .. code-block:: python @@ -41,9 +64,9 @@ In order to create a Drive, you simply need to pass its name with the prefix ``l drive_1 = Drive("lit://drive_1") # The identifier of this Drive is ``drive_2`` - drive_2 = Drive("lit://drive_2") + drive_2 = Drive("s3://drive_2/") -Any components can create a drive object. +Any component can create a drive object for ``lit://`` Drives. .. code-block:: python @@ -74,7 +97,9 @@ Any components can create a drive object. Supported actions with Drives ***************************** -A Drive supports put, list, get, and delete actions. +A Lightning-managed Drive supports put, list, get, and delete actions. + +An S3 Drive supports list and get actions (for now). .. code-block:: python @@ -104,7 +129,7 @@ A Drive supports put, list, get, and delete actions. Component interactions with Drives ********************************** -Here is an illustrated code example on how to create drives within works. +Here is an illustrated code example on how to create drives within Works. .. figure:: https://pl-flash-data.s3.amazonaws.com/assets_lightning/drive_2.png @@ -167,17 +192,16 @@ Here is an illustrated code example on how to create drives within works. ---- -***************************** +************************* Transfer files with Drive -***************************** +************************* -In the example below, the Drive is created by the flow and passed to its LightningWork's. +In the example below, the Drive is created by the Flow and passed to its Works. The ``Work_1`` put a file **a.txt** in the **Drive("lit://this_drive_id")** and the ``Work_2`` can list and get the **a.txt** file from it. .. literalinclude:: ../../../examples/app_drive/app.py - ---- .. 
raw:: html From 2b50cbb4d3761c5fe03f137deb86fb2b2a6f2aa8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 9 Sep 2022 21:08:34 +0200 Subject: [PATCH 090/193] Avoid instantiating every accelerator in the registry (#14591) * Avoid instantiating every accelerator in the registry when listing available ones --- .../trainer/connectors/accelerator_connector.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index f3be6caa5be2e..d50fc86140da2 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -524,7 +524,9 @@ def _set_parallel_devices_and_init_accelerator(self) -> None: if not self.accelerator.is_available(): available_accelerator = [ - acc_str for acc_str in self._accelerator_types if AcceleratorRegistry.get(acc_str).is_available() + acc_str + for acc_str in self._accelerator_types + if AcceleratorRegistry[acc_str]["accelerator"].is_available() ] raise MisconfigurationException( f"{self.accelerator.__class__.__qualname__} can not run on your system" From d5b32c308723e09ec1c2d7bb980f6bfcc0de94a0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 9 Sep 2022 19:58:49 +0000 Subject: [PATCH 091/193] Update s3fs requirement from <=2022.7.1,>=2022.5.0 to >=2022.5.0,<2022.8.3 in /requirements (#14585) Update s3fs requirement in /requirements Updates the requirements on [s3fs](https://github.com/fsspec/s3fs) to permit the latest version. - [Release notes](https://github.com/fsspec/s3fs/releases) - [Changelog](https://github.com/fsspec/s3fs/blob/main/release-procedure.md) - [Commits](https://github.com/fsspec/s3fs/compare/2022.5.0...2022.8.2) --- updated-dependencies: - dependency-name: s3fs dependency-type: direct:production ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements/app/base.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/app/base.txt b/requirements/app/base.txt index 47e70a4682b69..003a174ef937e 100644 --- a/requirements/app/base.txt +++ b/requirements/app/base.txt @@ -3,7 +3,7 @@ packaging deepdiff>=5.7.0, <=5.8.1 starsessions>=1.2.1, <2.0 # strict fsspec>=2022.5.0, <=2022.7.1 -s3fs>=2022.5.0, <=2022.7.1 +s3fs>=2022.5.0, <2022.8.3 croniter>=1.3.0, <1.4.0 # strict; TODO: for now until we find something more robust. 
traitlets<5.2.0 # Traitlets 5.2.X fails: https://github.com/ipython/traitlets/issues/741 arrow>=1.2.0, <=1.2.2 From 7e9e441843d345d0adf0dd172e760b62bf4631cd Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Fri, 9 Sep 2022 21:04:57 +0100 Subject: [PATCH 092/193] Use TorchVision's Multi-weight Support and Model Registration API on Lightning (#14567) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- .../advanced/transfer_learning.rst | 2 +- examples/pl_basics/profiler_example.py | 4 ++-- .../computer_vision_fine_tuning.py | 6 +++--- examples/pl_domain_templates/imagenet.py | 8 ++++---- examples/pl_servable_module/production.py | 4 ++-- src/pytorch_lightning/utilities/imports.py | 4 ++-- .../utilities/model_helpers.py | 20 ++++++++++++++++++- .../tests_pytorch/helpers/advanced_models.py | 7 ++++--- tests/tests_pytorch/test_cli.py | 2 +- 9 files changed, 38 insertions(+), 19 deletions(-) diff --git a/docs/source-pytorch/advanced/transfer_learning.rst b/docs/source-pytorch/advanced/transfer_learning.rst index caa739bdfc1f6..2d221cf6f7f3e 100644 --- a/docs/source-pytorch/advanced/transfer_learning.rst +++ b/docs/source-pytorch/advanced/transfer_learning.rst @@ -62,7 +62,7 @@ Example: Imagenet (Computer Vision) super().__init__() # init a pretrained resnet - backbone = models.resnet50(pretrained=True) + backbone = models.resnet50(weights="DEFAULT") num_filters = backbone.fc.in_features layers = list(backbone.children())[:-1] self.feature_extractor = nn.Sequential(*layers) diff --git a/examples/pl_basics/profiler_example.py b/examples/pl_basics/profiler_example.py index 6df8f769973c6..39c147c938d06 100644 --- a/examples/pl_basics/profiler_example.py +++ b/examples/pl_basics/profiler_example.py @@ -27,12 +27,12 @@ import torch import torchvision -import torchvision.models as models import torchvision.transforms as T from pytorch_lightning import cli_lightning_logo, LightningDataModule, LightningModule from pytorch_lightning.cli import LightningCLI from pytorch_lightning.profilers.pytorch import PyTorchProfiler +from pytorch_lightning.utilities.model_helpers import get_torchvision_model DEFAULT_CMD_LINE = ( "fit", @@ -49,7 +49,7 @@ class ModelToProfile(LightningModule): def __init__(self, name: str = "resnet18", automatic_optimization: bool = True): super().__init__() - self.model = getattr(models, name)(pretrained=True) + self.model = get_torchvision_model(name, weights="DEFAULT") self.criterion = torch.nn.CrossEntropyLoss() self.automatic_optimization = automatic_optimization self.training_step = ( diff --git a/examples/pl_domain_templates/computer_vision_fine_tuning.py b/examples/pl_domain_templates/computer_vision_fine_tuning.py index 7a81df983996c..afcfa8f90066b 100644 --- a/examples/pl_domain_templates/computer_vision_fine_tuning.py +++ b/examples/pl_domain_templates/computer_vision_fine_tuning.py @@ -50,7 +50,7 @@ from torch.optim.optimizer import Optimizer from torch.utils.data import DataLoader from torchmetrics import Accuracy -from torchvision import models, transforms +from torchvision import transforms from torchvision.datasets import ImageFolder from torchvision.datasets.utils import download_and_extract_archive @@ -58,6 +58,7 @@ from pytorch_lightning.callbacks.finetuning import BaseFinetuning from pytorch_lightning.cli import LightningCLI from pytorch_lightning.utilities import rank_zero_info +from pytorch_lightning.utilities.model_helpers import get_torchvision_model log = logging.getLogger(__name__) 
DATA_URL = "https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip" @@ -193,8 +194,7 @@ def __build_model(self): """Define model layers & loss.""" # 1. Load pre-trained network: - model_func = getattr(models, self.backbone) - backbone = model_func(pretrained=True) + backbone = get_torchvision_model(self.backbone, weights="DEFAULT") _layers = list(backbone.children())[:-1] self.feature_extractor = nn.Sequential(*_layers) diff --git a/examples/pl_domain_templates/imagenet.py b/examples/pl_domain_templates/imagenet.py index efb9c40eea061..0a3b55d2a6a04 100644 --- a/examples/pl_domain_templates/imagenet.py +++ b/examples/pl_domain_templates/imagenet.py @@ -40,7 +40,6 @@ import torch.utils.data import torch.utils.data.distributed import torchvision.datasets as datasets -import torchvision.models as models import torchvision.transforms as transforms from torch.utils.data import Dataset from torchmetrics import Accuracy @@ -49,6 +48,7 @@ from pytorch_lightning.callbacks import ModelCheckpoint, TQDMProgressBar from pytorch_lightning.cli import LightningCLI from pytorch_lightning.strategies import ParallelStrategy +from pytorch_lightning.utilities.model_helpers import get_torchvision_model class ImageNetLightningModel(LightningModule): @@ -63,7 +63,7 @@ def __init__( self, data_path: str, arch: str = "resnet18", - pretrained: bool = False, + weights: Optional[str] = None, lr: float = 0.1, momentum: float = 0.9, weight_decay: float = 1e-4, @@ -72,14 +72,14 @@ def __init__( ): super().__init__() self.arch = arch - self.pretrained = pretrained + self.weights = weights self.lr = lr self.momentum = momentum self.weight_decay = weight_decay self.data_path = data_path self.batch_size = batch_size self.workers = workers - self.model = models.__dict__[self.arch](pretrained=self.pretrained) + self.model = get_torchvision_model(self.arch, weights=self.weights) self.train_dataset: Optional[Dataset] = None self.eval_dataset: Optional[Dataset] = None self.train_acc1 = Accuracy(top_k=1) diff --git a/examples/pl_servable_module/production.py b/examples/pl_servable_module/production.py index 3ecd72376417a..4005fecb7307d 100644 --- a/examples/pl_servable_module/production.py +++ b/examples/pl_servable_module/production.py @@ -7,13 +7,13 @@ import numpy as np import torch import torchvision -import torchvision.models as models import torchvision.transforms as T from PIL import Image as PILImage from pytorch_lightning import cli_lightning_logo, LightningDataModule, LightningModule from pytorch_lightning.cli import LightningCLI from pytorch_lightning.serve import ServableModule, ServableModuleValidator +from pytorch_lightning.utilities.model_helpers import get_torchvision_model DATASETS_PATH = path.join(path.dirname(__file__), "..", "..", "Datasets") @@ -21,7 +21,7 @@ class LitModule(LightningModule): def __init__(self, name: str = "resnet18"): super().__init__() - self.model = getattr(models, name)(pretrained=True) + self.model = get_torchvision_model(name, weights="DEFAULT") self.model.fc = torch.nn.Linear(self.model.fc.in_features, 10) self.criterion = torch.nn.CrossEntropyLoss() diff --git a/src/pytorch_lightning/utilities/imports.py b/src/pytorch_lightning/utilities/imports.py index cbbfcc21ddaf4..d870d0faab823 100644 --- a/src/pytorch_lightning/utilities/imports.py +++ b/src/pytorch_lightning/utilities/imports.py @@ -17,7 +17,7 @@ import sys import torch -from lightning_utilities.core.imports import compare_version, module_available, package_available +from lightning_utilities.core.imports 
import compare_version, module_available, package_available, RequirementCache _IS_WINDOWS = platform.system() == "Windows" _IS_INTERACTIVE = hasattr(sys, "ps1") # https://stackoverflow.com/a/64523765 @@ -41,7 +41,7 @@ _PSUTIL_AVAILABLE = package_available("psutil") _RICH_AVAILABLE = package_available("rich") and compare_version("rich", operator.ge, "10.2.2") _TORCH_QUANTIZE_AVAILABLE = bool([eg for eg in torch.backends.quantized.supported_engines if eg != "none"]) -_TORCHVISION_AVAILABLE = package_available("torchvision") +_TORCHVISION_AVAILABLE = RequirementCache("torchvision") _XLA_AVAILABLE: bool = package_available("torch_xla") diff --git a/src/pytorch_lightning/utilities/model_helpers.py b/src/pytorch_lightning/utilities/model_helpers.py index 66ad264355669..b72e9320b364e 100644 --- a/src/pytorch_lightning/utilities/model_helpers.py +++ b/src/pytorch_lightning/utilities/model_helpers.py @@ -12,9 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. from functools import partial -from typing import Optional, Type +from typing import Any, Optional, Type from unittest.mock import Mock +from lightning_utilities.core.imports import RequirementCache +from torch import nn + import pytorch_lightning as pl @@ -54,3 +57,18 @@ def is_overridden(method_name: str, instance: Optional[object] = None, parent: O raise ValueError("The parent should define the method") return instance_attr.__code__ != parent_attr.__code__ + + +def get_torchvision_model(model_name: str, **kwargs: Any) -> nn.Module: + from pytorch_lightning.utilities.imports import _TORCHVISION_AVAILABLE + + if not _TORCHVISION_AVAILABLE: + raise ModuleNotFoundError(str(_TORCHVISION_AVAILABLE)) + + from torchvision import models + + torchvision_greater_equal_0_14 = RequirementCache("torchvision>=0.14.0") + # TODO: deprecate this function when 0.14 is the minimum supported torchvision + if torchvision_greater_equal_0_14: + return models.get_model(model_name, **kwargs) + return getattr(models, model_name)(**kwargs) diff --git a/tests/tests_pytorch/helpers/advanced_models.py b/tests/tests_pytorch/helpers/advanced_models.py index a305fe04e62d9..4b8ce9c60e315 100644 --- a/tests/tests_pytorch/helpers/advanced_models.py +++ b/tests/tests_pytorch/helpers/advanced_models.py @@ -20,11 +20,12 @@ from pytorch_lightning.core.module import LightningModule from pytorch_lightning.utilities.imports import _TORCHVISION_AVAILABLE +from pytorch_lightning.utilities.model_helpers import get_torchvision_model from tests_pytorch import _PATH_DATASETS from tests_pytorch.helpers.datasets import AverageDataset, MNIST, TrialMNIST if _TORCHVISION_AVAILABLE: - from torchvision import models, transforms + from torchvision import transforms from torchvision.datasets import CIFAR10 @@ -217,13 +218,13 @@ def train_dataloader(self): class ParityModuleCIFAR(LightningModule): - def __init__(self, backbone="resnet101", hidden_dim=1024, learning_rate=1e-3, pretrained=True): + def __init__(self, backbone="resnet101", hidden_dim=1024, learning_rate=1e-3, weights="DEFAULT"): super().__init__() self.save_hyperparameters() self.learning_rate = learning_rate self.num_classes = 10 - self.backbone = getattr(models, backbone)(pretrained=pretrained) + self.backbone = get_torchvision_model(backbone, weights=weights) self.classifier = torch.nn.Sequential( torch.nn.Linear(1000, hidden_dim), torch.nn.Linear(hidden_dim, self.num_classes) diff --git a/tests/tests_pytorch/test_cli.py b/tests/tests_pytorch/test_cli.py index 
46fc7e9b6217f..4d6a609a00a3c 100644 --- a/tests/tests_pytorch/test_cli.py +++ b/tests/tests_pytorch/test_cli.py @@ -520,7 +520,7 @@ def __init__(self, submodule1: LightningModule, submodule2: LightningModule, mai assert isinstance(cli.model.submodule2, BoringModel) -@pytest.mark.skipif(not _TORCHVISION_AVAILABLE, reason="Tests a bug with torchvision, but it's not available") +@pytest.mark.skipif(not _TORCHVISION_AVAILABLE, reason=str(_TORCHVISION_AVAILABLE)) def test_lightning_cli_torch_modules(tmpdir): class TestModule(BoringModel): def __init__(self, activation: torch.nn.Module = None, transform: Optional[List[torch.nn.Module]] = None): From 40868f7f43f308ca3b35b1e46a78bfa46fb940d7 Mon Sep 17 00:00:00 2001 From: Rui Wang <45031995+wangraying@users.noreply.github.com> Date: Sat, 10 Sep 2022 04:07:25 +0800 Subject: [PATCH 093/193] Add bagua support for CUDA 11.6 images (#14529) * Add support for bagua-cuda116 * Remove bagua-cuda115 from installation Co-authored-by: Akihiro Nitta --- .azure/gpu-tests.yml | 4 ++-- dockers/base-conda/Dockerfile | 4 ++-- dockers/base-cuda/Dockerfile | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index e53d8f07567ff..67e4f3d0bac19 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -74,11 +74,11 @@ jobs: python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'bagua' not in line] ; open(fname, 'w').writelines(lines)" TORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])") CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))") - CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [115,113,111,102] if $CUDA_VERSION_MM >= ver][0])") + CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [116,113,111,102] if $CUDA_VERSION_MM >= ver][0])") python ./requirements/pytorch/adjust-versions.py requirements/pytorch/base.txt ${PYTORCH_VERSION} python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt ${PYTORCH_VERSION} python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt ${PYTORCH_VERSION} - pip install "bagua-cuda$CUDA_VERSION_BAGUA>=0.9.0" + pip install "bagua-cuda$CUDA_VERSION_BAGUA" pip install -e .[strategies] --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html pip install --requirement requirements/pytorch/devel.txt --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html pip list diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile index c82c5a4dfa15f..12953af627604 100644 --- a/dockers/base-conda/Dockerfile +++ b/dockers/base-conda/Dockerfile @@ -143,8 +143,8 @@ RUN \ RUN \ # install Bagua CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \ - CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [115,113,111,102] if $CUDA_VERSION_MM >= ver][0])") && \ - pip install "bagua-cuda$CUDA_VERSION_BAGUA==0.9.0" && \ + CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [116,113,111,102] if $CUDA_VERSION_MM >= ver][0])") && \ + pip install "bagua-cuda$CUDA_VERSION_BAGUA" && \ if [[ "$CUDA_VERSION_MM" = "$CUDA_VERSION_BAGUA" ]]; then python -c "import bagua_core; bagua_core.install_deps()"; fi && \ python -c "import bagua; print(bagua.__version__)" diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 08692ff00ab78..8eaddb4dca081 100644 --- a/dockers/base-cuda/Dockerfile +++ 
b/dockers/base-cuda/Dockerfile @@ -131,8 +131,8 @@ RUN \ RUN \ # install Bagua CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \ - CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [115,113,111,102] if $CUDA_VERSION_MM >= ver][0])") && \ - pip install "bagua-cuda$CUDA_VERSION_BAGUA==0.9.0" && \ + CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [116,113,111,102] if $CUDA_VERSION_MM >= ver][0])") && \ + pip install "bagua-cuda$CUDA_VERSION_BAGUA" && \ if [[ "$CUDA_VERSION_MM" = "$CUDA_VERSION_BAGUA" ]]; then python -c "import bagua_core; bagua_core.install_deps()"; fi && \ python -c "import bagua; print(bagua.__version__)" From 3c23125aaba7db8d2d499c12f11ccfc3c58f5583 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 9 Sep 2022 20:13:18 +0000 Subject: [PATCH 094/193] Bump carmocca/probot from 1 to 2 (#14336) --- .github/workflows/probot-auto-cc.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/probot-auto-cc.yml b/.github/workflows/probot-auto-cc.yml index 9cc302b299375..388cd6ee51624 100644 --- a/.github/workflows/probot-auto-cc.yml +++ b/.github/workflows/probot-auto-cc.yml @@ -12,6 +12,6 @@ jobs: if: github.event_name == 'issue' || github.event.pull_request.draft == false timeout-minutes: 5 steps: - - uses: carmocca/probot@v1 + - uses: carmocca/probot@v2 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From a409b2305511b655352593c74cf064322f3a1090 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sun, 11 Sep 2022 04:33:54 +0900 Subject: [PATCH 095/193] Remove deprecated test_tube dependency from `environment.yml` (#14617) Co-authored-by: Jirka Borovec --- environment.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/environment.yml b/environment.yml index 7576bfe0e5ff3..df3fac2b0f0e8 100644 --- a/environment.yml +++ b/environment.yml @@ -46,7 +46,6 @@ dependencies: - torchvision>=0.10.* - pip: - - test-tube>=0.7.5 - mlflow>=1.0.0 - comet_ml>=3.1.12 - wandb>=0.10.22 From b33e3ef18aa8f50dd905e3762aaaa3117aa65705 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 10 Sep 2022 20:03:11 -0400 Subject: [PATCH 096/193] simplify storage import (#14638) * docs * t1 * simple import * simple import * simple import * simple import * Update __version__.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- requirements/docs/base.txt | 13 +++++++++++++ src/lightning/__init__.py | 3 ++- 2 files changed, 15 insertions(+), 1 deletion(-) create mode 100644 requirements/docs/base.txt diff --git a/requirements/docs/base.txt b/requirements/docs/base.txt new file mode 100644 index 0000000000000..1b00471602c60 --- /dev/null +++ b/requirements/docs/base.txt @@ -0,0 +1,13 @@ +sphinx>=4.0, <5.0 +myst-parser>=0.15, <0.17 +nbsphinx>=0.8.5, <=0.8.9 +pandoc>=1.0, <=2.2 +docutils>=0.16, <0.19 +sphinxcontrib-fulltoc>=1.0, <=1.2.0 +sphinxcontrib-mockautodoc +sphinx-autodoc-typehints>=1.11, <1.15 # strict; v1.15 failing on master (#11405) +sphinx-paramlinks>=0.5.1, <=0.5.4 +sphinx-togglebutton>=0.2, <=0.3.2 +sphinx-copybutton>=0.3, <=0.5.0 +sphinx-multiproject +jinja2>=3.0.0,<3.1.0 diff --git a/src/lightning/__init__.py b/src/lightning/__init__.py index 3de19170313de..8557df91ee9b1 100644 --- a/src/lightning/__init__.py +++ b/src/lightning/__init__.py @@ -30,9 +30,9 @@ def _detail(self: Any, message: str, *args: Any, **kwargs: Any) 
-> None: _logger.addHandler(_console) _logger.propagate = False - from lightning.__about__ import * # noqa: E402, F401, F403 from lightning.__version__ import version as __version__ # noqa: E402, F401 +from lightning.app import storage # noqa: E402 from lightning.app.core.app import LightningApp # noqa: E402 from lightning.app.core.flow import LightningFlow # noqa: E402 from lightning.app.core.work import LightningWork # noqa: E402 @@ -54,4 +54,5 @@ def _detail(self: Any, message: str, *args: Any, **kwargs: Any) -> None: "LightningModule", "Callback", "seed_everything", + "storage", ] From 024e7b8204846ead790cf5a098e4a1ca61fcb171 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 12 Sep 2022 12:20:08 +0200 Subject: [PATCH 097/193] Standalone Lite: Cluster Environments (#14509) --- src/lightning_lite/plugins/__init__.py | 1 + .../plugins/environments/__init__.py | 20 +++++++++++++++++++ .../environments/cluster_environment.py | 0 .../environments/kubeflow_environment.py | 2 +- .../environments/lightning_environment.py | 4 ++-- .../plugins/environments/lsf_environment.py | 7 ++++--- .../plugins/environments/slurm_environment.py | 2 +- .../environments/torchelastic_environment.py | 6 +++--- .../plugins/environments/xla_environment.py | 4 ++-- src/lightning_lite/utilities/device_parser.py | 5 +++-- src/lightning_lite/utilities/distributed.py | 4 ++-- src/pytorch_lightning/loops/utilities.py | 3 ++- src/pytorch_lightning/plugins/__init__.py | 4 ++-- .../plugins/environments/__init__.py | 14 ++++++------- .../plugins/environments/bagua_environment.py | 2 +- src/pytorch_lightning/strategies/bagua.py | 2 +- src/pytorch_lightning/strategies/ddp.py | 2 +- src/pytorch_lightning/strategies/ddp_spawn.py | 2 +- src/pytorch_lightning/strategies/deepspeed.py | 2 +- .../strategies/fully_sharded.py | 2 +- .../strategies/fully_sharded_native.py | 2 +- .../strategies/hpu_parallel.py | 2 +- src/pytorch_lightning/strategies/ipu.py | 2 +- .../strategies/launchers/multiprocessing.py | 5 +++-- .../strategies/launchers/subprocess_script.py | 2 +- src/pytorch_lightning/strategies/parallel.py | 2 +- src/pytorch_lightning/strategies/tpu_spawn.py | 2 +- .../connectors/accelerator_connector.py | 18 ++++++++--------- .../logger_connector/logger_connector.py | 2 +- .../trainer/connectors/signal_connector.py | 2 +- src/pytorch_lightning/trainer/trainer.py | 1 + tests/tests_lite/plugins/__init__.py | 0 .../plugins/environments/__init__.py | 0 .../environments/test_kubeflow_environment.py | 6 +++--- .../test_lightning_environment.py | 2 +- .../environments/test_lsf_environment.py | 2 +- .../environments/test_slurm_environment.py | 6 +++--- .../test_torchelastic_environment.py | 8 ++++---- .../environments/test_xla_environment.py | 10 +++++----- tests/tests_pytorch/conftest.py | 2 +- .../deprecated_api/test_remove_1-10.py | 2 +- tests/tests_pytorch/lite/test_parity.py | 2 +- tests/tests_pytorch/models/test_amp.py | 2 +- tests/tests_pytorch/models/test_gpu.py | 2 +- .../plugins/test_cluster_integration.py | 2 +- .../strategies/test_ddp_strategy.py | 2 +- tests/tests_pytorch/test_cli.py | 4 ++-- .../connectors/test_accelerator_connector.py | 12 +++++------ .../connectors/test_signal_connector.py | 2 +- 49 files changed, 110 insertions(+), 84 deletions(-) create mode 100644 src/lightning_lite/plugins/__init__.py create mode 100644 src/lightning_lite/plugins/environments/__init__.py rename src/{pytorch_lightning => lightning_lite}/plugins/environments/cluster_environment.py (100%) rename 
src/{pytorch_lightning => lightning_lite}/plugins/environments/kubeflow_environment.py (96%) rename src/{pytorch_lightning => lightning_lite}/plugins/environments/lightning_environment.py (95%) rename src/{pytorch_lightning => lightning_lite}/plugins/environments/lsf_environment.py (98%) rename src/{pytorch_lightning => lightning_lite}/plugins/environments/slurm_environment.py (98%) rename src/{pytorch_lightning => lightning_lite}/plugins/environments/torchelastic_environment.py (92%) rename src/{pytorch_lightning => lightning_lite}/plugins/environments/xla_environment.py (93%) create mode 100644 tests/tests_lite/plugins/__init__.py create mode 100644 tests/tests_lite/plugins/environments/__init__.py rename tests/{tests_pytorch => tests_lite}/plugins/environments/test_kubeflow_environment.py (91%) rename tests/{tests_pytorch => tests_lite}/plugins/environments/test_lightning_environment.py (97%) rename tests/{tests_pytorch => tests_lite}/plugins/environments/test_lsf_environment.py (98%) rename tests/{tests_pytorch => tests_lite}/plugins/environments/test_slurm_environment.py (92%) rename tests/{tests_pytorch => tests_lite}/plugins/environments/test_torchelastic_environment.py (91%) rename tests/{tests_pytorch => tests_lite}/plugins/environments/test_xla_environment.py (87%) diff --git a/src/lightning_lite/plugins/__init__.py b/src/lightning_lite/plugins/__init__.py new file mode 100644 index 0000000000000..e1c69ab16df2c --- /dev/null +++ b/src/lightning_lite/plugins/__init__.py @@ -0,0 +1 @@ +from lightning_lite.plugins.environments import ClusterEnvironment # noqa: F401 diff --git a/src/lightning_lite/plugins/environments/__init__.py b/src/lightning_lite/plugins/environments/__init__.py new file mode 100644 index 0000000000000..9a38d5daa144c --- /dev/null +++ b/src/lightning_lite/plugins/environments/__init__.py @@ -0,0 +1,20 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment # noqa: F401 +from lightning_lite.plugins.environments.kubeflow_environment import KubeflowEnvironment # noqa: F401 +from lightning_lite.plugins.environments.lightning_environment import LightningEnvironment # noqa: F401 +from lightning_lite.plugins.environments.lsf_environment import LSFEnvironment # noqa: F401 +from lightning_lite.plugins.environments.slurm_environment import SLURMEnvironment # noqa: F401 +from lightning_lite.plugins.environments.torchelastic_environment import TorchElasticEnvironment # noqa: F401 +from lightning_lite.plugins.environments.xla_environment import XLAEnvironment # noqa: F401 diff --git a/src/pytorch_lightning/plugins/environments/cluster_environment.py b/src/lightning_lite/plugins/environments/cluster_environment.py similarity index 100% rename from src/pytorch_lightning/plugins/environments/cluster_environment.py rename to src/lightning_lite/plugins/environments/cluster_environment.py diff --git a/src/pytorch_lightning/plugins/environments/kubeflow_environment.py b/src/lightning_lite/plugins/environments/kubeflow_environment.py similarity index 96% rename from src/pytorch_lightning/plugins/environments/kubeflow_environment.py rename to src/lightning_lite/plugins/environments/kubeflow_environment.py index 4e38f6d082c29..bd44ae9b3f0c2 100644 --- a/src/pytorch_lightning/plugins/environments/kubeflow_environment.py +++ b/src/lightning_lite/plugins/environments/kubeflow_environment.py @@ -15,7 +15,7 @@ import logging import os -from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment +from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment log = logging.getLogger(__name__) diff --git a/src/pytorch_lightning/plugins/environments/lightning_environment.py b/src/lightning_lite/plugins/environments/lightning_environment.py similarity index 95% rename from src/pytorch_lightning/plugins/environments/lightning_environment.py rename to src/lightning_lite/plugins/environments/lightning_environment.py index 5792d7cc16a67..9293c5506b025 100644 --- a/src/pytorch_lightning/plugins/environments/lightning_environment.py +++ b/src/lightning_lite/plugins/environments/lightning_environment.py @@ -15,8 +15,8 @@ import os import socket -from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment -from pytorch_lightning.utilities.rank_zero import rank_zero_only +from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment +from lightning_lite.utilities.rank_zero import rank_zero_only class LightningEnvironment(ClusterEnvironment): diff --git a/src/pytorch_lightning/plugins/environments/lsf_environment.py b/src/lightning_lite/plugins/environments/lsf_environment.py similarity index 98% rename from src/pytorch_lightning/plugins/environments/lsf_environment.py rename to src/lightning_lite/plugins/environments/lsf_environment.py index b1f592bd9b04d..c054f37f9ff71 100644 --- a/src/pytorch_lightning/plugins/environments/lsf_environment.py +++ b/src/lightning_lite/plugins/environments/lsf_environment.py @@ -11,14 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +import logging import os import socket from typing import Dict, List +from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment from lightning_lite.utilities.cloud_io import get_filesystem -from pytorch_lightning import _logger as log -from pytorch_lightning.plugins.environments import ClusterEnvironment + +log = logging.getLogger(__name__) class LSFEnvironment(ClusterEnvironment): diff --git a/src/pytorch_lightning/plugins/environments/slurm_environment.py b/src/lightning_lite/plugins/environments/slurm_environment.py similarity index 98% rename from src/pytorch_lightning/plugins/environments/slurm_environment.py rename to src/lightning_lite/plugins/environments/slurm_environment.py index c17d2d765464e..5973453194a28 100644 --- a/src/pytorch_lightning/plugins/environments/slurm_environment.py +++ b/src/lightning_lite/plugins/environments/slurm_environment.py @@ -17,7 +17,7 @@ import re from typing import Optional -from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment +from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment log = logging.getLogger(__name__) diff --git a/src/pytorch_lightning/plugins/environments/torchelastic_environment.py b/src/lightning_lite/plugins/environments/torchelastic_environment.py similarity index 92% rename from src/pytorch_lightning/plugins/environments/torchelastic_environment.py rename to src/lightning_lite/plugins/environments/torchelastic_environment.py index 2cd3f408f4964..f33c7dab0fea7 100644 --- a/src/pytorch_lightning/plugins/environments/torchelastic_environment.py +++ b/src/lightning_lite/plugins/environments/torchelastic_environment.py @@ -17,9 +17,9 @@ import torch.distributed -from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment -from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_9_1 -from pytorch_lightning.utilities.rank_zero import rank_zero_warn +from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment +from lightning_lite.utilities.imports import _TORCH_GREATER_EQUAL_1_9_1 +from lightning_lite.utilities.rank_zero import rank_zero_warn log = logging.getLogger(__name__) diff --git a/src/pytorch_lightning/plugins/environments/xla_environment.py b/src/lightning_lite/plugins/environments/xla_environment.py similarity index 93% rename from src/pytorch_lightning/plugins/environments/xla_environment.py rename to src/lightning_lite/plugins/environments/xla_environment.py index 4072f6f8715f5..da5a99c000d56 100644 --- a/src/pytorch_lightning/plugins/environments/xla_environment.py +++ b/src/lightning_lite/plugins/environments/xla_environment.py @@ -14,8 +14,8 @@ import logging import os -from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment -from pytorch_lightning.utilities.imports import _TPU_AVAILABLE +from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment +from lightning_lite.utilities.imports import _TPU_AVAILABLE if _TPU_AVAILABLE: import torch_xla.core.xla_env_vars as xenv diff --git a/src/lightning_lite/utilities/device_parser.py b/src/lightning_lite/utilities/device_parser.py index 78bf8a9a8c93f..f0e5802d07c0e 100644 --- a/src/lightning_lite/utilities/device_parser.py +++ b/src/lightning_lite/utilities/device_parser.py @@ -4,8 +4,9 @@ import torch +from lightning_lite.plugins.environments.torchelastic_environment import TorchElasticEnvironment + # TODO(lite): Fix the imports -# from 
lightning_lite.plugins.environments import TorchElasticEnvironment # from lightning_lite.strategies.launchers.multiprocessing import _is_forking_disabled from lightning_lite.utilities.exceptions import MisconfigurationException from lightning_lite.utilities.types import _DEVICE @@ -83,7 +84,7 @@ def parse_gpu_ids( raise MisconfigurationException("GPUs requested but none are available.") if ( - True # TorchElasticEnvironment.detect() # TODO(lite): Revert this once environments have moved + TorchElasticEnvironment.detect() and len(gpus) != 1 and len(_get_all_available_gpus(include_cuda=include_cuda, include_mps=include_mps)) == 1 ): diff --git a/src/lightning_lite/utilities/distributed.py b/src/lightning_lite/utilities/distributed.py index 77123c53ff14a..166b28a5c948f 100644 --- a/src/lightning_lite/utilities/distributed.py +++ b/src/lightning_lite/utilities/distributed.py @@ -6,6 +6,7 @@ from torch import Tensor from torch.nn import functional as F +from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment from lightning_lite.utilities.imports import _HPU_AVAILABLE, _TPU_AVAILABLE from lightning_lite.utilities.rank_zero import rank_zero_deprecation from lightning_lite.utilities.rank_zero import rank_zero_info as new_rank_zero_info @@ -203,8 +204,7 @@ def all_gather_ddp_if_available( def init_dist_connection( - # TODO(lite): Fix this type error - cluster_environment: "ClusterEnvironment", # type: ignore[name-defined] # noqa: F821 + cluster_environment: "ClusterEnvironment", torch_distributed_backend: str, global_rank: Optional[int] = None, world_size: Optional[int] = None, diff --git a/src/pytorch_lightning/loops/utilities.py b/src/pytorch_lightning/loops/utilities.py index d5824c431cea6..3dcc2f6531e8d 100644 --- a/src/pytorch_lightning/loops/utilities.py +++ b/src/pytorch_lightning/loops/utilities.py @@ -26,7 +26,8 @@ from lightning_lite.utilities.warnings import PossibleUserWarning from pytorch_lightning.callbacks.timer import Timer from pytorch_lightning.loops import Loop -from pytorch_lightning.strategies import ParallelStrategy, Strategy +from pytorch_lightning.strategies.parallel import ParallelStrategy +from pytorch_lightning.strategies.strategy import Strategy from pytorch_lightning.trainer.progress import BaseProgress from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.memory import recursive_detach diff --git a/src/pytorch_lightning/plugins/__init__.py b/src/pytorch_lightning/plugins/__init__.py index 5967b8debf3ad..261ff363897be 100644 --- a/src/pytorch_lightning/plugins/__init__.py +++ b/src/pytorch_lightning/plugins/__init__.py @@ -1,6 +1,6 @@ from typing import Union -from pytorch_lightning.plugins.environments import ClusterEnvironment +from lightning_lite.plugins.environments import ClusterEnvironment from pytorch_lightning.plugins.io.async_plugin import AsyncCheckpointIO from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.io.hpu_plugin import HPUCheckpointIO @@ -19,7 +19,7 @@ from pytorch_lightning.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin from pytorch_lightning.plugins.precision.tpu import TPUPrecisionPlugin from pytorch_lightning.plugins.precision.tpu_bf16 import TPUBf16PrecisionPlugin -from pytorch_lightning.strategies import Strategy +from pytorch_lightning.strategies.strategy import Strategy PLUGIN = Union[Strategy, PrecisionPlugin, ClusterEnvironment, CheckpointIO, LayerSync] PLUGIN_INPUT = 
Union[PLUGIN, str] diff --git a/src/pytorch_lightning/plugins/environments/__init__.py b/src/pytorch_lightning/plugins/environments/__init__.py index 3417f6007041b..2d422d9fc4b28 100644 --- a/src/pytorch_lightning/plugins/environments/__init__.py +++ b/src/pytorch_lightning/plugins/environments/__init__.py @@ -11,11 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment # noqa: F401 +from lightning_lite.plugins.environments.kubeflow_environment import KubeflowEnvironment # noqa: F401 +from lightning_lite.plugins.environments.lightning_environment import LightningEnvironment # noqa: F401 +from lightning_lite.plugins.environments.lsf_environment import LSFEnvironment # noqa: F401 +from lightning_lite.plugins.environments.slurm_environment import SLURMEnvironment # noqa: F401 +from lightning_lite.plugins.environments.torchelastic_environment import TorchElasticEnvironment # noqa: F401 +from lightning_lite.plugins.environments.xla_environment import XLAEnvironment # noqa: F401 from pytorch_lightning.plugins.environments.bagua_environment import BaguaEnvironment # noqa: F401 -from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment # noqa: F401 -from pytorch_lightning.plugins.environments.kubeflow_environment import KubeflowEnvironment # noqa: F401 -from pytorch_lightning.plugins.environments.lightning_environment import LightningEnvironment # noqa: F401 -from pytorch_lightning.plugins.environments.lsf_environment import LSFEnvironment # noqa: F401 -from pytorch_lightning.plugins.environments.slurm_environment import SLURMEnvironment # noqa: F401 -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment # noqa: F401 -from pytorch_lightning.plugins.environments.xla_environment import XLAEnvironment # noqa: F401 diff --git a/src/pytorch_lightning/plugins/environments/bagua_environment.py b/src/pytorch_lightning/plugins/environments/bagua_environment.py index 0516b264c2ac3..2bbd654ce2d82 100644 --- a/src/pytorch_lightning/plugins/environments/bagua_environment.py +++ b/src/pytorch_lightning/plugins/environments/bagua_environment.py @@ -15,7 +15,7 @@ import logging import os -from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment +from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment log = logging.getLogger(__name__) diff --git a/src/pytorch_lightning/strategies/bagua.py b/src/pytorch_lightning/strategies/bagua.py index a54267a32b7b7..77e832233d968 100644 --- a/src/pytorch_lightning/strategies/bagua.py +++ b/src/pytorch_lightning/strategies/bagua.py @@ -8,11 +8,11 @@ from torch.nn import Module import pytorch_lightning as pl +from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment from lightning_lite.utilities.distributed import ReduceOp from lightning_lite.utilities.optimizer import optimizers_to_device from lightning_lite.utilities.seed import reset_seed from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase -from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from 
pytorch_lightning.strategies.ddp import DDPStrategy diff --git a/src/pytorch_lightning/strategies/ddp.py b/src/pytorch_lightning/strategies/ddp.py index c0eaf47ff8485..768f12ae23332 100644 --- a/src/pytorch_lightning/strategies/ddp.py +++ b/src/pytorch_lightning/strategies/ddp.py @@ -29,6 +29,7 @@ from torch.optim.optimizer import Optimizer import pytorch_lightning as pl +from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment from lightning_lite.utilities.distributed import ( _get_process_group_backend_from_env, distributed_available, @@ -43,7 +44,6 @@ from pytorch_lightning.overrides.base import _LightningPrecisionModuleWrapperBase from pytorch_lightning.overrides.distributed import prepare_for_backward from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE -from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.launchers.subprocess_script import _SubprocessScriptLauncher diff --git a/src/pytorch_lightning/strategies/ddp_spawn.py b/src/pytorch_lightning/strategies/ddp_spawn.py index 35d90498131df..74d8f7c3f4c19 100644 --- a/src/pytorch_lightning/strategies/ddp_spawn.py +++ b/src/pytorch_lightning/strategies/ddp_spawn.py @@ -24,6 +24,7 @@ from typing_extensions import Literal import pytorch_lightning as pl +from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment from lightning_lite.utilities.distributed import ( _get_process_group_backend_from_env, distributed_available, @@ -35,7 +36,6 @@ from pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.overrides.base import _LightningPrecisionModuleWrapperBase from pytorch_lightning.overrides.distributed import prepare_for_backward -from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.launchers.multiprocessing import _MultiProcessingLauncher diff --git a/src/pytorch_lightning/strategies/deepspeed.py b/src/pytorch_lightning/strategies/deepspeed.py index 46634f00123db..ad8d4da106ec8 100644 --- a/src/pytorch_lightning/strategies/deepspeed.py +++ b/src/pytorch_lightning/strategies/deepspeed.py @@ -30,6 +30,7 @@ from torch.optim import Optimizer import pytorch_lightning as pl +from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment from lightning_lite.utilities.distributed import ( _get_process_group_backend_from_env, get_default_process_group_backend_for_device, @@ -42,7 +43,6 @@ from pytorch_lightning.accelerators.cuda import CUDAAccelerator from pytorch_lightning.core.optimizer import _init_optimizers_and_lr_schedulers from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase -from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.strategies.utils import _fp_to_half diff --git a/src/pytorch_lightning/strategies/fully_sharded.py b/src/pytorch_lightning/strategies/fully_sharded.py index add78dc35e41f..5cfad516fb444 100644 --- a/src/pytorch_lightning/strategies/fully_sharded.py 
+++ b/src/pytorch_lightning/strategies/fully_sharded.py @@ -18,11 +18,11 @@ import torch import pytorch_lightning as pl +from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment from lightning_lite.utilities.enums import PrecisionType from lightning_lite.utilities.optimizer import optimizers_to_device from pytorch_lightning.overrides.base import _LightningModuleWrapperBase from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE -from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.ddp import DDPStrategy diff --git a/src/pytorch_lightning/strategies/fully_sharded_native.py b/src/pytorch_lightning/strategies/fully_sharded_native.py index ed7c237c9bae5..243cfcac81529 100644 --- a/src/pytorch_lightning/strategies/fully_sharded_native.py +++ b/src/pytorch_lightning/strategies/fully_sharded_native.py @@ -19,6 +19,7 @@ from torch import Tensor import pytorch_lightning as pl +from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment from lightning_lite.utilities.distributed import ( _get_process_group_backend_from_env, get_default_process_group_backend_for_device, @@ -28,7 +29,6 @@ from lightning_lite.utilities.optimizer import optimizers_to_device from lightning_lite.utilities.seed import reset_seed from pytorch_lightning.overrides.base import _LightningModuleWrapperBase -from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.plugins.precision.fsdp_native_native_amp import FullyShardedNativeNativeMixedPrecisionPlugin diff --git a/src/pytorch_lightning/strategies/hpu_parallel.py b/src/pytorch_lightning/strategies/hpu_parallel.py index e7c18d34713d9..9bb3a2b4716fd 100644 --- a/src/pytorch_lightning/strategies/hpu_parallel.py +++ b/src/pytorch_lightning/strategies/hpu_parallel.py @@ -18,10 +18,10 @@ import torch.distributed import pytorch_lightning as pl +from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment from lightning_lite.utilities.distributed import group as _group from pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.overrides.torch_distributed import broadcast_object_list -from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.io.hpu_plugin import HPUCheckpointIO from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO diff --git a/src/pytorch_lightning/strategies/ipu.py b/src/pytorch_lightning/strategies/ipu.py index 69de6049711b7..7560a303a7949 100644 --- a/src/pytorch_lightning/strategies/ipu.py +++ b/src/pytorch_lightning/strategies/ipu.py @@ -21,10 +21,10 @@ from torch.utils.data import DataLoader, Sampler import pytorch_lightning as pl +from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment from lightning_lite.utilities.cloud_io import get_filesystem from lightning_lite.utilities.enums import PrecisionType from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase -from 
pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.parallel import ParallelStrategy diff --git a/src/pytorch_lightning/strategies/launchers/multiprocessing.py b/src/pytorch_lightning/strategies/launchers/multiprocessing.py index 31508067abf36..fdc17f8b8d90f 100644 --- a/src/pytorch_lightning/strategies/launchers/multiprocessing.py +++ b/src/pytorch_lightning/strategies/launchers/multiprocessing.py @@ -30,7 +30,6 @@ from lightning_lite.utilities.seed import _collect_rng_states, _set_rng_states from lightning_lite.utilities.types import _PATH from pytorch_lightning.strategies.launchers.base import _Launcher -from pytorch_lightning.strategies.strategy import Strategy from pytorch_lightning.trainer.states import TrainerFn, TrainerState from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_11 from pytorch_lightning.utilities.rank_zero import rank_zero_debug @@ -59,7 +58,9 @@ class _MultiProcessingLauncher(_Launcher): - 'forkserver': Alternative implementation to 'fork'. """ - def __init__(self, strategy: Strategy, start_method: Literal["spawn", "fork", "forkserver"] = "spawn") -> None: + def __init__( + self, strategy: "pl.strategies.Strategy", start_method: Literal["spawn", "fork", "forkserver"] = "spawn" + ) -> None: self._strategy = strategy self._start_method = start_method if start_method not in mp.get_all_start_methods(): diff --git a/src/pytorch_lightning/strategies/launchers/subprocess_script.py b/src/pytorch_lightning/strategies/launchers/subprocess_script.py index fd28fd3dcb20c..f9e565260f703 100644 --- a/src/pytorch_lightning/strategies/launchers/subprocess_script.py +++ b/src/pytorch_lightning/strategies/launchers/subprocess_script.py @@ -22,7 +22,7 @@ from lightning_utilities.core.imports import RequirementCache import pytorch_lightning as pl -from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment +from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.strategies.launchers.base import _Launcher _HYDRA_AVAILABLE = RequirementCache("hydra") diff --git a/src/pytorch_lightning/strategies/parallel.py b/src/pytorch_lightning/strategies/parallel.py index 124d01f362fef..e2b15fefe1a50 100644 --- a/src/pytorch_lightning/strategies/parallel.py +++ b/src/pytorch_lightning/strategies/parallel.py @@ -19,6 +19,7 @@ from torch import Tensor import pytorch_lightning as pl +from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment from lightning_lite.utilities.distributed import ( _get_process_group_backend_from_env, all_gather_ddp_if_available, @@ -26,7 +27,6 @@ ReduceOp, ) from pytorch_lightning.plugins import LayerSync -from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.strategy import Strategy diff --git a/src/pytorch_lightning/strategies/tpu_spawn.py b/src/pytorch_lightning/strategies/tpu_spawn.py index 52dec94ac3702..a84e703bbf1f9 100644 --- a/src/pytorch_lightning/strategies/tpu_spawn.py +++ b/src/pytorch_lightning/strategies/tpu_spawn.py @@ -22,12 +22,12 @@ from torch.utils.data import DataLoader import pytorch_lightning as pl +from 
lightning_lite.plugins.environments import XLAEnvironment from lightning_lite.utilities.data import has_len from lightning_lite.utilities.distributed import ReduceOp from lightning_lite.utilities.optimizer import optimizers_to_device from lightning_lite.utilities.types import _PATH from pytorch_lightning.overrides import LightningDistributedModule -from pytorch_lightning.plugins.environments import XLAEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO from pytorch_lightning.plugins.io.xla_plugin import XLACheckpointIO diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index d50fc86140da2..77f29d3c159e5 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -20,6 +20,14 @@ import torch from typing_extensions import Literal +from lightning_lite.plugins.environments import ( + ClusterEnvironment, + KubeflowEnvironment, + LightningEnvironment, + LSFEnvironment, + SLURMEnvironment, + TorchElasticEnvironment, +) from lightning_lite.utilities import _StrategyType, AMPType, device_parser, LightningEnum from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.cpu import CPUAccelerator @@ -44,15 +52,7 @@ TPUBf16PrecisionPlugin, TPUPrecisionPlugin, ) -from pytorch_lightning.plugins.environments import ( - BaguaEnvironment, - ClusterEnvironment, - KubeflowEnvironment, - LightningEnvironment, - LSFEnvironment, - SLURMEnvironment, - TorchElasticEnvironment, -) +from pytorch_lightning.plugins.environments import BaguaEnvironment from pytorch_lightning.plugins.layer_sync import LayerSync, NativeSyncBatchNorm from pytorch_lightning.plugins.precision.fsdp_native_native_amp import FullyShardedNativeNativeMixedPrecisionPlugin from pytorch_lightning.strategies import ( diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/src/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index 6c251c1c13277..a9205b67bd3e4 100644 --- a/src/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -17,9 +17,9 @@ from torch import Tensor import pytorch_lightning as pl +from lightning_lite.plugins.environments.slurm_environment import SLURMEnvironment from lightning_lite.utilities.apply_func import move_data_to_device from pytorch_lightning.loggers import Logger, TensorBoardLogger -from pytorch_lightning.plugins.environments.slurm_environment import SLURMEnvironment from pytorch_lightning.trainer.connectors.logger_connector.result import _METRICS, _OUT_DICT, _PBAR_DICT from pytorch_lightning.utilities.metrics import metrics_to_scalars from pytorch_lightning.utilities.model_helpers import is_overridden diff --git a/src/pytorch_lightning/trainer/connectors/signal_connector.py b/src/pytorch_lightning/trainer/connectors/signal_connector.py index 8d7c0bb51a0ce..17e11bfdf649d 100644 --- a/src/pytorch_lightning/trainer/connectors/signal_connector.py +++ b/src/pytorch_lightning/trainer/connectors/signal_connector.py @@ -8,7 +8,7 @@ from typing import Any, Callable, Dict, List, Set, Union import pytorch_lightning as pl -from pytorch_lightning.plugins.environments import SLURMEnvironment +from lightning_lite.plugins.environments import 
SLURMEnvironment from pytorch_lightning.utilities.imports import _fault_tolerant_training, _IS_WINDOWS from pytorch_lightning.utilities.rank_zero import rank_zero_info diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index 1859eab265db2..fc0fc36238ed9 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -427,6 +427,7 @@ def __init__( Trainer._log_api_event("init") log.detail(f"{self.__class__.__name__}: Initializing trainer with parameters: {locals()}") self.state = TrainerState() + self.num_sanity_val_steps: int # init connectors self._data_connector = DataConnector(self, multiple_trainloader_mode) diff --git a/tests/tests_lite/plugins/__init__.py b/tests/tests_lite/plugins/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/tests_lite/plugins/environments/__init__.py b/tests/tests_lite/plugins/environments/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/tests_pytorch/plugins/environments/test_kubeflow_environment.py b/tests/tests_lite/plugins/environments/test_kubeflow_environment.py similarity index 91% rename from tests/tests_pytorch/plugins/environments/test_kubeflow_environment.py rename to tests/tests_lite/plugins/environments/test_kubeflow_environment.py index d1abe1c112eeb..19ad84f124352 100644 --- a/tests/tests_pytorch/plugins/environments/test_kubeflow_environment.py +++ b/tests/tests_lite/plugins/environments/test_kubeflow_environment.py @@ -17,7 +17,7 @@ import pytest -from pytorch_lightning.plugins.environments import KubeflowEnvironment +from lightning_lite.plugins.environments import KubeflowEnvironment @mock.patch.dict(os.environ, {}, clear=True) @@ -61,14 +61,14 @@ def test_attributes_from_environment_variables(caplog): assert env.local_rank() == 0 assert env.node_rank() == 1 # setter should be no-op - with caplog.at_level(logging.DEBUG, logger="pytorch_lightning.plugins.environments"): + with caplog.at_level(logging.DEBUG, logger="lightning_lite.plugins.environments"): env.set_global_rank(100) assert env.global_rank() == 1 assert "setting global rank is not allowed" in caplog.text caplog.clear() - with caplog.at_level(logging.DEBUG, logger="pytorch_lightning.plugins.environments"): + with caplog.at_level(logging.DEBUG, logger="lightning_lite.plugins.environments"): env.set_world_size(100) assert env.world_size() == 20 assert "setting world size is not allowed" in caplog.text diff --git a/tests/tests_pytorch/plugins/environments/test_lightning_environment.py b/tests/tests_lite/plugins/environments/test_lightning_environment.py similarity index 97% rename from tests/tests_pytorch/plugins/environments/test_lightning_environment.py rename to tests/tests_lite/plugins/environments/test_lightning_environment.py index 0ad05e0bfa8eb..eb9d7040e6279 100644 --- a/tests/tests_pytorch/plugins/environments/test_lightning_environment.py +++ b/tests/tests_lite/plugins/environments/test_lightning_environment.py @@ -16,7 +16,7 @@ import pytest -from pytorch_lightning.plugins.environments import LightningEnvironment +from lightning_lite.plugins.environments import LightningEnvironment @mock.patch.dict(os.environ, {}, clear=True) diff --git a/tests/tests_pytorch/plugins/environments/test_lsf_environment.py b/tests/tests_lite/plugins/environments/test_lsf_environment.py similarity index 98% rename from tests/tests_pytorch/plugins/environments/test_lsf_environment.py rename to 
tests/tests_lite/plugins/environments/test_lsf_environment.py index e4b213da47c8d..5987f9e10f392 100644 --- a/tests/tests_pytorch/plugins/environments/test_lsf_environment.py +++ b/tests/tests_lite/plugins/environments/test_lsf_environment.py @@ -16,7 +16,7 @@ import pytest -from pytorch_lightning.plugins.environments import LSFEnvironment +from lightning_lite.plugins.environments import LSFEnvironment def _make_rankfile(tmp_path): diff --git a/tests/tests_pytorch/plugins/environments/test_slurm_environment.py b/tests/tests_lite/plugins/environments/test_slurm_environment.py similarity index 92% rename from tests/tests_pytorch/plugins/environments/test_slurm_environment.py rename to tests/tests_lite/plugins/environments/test_slurm_environment.py index 44b0dd97e354d..805aa8acf80ef 100644 --- a/tests/tests_pytorch/plugins/environments/test_slurm_environment.py +++ b/tests/tests_lite/plugins/environments/test_slurm_environment.py @@ -17,7 +17,7 @@ import pytest -from pytorch_lightning.plugins.environments import SLURMEnvironment +from lightning_lite.plugins.environments import SLURMEnvironment @mock.patch.dict(os.environ, {}, clear=True) @@ -66,14 +66,14 @@ def test_attributes_from_environment_variables(caplog): assert env.node_rank() == 3 assert env.job_name() == "JOB" # setter should be no-op - with caplog.at_level(logging.DEBUG, logger="pytorch_lightning.plugins.environments"): + with caplog.at_level(logging.DEBUG, logger="lightning_lite.plugins.environments"): env.set_global_rank(100) assert env.global_rank() == 1 assert "setting global rank is not allowed" in caplog.text caplog.clear() - with caplog.at_level(logging.DEBUG, logger="pytorch_lightning.plugins.environments"): + with caplog.at_level(logging.DEBUG, logger="lightning_lite.plugins.environments"): env.set_world_size(100) assert env.world_size() == 20 assert "setting world size is not allowed" in caplog.text diff --git a/tests/tests_pytorch/plugins/environments/test_torchelastic_environment.py b/tests/tests_lite/plugins/environments/test_torchelastic_environment.py similarity index 91% rename from tests/tests_pytorch/plugins/environments/test_torchelastic_environment.py rename to tests/tests_lite/plugins/environments/test_torchelastic_environment.py index 10468d7a02a1f..9a28784d2cbde 100644 --- a/tests/tests_pytorch/plugins/environments/test_torchelastic_environment.py +++ b/tests/tests_lite/plugins/environments/test_torchelastic_environment.py @@ -16,9 +16,9 @@ from unittest import mock import pytest +from tests_lite.helpers.runif import RunIf -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from tests_pytorch.helpers.runif import RunIf +from lightning_lite.plugins.environments import TorchElasticEnvironment @mock.patch.dict(os.environ, {}, clear=True) @@ -58,14 +58,14 @@ def test_attributes_from_environment_variables(caplog): assert env.local_rank() == 2 assert env.node_rank() == 3 # setter should be no-op - with caplog.at_level(logging.DEBUG, logger="pytorch_lightning.plugins.environments"): + with caplog.at_level(logging.DEBUG, logger="lightning_lite.plugins.environments"): env.set_global_rank(100) assert env.global_rank() == 1 assert "setting global rank is not allowed" in caplog.text caplog.clear() - with caplog.at_level(logging.DEBUG, logger="pytorch_lightning.plugins.environments"): + with caplog.at_level(logging.DEBUG, logger="lightning_lite.plugins.environments"): env.set_world_size(100) assert env.world_size() == 20 assert "setting world size is not allowed" in caplog.text diff --git 
a/tests/tests_pytorch/plugins/environments/test_xla_environment.py b/tests/tests_lite/plugins/environments/test_xla_environment.py similarity index 87% rename from tests/tests_pytorch/plugins/environments/test_xla_environment.py rename to tests/tests_lite/plugins/environments/test_xla_environment.py index ac1f17bc2dde0..313aab368b2ff 100644 --- a/tests/tests_pytorch/plugins/environments/test_xla_environment.py +++ b/tests/tests_lite/plugins/environments/test_xla_environment.py @@ -16,10 +16,10 @@ import pytest import torch +from tests_lite.helpers.runif import RunIf -import pytorch_lightning as pl -from pytorch_lightning.plugins.environments import XLAEnvironment -from tests_pytorch.helpers.runif import RunIf +import lightning_lite +from lightning_lite.plugins.environments import XLAEnvironment @RunIf(tpu=True) @@ -72,8 +72,8 @@ def test_attributes_from_environment_variables(): def test_detect(monkeypatch): """Test the detection of a xla environment configuration.""" - monkeypatch.setattr(pl.plugins.environments.xla_environment, "_TPU_AVAILABLE", False) + monkeypatch.setattr(lightning_lite.plugins.environments.xla_environment, "_TPU_AVAILABLE", False) assert not XLAEnvironment.detect() - monkeypatch.setattr(pl.plugins.environments.xla_environment, "_TPU_AVAILABLE", True) + monkeypatch.setattr(lightning_lite.plugins.environments.xla_environment, "_TPU_AVAILABLE", True) assert XLAEnvironment.detect() diff --git a/tests/tests_pytorch/conftest.py b/tests/tests_pytorch/conftest.py index 95586af186da3..d3c54a9b15b9e 100644 --- a/tests/tests_pytorch/conftest.py +++ b/tests/tests_pytorch/conftest.py @@ -22,7 +22,7 @@ import pytest import torch.distributed -from pytorch_lightning.plugins.environments.lightning_environment import find_free_network_port +from lightning_lite.plugins.environments.lightning_environment import find_free_network_port from pytorch_lightning.trainer.connectors.signal_connector import SignalConnector from pytorch_lightning.utilities.imports import _IS_WINDOWS from tests_pytorch import _PATH_DATASETS diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-10.py b/tests/tests_pytorch/deprecated_api/test_remove_1-10.py index a48c6a7884083..140009ffcdd4d 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-10.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-10.py @@ -19,13 +19,13 @@ import torch from torch.utils.data import DataLoader +from lightning_lite.plugins.environments import LightningEnvironment from pytorch_lightning import Trainer from pytorch_lightning.core.mixins.device_dtype_mixin import DeviceDtypeModuleMixin from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset from pytorch_lightning.overrides import LightningDistributedModule, LightningParallelModule from pytorch_lightning.overrides.base import unwrap_lightning_module from pytorch_lightning.overrides.fairscale import LightningShardedDataParallel, unwrap_lightning_module_sharded -from pytorch_lightning.plugins.environments import LightningEnvironment from pytorch_lightning.strategies.bagua import LightningBaguaModule from pytorch_lightning.strategies.deepspeed import LightningDeepSpeedModule from pytorch_lightning.strategies.ipu import LightningIPUModule diff --git a/tests/tests_pytorch/lite/test_parity.py b/tests/tests_pytorch/lite/test_parity.py index 0ea65bb49dd56..eaada992da497 100644 --- a/tests/tests_pytorch/lite/test_parity.py +++ b/tests/tests_pytorch/lite/test_parity.py @@ -28,11 +28,11 @@ from torch.utils.data import DataLoader from 
torch.utils.data.distributed import DistributedSampler +from lightning_lite.plugins.environments.lightning_environment import find_free_network_port from lightning_lite.utilities.apply_func import move_data_to_device from lightning_lite.utilities.cloud_io import atomic_save from pytorch_lightning.demos.boring_classes import RandomDataset from pytorch_lightning.lite import LightningLite -from pytorch_lightning.plugins.environments.lightning_environment import find_free_network_port from pytorch_lightning.strategies.ddp_spawn import DDPSpawnStrategy from tests_pytorch.helpers.runif import RunIf diff --git a/tests/tests_pytorch/models/test_amp.py b/tests/tests_pytorch/models/test_amp.py index 822d2e0401619..ce1ed69d504dd 100644 --- a/tests/tests_pytorch/models/test_amp.py +++ b/tests/tests_pytorch/models/test_amp.py @@ -20,9 +20,9 @@ from torch.utils.data import DataLoader import tests_pytorch.helpers.utils as tutils +from lightning_lite.plugins.environments import SLURMEnvironment from pytorch_lightning import Trainer from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset -from pytorch_lightning.plugins.environments import SLURMEnvironment from tests_pytorch.helpers.runif import RunIf diff --git a/tests/tests_pytorch/models/test_gpu.py b/tests/tests_pytorch/models/test_gpu.py index 3951a500b8481..b01377f1be3f5 100644 --- a/tests/tests_pytorch/models/test_gpu.py +++ b/tests/tests_pytorch/models/test_gpu.py @@ -21,11 +21,11 @@ import tests_pytorch.helpers.pipelines as tpipes import tests_pytorch.helpers.utils as tutils +from lightning_lite.plugins.environments import TorchElasticEnvironment from lightning_lite.utilities import device_parser from pytorch_lightning import Trainer from pytorch_lightning.accelerators import CPUAccelerator, CUDAAccelerator from pytorch_lightning.demos.boring_classes import BoringModel -from pytorch_lightning.plugins.environments import TorchElasticEnvironment from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests_pytorch.helpers.datamodules import ClassifDataModule from tests_pytorch.helpers.runif import RunIf diff --git a/tests/tests_pytorch/plugins/test_cluster_integration.py b/tests/tests_pytorch/plugins/test_cluster_integration.py index be8f87d643f9c..4427551e4a206 100644 --- a/tests/tests_pytorch/plugins/test_cluster_integration.py +++ b/tests/tests_pytorch/plugins/test_cluster_integration.py @@ -17,8 +17,8 @@ import pytest import torch +from lightning_lite.plugins.environments import LightningEnvironment, SLURMEnvironment, TorchElasticEnvironment from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments import LightningEnvironment, SLURMEnvironment, TorchElasticEnvironment from pytorch_lightning.strategies import DDPShardedStrategy, DDPStrategy, DeepSpeedStrategy from pytorch_lightning.utilities.rank_zero import rank_zero_only from tests_pytorch.helpers.runif import RunIf diff --git a/tests/tests_pytorch/strategies/test_ddp_strategy.py b/tests/tests_pytorch/strategies/test_ddp_strategy.py index d867339ef63ea..2665eb7c3e370 100644 --- a/tests/tests_pytorch/strategies/test_ddp_strategy.py +++ b/tests/tests_pytorch/strategies/test_ddp_strategy.py @@ -19,10 +19,10 @@ import torch from torch.nn.parallel import DistributedDataParallel +from lightning_lite.plugins.environments import ClusterEnvironment, LightningEnvironment from pytorch_lightning import LightningModule, Trainer from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.overrides.fairscale 
import _FAIRSCALE_AVAILABLE -from pytorch_lightning.plugins.environments import ClusterEnvironment, LightningEnvironment from pytorch_lightning.strategies import DDPStrategy from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_10 diff --git a/tests/tests_pytorch/test_cli.py b/tests/tests_pytorch/test_cli.py index 4d6a609a00a3c..f9ca39cb883bc 100644 --- a/tests/tests_pytorch/test_cli.py +++ b/tests/tests_pytorch/test_cli.py @@ -29,6 +29,7 @@ from torch.optim import SGD from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR +from lightning_lite.plugins.environments import SLURMEnvironment from pytorch_lightning import __version__, Callback, LightningDataModule, LightningModule, seed_everything, Trainer from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint from pytorch_lightning.cli import ( @@ -43,7 +44,6 @@ from pytorch_lightning.loggers import _COMET_AVAILABLE, TensorBoardLogger from pytorch_lightning.loggers.neptune import _NEPTUNE_AVAILABLE from pytorch_lightning.loggers.wandb import _WANDB_AVAILABLE -from pytorch_lightning.plugins.environments import SLURMEnvironment from pytorch_lightning.strategies import DDPStrategy from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities import _TPU_AVAILABLE @@ -341,7 +341,7 @@ def fit(self, **_): def test_lightning_cli_args_cluster_environments(tmpdir): - plugins = [dict(class_path="pytorch_lightning.plugins.environments.SLURMEnvironment")] + plugins = [dict(class_path="lightning_lite.plugins.environments.SLURMEnvironment")] class TestModel(BoringModel): def on_fit_start(self): diff --git a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py index 6625f191c3190..1a420e9269fae 100644 --- a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py @@ -21,18 +21,18 @@ import torch.distributed import pytorch_lightning +from lightning_lite.plugins.environments import ( + KubeflowEnvironment, + LightningEnvironment, + SLURMEnvironment, + TorchElasticEnvironment, +) from pytorch_lightning import Trainer from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.accelerators.cuda import CUDAAccelerator from pytorch_lightning.accelerators.mps import MPSAccelerator from pytorch_lightning.plugins import DoublePrecisionPlugin, LayerSync, NativeSyncBatchNorm, PrecisionPlugin -from pytorch_lightning.plugins.environments import ( - KubeflowEnvironment, - LightningEnvironment, - SLURMEnvironment, - TorchElasticEnvironment, -) from pytorch_lightning.plugins.io import TorchCheckpointIO from pytorch_lightning.strategies import ( DataParallelStrategy, diff --git a/tests/tests_pytorch/trainer/connectors/test_signal_connector.py b/tests/tests_pytorch/trainer/connectors/test_signal_connector.py index d83faa3cec6aa..4f8bee8398edc 100644 --- a/tests/tests_pytorch/trainer/connectors/test_signal_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_signal_connector.py @@ -19,9 +19,9 @@ import pytest +from lightning_lite.plugins.environments import SLURMEnvironment from pytorch_lightning import Trainer from pytorch_lightning.demos.boring_classes import BoringModel -from pytorch_lightning.plugins.environments import SLURMEnvironment from 
pytorch_lightning.trainer.connectors.signal_connector import SignalConnector from pytorch_lightning.utilities.exceptions import ExitGracefullyException from tests_pytorch.helpers.runif import RunIf From 463439e6247568000f2049eda9d77b19945b409f Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 7 Sep 2022 16:15:47 +0200 Subject: [PATCH 098/193] Move checkpoint io plugins from pl/plugins/io to lite/plugins/io (#14519) --- src/lightning_lite/plugins/__init__.py | 26 ++++++++++++++++++- src/lightning_lite/plugins/io/__init__.py | 18 +++++++++++++ .../plugins/io/checkpoint_plugin.py | 0 .../plugins/io/torch_plugin.py | 4 +-- .../plugins/io/xla_plugin.py | 2 +- src/pytorch_lightning/plugins/__init__.py | 6 ++--- src/pytorch_lightning/plugins/io/__init__.py | 6 ++--- .../plugins/io/async_plugin.py | 2 +- .../plugins/io/hpu_plugin.py | 2 +- src/pytorch_lightning/plugins/io/wrapper.py | 2 +- src/pytorch_lightning/strategies/bagua.py | 2 +- src/pytorch_lightning/strategies/ddp.py | 2 +- src/pytorch_lightning/strategies/ddp_spawn.py | 2 +- src/pytorch_lightning/strategies/dp.py | 2 +- .../strategies/fully_sharded.py | 2 +- .../strategies/fully_sharded_native.py | 2 +- src/pytorch_lightning/strategies/horovod.py | 2 +- .../strategies/hpu_parallel.py | 2 +- src/pytorch_lightning/strategies/ipu.py | 2 +- src/pytorch_lightning/strategies/parallel.py | 2 +- .../strategies/single_device.py | 2 +- .../strategies/single_hpu.py | 2 +- .../strategies/single_tpu.py | 4 +-- src/pytorch_lightning/strategies/strategy.py | 2 +- src/pytorch_lightning/strategies/tpu_spawn.py | 4 +-- tests/tests_pytorch/accelerators/test_cpu.py | 2 +- .../checkpointing/test_trainer_checkpoint.py | 4 +-- .../plugins/test_checkpoint_io_plugin.py | 4 +-- 28 files changed, 77 insertions(+), 35 deletions(-) create mode 100644 src/lightning_lite/plugins/io/__init__.py rename src/{pytorch_lightning => lightning_lite}/plugins/io/checkpoint_plugin.py (100%) rename src/{pytorch_lightning => lightning_lite}/plugins/io/torch_plugin.py (96%) rename src/{pytorch_lightning => lightning_lite}/plugins/io/xla_plugin.py (97%) diff --git a/src/lightning_lite/plugins/__init__.py b/src/lightning_lite/plugins/__init__.py index e1c69ab16df2c..ff7d31fbe7a91 100644 --- a/src/lightning_lite/plugins/__init__.py +++ b/src/lightning_lite/plugins/__init__.py @@ -1 +1,25 @@ -from lightning_lite.plugins.environments import ClusterEnvironment # noqa: F401 +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from lightning_lite.plugins.environments import ClusterEnvironment +from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO +from lightning_lite.plugins.io.torch_plugin import TorchCheckpointIO +from lightning_lite.plugins.io.xla_plugin import XLACheckpointIO + +__all__ = [ + "ClusterEnvironment", + "CheckpointIO", + "TorchCheckpointIO", + "XLACheckpointIO", +] diff --git a/src/lightning_lite/plugins/io/__init__.py b/src/lightning_lite/plugins/io/__init__.py new file mode 100644 index 0000000000000..835150856f128 --- /dev/null +++ b/src/lightning_lite/plugins/io/__init__.py @@ -0,0 +1,18 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO +from lightning_lite.plugins.io.torch_plugin import TorchCheckpointIO +from lightning_lite.plugins.io.xla_plugin import XLACheckpointIO + +__all__ = ["CheckpointIO", "TorchCheckpointIO", "XLACheckpointIO"] diff --git a/src/pytorch_lightning/plugins/io/checkpoint_plugin.py b/src/lightning_lite/plugins/io/checkpoint_plugin.py similarity index 100% rename from src/pytorch_lightning/plugins/io/checkpoint_plugin.py rename to src/lightning_lite/plugins/io/checkpoint_plugin.py diff --git a/src/pytorch_lightning/plugins/io/torch_plugin.py b/src/lightning_lite/plugins/io/torch_plugin.py similarity index 96% rename from src/pytorch_lightning/plugins/io/torch_plugin.py rename to src/lightning_lite/plugins/io/torch_plugin.py index 723900864c517..f0e9ca3d11e54 100644 --- a/src/pytorch_lightning/plugins/io/torch_plugin.py +++ b/src/lightning_lite/plugins/io/torch_plugin.py @@ -16,11 +16,11 @@ from typing import Any, Callable, Dict, Optional import pytorch_lightning as pl +from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO from lightning_lite.utilities.cloud_io import atomic_save, get_filesystem from lightning_lite.utilities.cloud_io import load as pl_load +from lightning_lite.utilities.rank_zero import rank_zero_warn from lightning_lite.utilities.types import _PATH -from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO -from pytorch_lightning.utilities.rank_zero import rank_zero_warn log = logging.getLogger(__name__) diff --git a/src/pytorch_lightning/plugins/io/xla_plugin.py b/src/lightning_lite/plugins/io/xla_plugin.py similarity index 97% rename from src/pytorch_lightning/plugins/io/xla_plugin.py rename to src/lightning_lite/plugins/io/xla_plugin.py index 88d8c2bcb7481..222b50aca5618 100644 --- a/src/pytorch_lightning/plugins/io/xla_plugin.py +++ b/src/lightning_lite/plugins/io/xla_plugin.py @@ -16,9 +16,9 @@ from lightning_utilities.core.apply_func import apply_to_collection +from lightning_lite.plugins.io.torch_plugin import TorchCheckpointIO from lightning_lite.utilities.cloud_io import get_filesystem from lightning_lite.utilities.types import _PATH -from pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE, _TPU_AVAILABLE if 
_TPU_AVAILABLE: diff --git a/src/pytorch_lightning/plugins/__init__.py b/src/pytorch_lightning/plugins/__init__.py index 261ff363897be..42d5f85299777 100644 --- a/src/pytorch_lightning/plugins/__init__.py +++ b/src/pytorch_lightning/plugins/__init__.py @@ -1,11 +1,11 @@ from typing import Union from lightning_lite.plugins.environments import ClusterEnvironment +from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO +from lightning_lite.plugins.io.torch_plugin import TorchCheckpointIO +from lightning_lite.plugins.io.xla_plugin import XLACheckpointIO from pytorch_lightning.plugins.io.async_plugin import AsyncCheckpointIO -from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.io.hpu_plugin import HPUCheckpointIO -from pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO -from pytorch_lightning.plugins.io.xla_plugin import XLACheckpointIO from pytorch_lightning.plugins.layer_sync import LayerSync, NativeSyncBatchNorm from pytorch_lightning.plugins.precision.apex_amp import ApexMixedPrecisionPlugin from pytorch_lightning.plugins.precision.deepspeed import DeepSpeedPrecisionPlugin diff --git a/src/pytorch_lightning/plugins/io/__init__.py b/src/pytorch_lightning/plugins/io/__init__.py index 19a556bddf29c..8897391bc0ce3 100644 --- a/src/pytorch_lightning/plugins/io/__init__.py +++ b/src/pytorch_lightning/plugins/io/__init__.py @@ -11,10 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO +from lightning_lite.plugins.io.torch_plugin import TorchCheckpointIO +from lightning_lite.plugins.io.xla_plugin import XLACheckpointIO from pytorch_lightning.plugins.io.async_plugin import AsyncCheckpointIO -from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.io.hpu_plugin import HPUCheckpointIO -from pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO -from pytorch_lightning.plugins.io.xla_plugin import XLACheckpointIO __all__ = ["AsyncCheckpointIO", "CheckpointIO", "HPUCheckpointIO", "TorchCheckpointIO", "XLACheckpointIO"] diff --git a/src/pytorch_lightning/plugins/io/async_plugin.py b/src/pytorch_lightning/plugins/io/async_plugin.py index 1146bc373a4ac..88455988f1201 100644 --- a/src/pytorch_lightning/plugins/io/async_plugin.py +++ b/src/pytorch_lightning/plugins/io/async_plugin.py @@ -15,7 +15,7 @@ from concurrent.futures import ThreadPoolExecutor from typing import Any, Optional -from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO +from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO diff --git a/src/pytorch_lightning/plugins/io/hpu_plugin.py b/src/pytorch_lightning/plugins/io/hpu_plugin.py index 9fb564cda7237..47662112c2733 100644 --- a/src/pytorch_lightning/plugins/io/hpu_plugin.py +++ b/src/pytorch_lightning/plugins/io/hpu_plugin.py @@ -17,10 +17,10 @@ import torch +from lightning_lite.plugins.io.torch_plugin import TorchCheckpointIO from lightning_lite.utilities.apply_func import move_data_to_device from lightning_lite.utilities.cloud_io import atomic_save, get_filesystem from lightning_lite.utilities.types import _PATH -from pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO class HPUCheckpointIO(TorchCheckpointIO): diff --git 
a/src/pytorch_lightning/plugins/io/wrapper.py b/src/pytorch_lightning/plugins/io/wrapper.py index eb46990deffdf..bf54db6570d26 100644 --- a/src/pytorch_lightning/plugins/io/wrapper.py +++ b/src/pytorch_lightning/plugins/io/wrapper.py @@ -13,7 +13,7 @@ # limitations under the License. from typing import Any, Dict, Optional -from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO +from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO class _WrappingCheckpointIO(CheckpointIO): diff --git a/src/pytorch_lightning/strategies/bagua.py b/src/pytorch_lightning/strategies/bagua.py index 77e832233d968..6e861bac0094d 100644 --- a/src/pytorch_lightning/strategies/bagua.py +++ b/src/pytorch_lightning/strategies/bagua.py @@ -9,11 +9,11 @@ import pytorch_lightning as pl from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment +from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO from lightning_lite.utilities.distributed import ReduceOp from lightning_lite.utilities.optimizer import optimizers_to_device from lightning_lite.utilities.seed import reset_seed from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase -from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.strategies.strategy import TBroadcast diff --git a/src/pytorch_lightning/strategies/ddp.py b/src/pytorch_lightning/strategies/ddp.py index 768f12ae23332..d197aa7979a0a 100644 --- a/src/pytorch_lightning/strategies/ddp.py +++ b/src/pytorch_lightning/strategies/ddp.py @@ -30,6 +30,7 @@ import pytorch_lightning as pl from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment +from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO from lightning_lite.utilities.distributed import ( _get_process_group_backend_from_env, distributed_available, @@ -44,7 +45,6 @@ from pytorch_lightning.overrides.base import _LightningPrecisionModuleWrapperBase from pytorch_lightning.overrides.distributed import prepare_for_backward from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE -from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.launchers.subprocess_script import _SubprocessScriptLauncher from pytorch_lightning.strategies.parallel import ParallelStrategy diff --git a/src/pytorch_lightning/strategies/ddp_spawn.py b/src/pytorch_lightning/strategies/ddp_spawn.py index 74d8f7c3f4c19..092f90009bbfd 100644 --- a/src/pytorch_lightning/strategies/ddp_spawn.py +++ b/src/pytorch_lightning/strategies/ddp_spawn.py @@ -25,6 +25,7 @@ import pytorch_lightning as pl from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment +from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO from lightning_lite.utilities.distributed import ( _get_process_group_backend_from_env, distributed_available, @@ -36,7 +37,6 @@ from pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.overrides.base import _LightningPrecisionModuleWrapperBase from pytorch_lightning.overrides.distributed import prepare_for_backward -from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from 
pytorch_lightning.strategies.launchers.multiprocessing import _MultiProcessingLauncher from pytorch_lightning.strategies.parallel import ParallelStrategy diff --git a/src/pytorch_lightning/strategies/dp.py b/src/pytorch_lightning/strategies/dp.py index 1724f0021db63..9a1261eeca0ad 100644 --- a/src/pytorch_lightning/strategies/dp.py +++ b/src/pytorch_lightning/strategies/dp.py @@ -19,10 +19,10 @@ from torch.nn import DataParallel, Module import pytorch_lightning as pl +from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO from lightning_lite.utilities.distributed import ReduceOp from pytorch_lightning.overrides.base import _LightningPrecisionModuleWrapperBase from pytorch_lightning.overrides.data_parallel import LightningParallelModule -from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.parallel import ParallelStrategy from pytorch_lightning.strategies.strategy import TBroadcast, TReduce diff --git a/src/pytorch_lightning/strategies/fully_sharded.py b/src/pytorch_lightning/strategies/fully_sharded.py index 5cfad516fb444..6979741d92be7 100644 --- a/src/pytorch_lightning/strategies/fully_sharded.py +++ b/src/pytorch_lightning/strategies/fully_sharded.py @@ -19,11 +19,11 @@ import pytorch_lightning as pl from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment +from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO from lightning_lite.utilities.enums import PrecisionType from lightning_lite.utilities.optimizer import optimizers_to_device from pytorch_lightning.overrides.base import _LightningModuleWrapperBase from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE -from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.trainer.states import TrainerFn diff --git a/src/pytorch_lightning/strategies/fully_sharded_native.py b/src/pytorch_lightning/strategies/fully_sharded_native.py index 243cfcac81529..09b4113adc419 100644 --- a/src/pytorch_lightning/strategies/fully_sharded_native.py +++ b/src/pytorch_lightning/strategies/fully_sharded_native.py @@ -20,6 +20,7 @@ import pytorch_lightning as pl from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment +from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO from lightning_lite.utilities.distributed import ( _get_process_group_backend_from_env, get_default_process_group_backend_for_device, @@ -29,7 +30,6 @@ from lightning_lite.utilities.optimizer import optimizers_to_device from lightning_lite.utilities.seed import reset_seed from pytorch_lightning.overrides.base import _LightningModuleWrapperBase -from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.plugins.precision.fsdp_native_native_amp import FullyShardedNativeNativeMixedPrecisionPlugin from pytorch_lightning.strategies.launchers.subprocess_script import _SubprocessScriptLauncher diff --git a/src/pytorch_lightning/strategies/horovod.py b/src/pytorch_lightning/strategies/horovod.py index 27793306fb28e..8b35754a78822 100644 --- a/src/pytorch_lightning/strategies/horovod.py +++ b/src/pytorch_lightning/strategies/horovod.py @@ -20,11 +20,11 @@ from torch.optim import Optimizer import pytorch_lightning as 
pl +from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO from lightning_lite.utilities.distributed import distributed_available from lightning_lite.utilities.distributed import group as dist_group from lightning_lite.utilities.distributed import ReduceOp from pytorch_lightning.core.optimizer import LightningOptimizer -from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.parallel import ParallelStrategy from pytorch_lightning.strategies.strategy import TBroadcast diff --git a/src/pytorch_lightning/strategies/hpu_parallel.py b/src/pytorch_lightning/strategies/hpu_parallel.py index 9bb3a2b4716fd..96c66224ed72b 100644 --- a/src/pytorch_lightning/strategies/hpu_parallel.py +++ b/src/pytorch_lightning/strategies/hpu_parallel.py @@ -19,10 +19,10 @@ import pytorch_lightning as pl from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment +from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO from lightning_lite.utilities.distributed import group as _group from pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.overrides.torch_distributed import broadcast_object_list -from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.io.hpu_plugin import HPUCheckpointIO from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin diff --git a/src/pytorch_lightning/strategies/ipu.py b/src/pytorch_lightning/strategies/ipu.py index 7560a303a7949..64898e6c76251 100644 --- a/src/pytorch_lightning/strategies/ipu.py +++ b/src/pytorch_lightning/strategies/ipu.py @@ -22,10 +22,10 @@ import pytorch_lightning as pl from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment +from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO from lightning_lite.utilities.cloud_io import get_filesystem from lightning_lite.utilities.enums import PrecisionType from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase -from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.parallel import ParallelStrategy from pytorch_lightning.strategies.strategy import TBroadcast diff --git a/src/pytorch_lightning/strategies/parallel.py b/src/pytorch_lightning/strategies/parallel.py index e2b15fefe1a50..3d9f6a5dd3bdd 100644 --- a/src/pytorch_lightning/strategies/parallel.py +++ b/src/pytorch_lightning/strategies/parallel.py @@ -20,6 +20,7 @@ import pytorch_lightning as pl from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment +from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO from lightning_lite.utilities.distributed import ( _get_process_group_backend_from_env, all_gather_ddp_if_available, @@ -27,7 +28,6 @@ ReduceOp, ) from pytorch_lightning.plugins import LayerSync -from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.strategy import Strategy from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation diff --git a/src/pytorch_lightning/strategies/single_device.py b/src/pytorch_lightning/strategies/single_device.py index 
a9d5d7ca87fd0..cf1d47c5c95fc 100644 --- a/src/pytorch_lightning/strategies/single_device.py +++ b/src/pytorch_lightning/strategies/single_device.py @@ -19,8 +19,8 @@ from torch import Tensor import pytorch_lightning as pl +from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO from lightning_lite.utilities.types import _DEVICE -from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.strategy import Strategy, TBroadcast diff --git a/src/pytorch_lightning/strategies/single_hpu.py b/src/pytorch_lightning/strategies/single_hpu.py index 5c29829fa6ce9..1e91150cded22 100644 --- a/src/pytorch_lightning/strategies/single_hpu.py +++ b/src/pytorch_lightning/strategies/single_hpu.py @@ -15,8 +15,8 @@ from typing import Dict, Optional import pytorch_lightning as pl +from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO from lightning_lite.utilities.types import _DEVICE -from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.io.hpu_plugin import HPUCheckpointIO from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin diff --git a/src/pytorch_lightning/strategies/single_tpu.py b/src/pytorch_lightning/strategies/single_tpu.py index 3084f17430338..62c2c4e7222cc 100644 --- a/src/pytorch_lightning/strategies/single_tpu.py +++ b/src/pytorch_lightning/strategies/single_tpu.py @@ -15,9 +15,9 @@ from typing import Dict, Optional import pytorch_lightning as pl -from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO +from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO +from lightning_lite.plugins.io.xla_plugin import XLACheckpointIO from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO -from pytorch_lightning.plugins.io.xla_plugin import XLACheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.single_device import SingleDeviceStrategy from pytorch_lightning.utilities import _TPU_AVAILABLE, find_shared_parameters, set_shared_parameters diff --git a/src/pytorch_lightning/strategies/strategy.py b/src/pytorch_lightning/strategies/strategy.py index bb63c602690d4..0f73b7b24e675 100644 --- a/src/pytorch_lightning/strategies/strategy.py +++ b/src/pytorch_lightning/strategies/strategy.py @@ -23,13 +23,13 @@ from torch.utils.data import DataLoader import pytorch_lightning as pl +from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO from lightning_lite.utilities.apply_func import move_data_to_device from lightning_lite.utilities.distributed import ReduceOp from lightning_lite.utilities.optimizer import optimizer_to_device, optimizers_to_device from lightning_lite.utilities.types import _PATH from pytorch_lightning.core.optimizer import _init_optimizers_and_lr_schedulers, LightningOptimizer from pytorch_lightning.plugins import TorchCheckpointIO -from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.launchers.base import _Launcher diff --git a/src/pytorch_lightning/strategies/tpu_spawn.py b/src/pytorch_lightning/strategies/tpu_spawn.py index a84e703bbf1f9..d982221b89f85 100644 --- a/src/pytorch_lightning/strategies/tpu_spawn.py +++ 
b/src/pytorch_lightning/strategies/tpu_spawn.py @@ -23,14 +23,14 @@ import pytorch_lightning as pl from lightning_lite.plugins.environments import XLAEnvironment +from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO +from lightning_lite.plugins.io.xla_plugin import XLACheckpointIO from lightning_lite.utilities.data import has_len from lightning_lite.utilities.distributed import ReduceOp from lightning_lite.utilities.optimizer import optimizers_to_device from lightning_lite.utilities.types import _PATH from pytorch_lightning.overrides import LightningDistributedModule -from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO -from pytorch_lightning.plugins.io.xla_plugin import XLACheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.ddp_spawn import DDPSpawnStrategy from pytorch_lightning.strategies.launchers.xla import _XLALauncher diff --git a/tests/tests_pytorch/accelerators/test_cpu.py b/tests/tests_pytorch/accelerators/test_cpu.py index 5d051a6b15322..4453b7add086f 100644 --- a/tests/tests_pytorch/accelerators/test_cpu.py +++ b/tests/tests_pytorch/accelerators/test_cpu.py @@ -6,10 +6,10 @@ import torch import pytorch_lightning as pl +from lightning_lite.plugins.io.torch_plugin import TorchCheckpointIO from pytorch_lightning import Trainer from pytorch_lightning.accelerators import CPUAccelerator from pytorch_lightning.demos.boring_classes import BoringModel -from pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin from pytorch_lightning.strategies import SingleDeviceStrategy from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/tests/tests_pytorch/checkpointing/test_trainer_checkpoint.py b/tests/tests_pytorch/checkpointing/test_trainer_checkpoint.py index b3eb1627be511..e781e09163abb 100644 --- a/tests/tests_pytorch/checkpointing/test_trainer_checkpoint.py +++ b/tests/tests_pytorch/checkpointing/test_trainer_checkpoint.py @@ -19,11 +19,11 @@ import torch import pytorch_lightning as pl +from lightning_lite.plugins.io.torch_plugin import TorchCheckpointIO +from lightning_lite.plugins.io.xla_plugin import XLACheckpointIO from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.demos.boring_classes import BoringModel -from pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO -from pytorch_lightning.plugins.io.xla_plugin import XLACheckpointIO def test_finetuning_with_ckpt_path(tmpdir): diff --git a/tests/tests_pytorch/plugins/test_checkpoint_io_plugin.py b/tests/tests_pytorch/plugins/test_checkpoint_io_plugin.py index 21a94d33bbb9d..39ec5ebb16b2a 100644 --- a/tests/tests_pytorch/plugins/test_checkpoint_io_plugin.py +++ b/tests/tests_pytorch/plugins/test_checkpoint_io_plugin.py @@ -18,13 +18,13 @@ import torch +from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO +from lightning_lite.plugins.io.torch_plugin import TorchCheckpointIO from lightning_lite.utilities.types import _PATH from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.plugins.io.async_plugin import AsyncCheckpointIO -from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO -from 
pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO from pytorch_lightning.strategies import SingleDeviceStrategy From cbbd1480890cdd08faf12a52c7ad5652ee00a8c8 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 12 Sep 2022 13:19:32 +0200 Subject: [PATCH 099/193] Add back-compatibility for checkpoint io plugins in pl/plugins/io (#14519) --- src/lightning_lite/plugins/io/torch_plugin.py | 6 +++--- src/lightning_lite/plugins/io/xla_plugin.py | 2 +- .../plugins/io/checkpoint_plugin.py | 16 ++++++++++++++++ src/pytorch_lightning/plugins/io/torch_plugin.py | 16 ++++++++++++++++ src/pytorch_lightning/plugins/io/xla_plugin.py | 16 ++++++++++++++++ .../checkpointing/test_trainer_checkpoint.py | 2 +- .../deprecated_api/test_remove_1-10.py | 2 +- 7 files changed, 54 insertions(+), 6 deletions(-) create mode 100644 src/pytorch_lightning/plugins/io/checkpoint_plugin.py create mode 100644 src/pytorch_lightning/plugins/io/torch_plugin.py create mode 100644 src/pytorch_lightning/plugins/io/xla_plugin.py diff --git a/src/lightning_lite/plugins/io/torch_plugin.py b/src/lightning_lite/plugins/io/torch_plugin.py index f0e9ca3d11e54..7c246b00a80e1 100644 --- a/src/lightning_lite/plugins/io/torch_plugin.py +++ b/src/lightning_lite/plugins/io/torch_plugin.py @@ -15,7 +15,6 @@ import os from typing import Any, Callable, Dict, Optional -import pytorch_lightning as pl from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO from lightning_lite.utilities.cloud_io import atomic_save, get_filesystem from lightning_lite.utilities.cloud_io import load as pl_load @@ -53,9 +52,10 @@ def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_optio # write the checkpoint dictionary on the file atomic_save(checkpoint, path) except AttributeError as err: - # todo (sean): is this try catch necessary still? + # todo: is this try catch necessary still? # https://github.com/Lightning-AI/lightning/pull/431 - key = pl.LightningModule.CHECKPOINT_HYPER_PARAMS_KEY + # TODO(lite): Lite doesn't support hyperparameters in the checkpoint, so this should be refactored + key = "hyper_parameters" checkpoint.pop(key, None) rank_zero_warn(f"Warning, `{key}` dropped from checkpoint. An attribute is not picklable: {err}") atomic_save(checkpoint, path) diff --git a/src/lightning_lite/plugins/io/xla_plugin.py b/src/lightning_lite/plugins/io/xla_plugin.py index 222b50aca5618..1b97736d8f71d 100644 --- a/src/lightning_lite/plugins/io/xla_plugin.py +++ b/src/lightning_lite/plugins/io/xla_plugin.py @@ -18,8 +18,8 @@ from lightning_lite.plugins.io.torch_plugin import TorchCheckpointIO from lightning_lite.utilities.cloud_io import get_filesystem +from lightning_lite.utilities.imports import _OMEGACONF_AVAILABLE, _TPU_AVAILABLE from lightning_lite.utilities.types import _PATH -from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE, _TPU_AVAILABLE if _TPU_AVAILABLE: import torch_xla.core.xla_model as xm diff --git a/src/pytorch_lightning/plugins/io/checkpoint_plugin.py b/src/pytorch_lightning/plugins/io/checkpoint_plugin.py new file mode 100644 index 0000000000000..92253ddb4bebc --- /dev/null +++ b/src/pytorch_lightning/plugins/io/checkpoint_plugin.py @@ -0,0 +1,16 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# For backward-compatibility +from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO # noqa: F401 diff --git a/src/pytorch_lightning/plugins/io/torch_plugin.py b/src/pytorch_lightning/plugins/io/torch_plugin.py new file mode 100644 index 0000000000000..c01975cd3913a --- /dev/null +++ b/src/pytorch_lightning/plugins/io/torch_plugin.py @@ -0,0 +1,16 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# For backward-compatibility +from lightning_lite.plugins.io.torch_plugin import TorchCheckpointIO # noqa: F401 diff --git a/src/pytorch_lightning/plugins/io/xla_plugin.py b/src/pytorch_lightning/plugins/io/xla_plugin.py new file mode 100644 index 0000000000000..4d65ef3d473e8 --- /dev/null +++ b/src/pytorch_lightning/plugins/io/xla_plugin.py @@ -0,0 +1,16 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# For backward-compatibility +from lightning_lite.plugins.io.xla_plugin import XLACheckpointIO # noqa: F401 diff --git a/tests/tests_pytorch/checkpointing/test_trainer_checkpoint.py b/tests/tests_pytorch/checkpointing/test_trainer_checkpoint.py index e781e09163abb..54f2c11983c85 100644 --- a/tests/tests_pytorch/checkpointing/test_trainer_checkpoint.py +++ b/tests/tests_pytorch/checkpointing/test_trainer_checkpoint.py @@ -90,7 +90,7 @@ def test_trainer_save_checkpoint_storage_options(tmpdir): instance_path = tmpdir + "/path.ckpt" instance_storage_options = "my instance storage options" - with mock.patch("pytorch_lightning.plugins.io.torch_plugin.TorchCheckpointIO.save_checkpoint") as io_mock: + with mock.patch("lightning_lite.plugins.io.torch_plugin.TorchCheckpointIO.save_checkpoint") as io_mock: trainer.save_checkpoint(instance_path, storage_options=instance_storage_options) io_mock.assert_called_with(ANY, instance_path, storage_options=instance_storage_options) trainer.save_checkpoint(instance_path) diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-10.py b/tests/tests_pytorch/deprecated_api/test_remove_1-10.py index 140009ffcdd4d..a48c6a7884083 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-10.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-10.py @@ -19,13 +19,13 @@ import torch from torch.utils.data import DataLoader -from lightning_lite.plugins.environments import LightningEnvironment from pytorch_lightning import Trainer from pytorch_lightning.core.mixins.device_dtype_mixin import DeviceDtypeModuleMixin from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset from pytorch_lightning.overrides import LightningDistributedModule, LightningParallelModule from pytorch_lightning.overrides.base import unwrap_lightning_module from pytorch_lightning.overrides.fairscale import LightningShardedDataParallel, unwrap_lightning_module_sharded +from pytorch_lightning.plugins.environments import LightningEnvironment from pytorch_lightning.strategies.bagua import LightningBaguaModule from pytorch_lightning.strategies.deepspeed import LightningDeepSpeedModule from pytorch_lightning.strategies.ipu import LightningIPUModule From 199c2ae89f24577b4a14f87ba06c32644cd78175 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 12 Sep 2022 15:12:30 +0200 Subject: [PATCH 100/193] Drop duplicate docs requirements (#14644) Delete base.txt Co-authored-by: thomas chaton --- requirements/docs/base.txt | 13 ------------- 1 file changed, 13 deletions(-) delete mode 100644 requirements/docs/base.txt diff --git a/requirements/docs/base.txt b/requirements/docs/base.txt deleted file mode 100644 index 1b00471602c60..0000000000000 --- a/requirements/docs/base.txt +++ /dev/null @@ -1,13 +0,0 @@ -sphinx>=4.0, <5.0 -myst-parser>=0.15, <0.17 -nbsphinx>=0.8.5, <=0.8.9 -pandoc>=1.0, <=2.2 -docutils>=0.16, <0.19 -sphinxcontrib-fulltoc>=1.0, <=1.2.0 -sphinxcontrib-mockautodoc -sphinx-autodoc-typehints>=1.11, <1.15 # strict; v1.15 failing on master (#11405) -sphinx-paramlinks>=0.5.1, <=0.5.4 -sphinx-togglebutton>=0.2, <=0.3.2 -sphinx-copybutton>=0.3, <=0.5.0 -sphinx-multiproject -jinja2>=3.0.0,<3.1.0 From e859546b96b25f13dab70533c32e24f8dd6be21a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Mon, 12 Sep 2022 15:16:57 +0200 Subject: [PATCH 101/193] Integrate lightning_utilities `is_overridden` (#14620) --- src/lightning_app/utilities/app_helpers.py | 21 +------- src/pytorch_lightning/CHANGELOG.md | 3 +- .../utilities/model_helpers.py | 26 +--------- 
tests/tests_app/utilities/test_app_helpers.py | 26 +--------- .../utilities/test_model_helpers.py | 51 +------------------ 5 files changed, 10 insertions(+), 117 deletions(-) diff --git a/src/lightning_app/utilities/app_helpers.py b/src/lightning_app/utilities/app_helpers.py index faa612bba1998..0cc2d84ea58b7 100644 --- a/src/lightning_app/utilities/app_helpers.py +++ b/src/lightning_app/utilities/app_helpers.py @@ -12,7 +12,6 @@ from copy import deepcopy from dataclasses import dataclass, field from typing import Any, Callable, Dict, Generator, List, Mapping, Optional, Tuple, Type, TYPE_CHECKING -from unittest.mock import Mock import websockets from deepdiff import Delta @@ -231,12 +230,9 @@ def render_non_authorized(self): pass -# Adapted from -# https://github.com/Lightning-AI/pytorch-lightning/blob/master/pytorch_lightning/utilities/model_helpers.py#L21 def is_overridden(method_name: str, instance: Optional[object] = None, parent: Optional[Type[object]] = None) -> bool: if instance is None: return False - if parent is None: if isinstance(instance, lightning_app.LightningFlow): parent = lightning_app.LightningFlow @@ -244,22 +240,9 @@ def is_overridden(method_name: str, instance: Optional[object] = None, parent: O parent = lightning_app.LightningWork if parent is None: raise ValueError("Expected a parent") + from lightning_utilities.core.overrides import is_overridden - instance_attr = getattr(instance, method_name, None) - if instance_attr is None: - return False - # `Mock(wraps=...)` support - if isinstance(instance_attr, Mock): - # access the wrapped function - instance_attr = instance_attr._mock_wraps - if instance_attr is None: - return False - - parent_attr = getattr(parent, method_name, None) - if parent_attr is None: - raise ValueError("The parent should define the method") - - return instance_attr.__code__ != parent_attr.__code__ + return is_overridden(method_name, instance, parent) def _is_json_serializable(x: Any) -> bool: diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index ecf1ce319aa13..c28f9bd1498e4 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -29,7 +29,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). [#14537](https://github.com/Lightning-AI/lightning/issues/14537), [#14556](https://github.com/Lightning-AI/lightning/issues/14556), [#14558](https://github.com/Lightning-AI/lightning/issues/14558), - [#14575](https://github.com/Lightning-AI/lightning/issues/14575)) + [#14575](https://github.com/Lightning-AI/lightning/issues/14575), + [#14620](https://github.com/Lightning-AI/lightning/issues/14620)) ### Changed diff --git a/src/pytorch_lightning/utilities/model_helpers.py b/src/pytorch_lightning/utilities/model_helpers.py index b72e9320b364e..679ba029c1b4d 100644 --- a/src/pytorch_lightning/utilities/model_helpers.py +++ b/src/pytorch_lightning/utilities/model_helpers.py @@ -11,9 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from functools import partial from typing import Any, Optional, Type -from unittest.mock import Mock from lightning_utilities.core.imports import RequirementCache from torch import nn @@ -25,7 +23,6 @@ def is_overridden(method_name: str, instance: Optional[object] = None, parent: O if instance is None: # if `self.lightning_module` was passed as instance, it can be `None` return False - if parent is None: if isinstance(instance, pl.LightningModule): parent = pl.LightningModule @@ -35,28 +32,9 @@ def is_overridden(method_name: str, instance: Optional[object] = None, parent: O parent = pl.Callback if parent is None: raise ValueError("Expected a parent") + from lightning_utilities.core.overrides import is_overridden - instance_attr = getattr(instance, method_name, None) - if instance_attr is None: - return False - # `functools.wraps()` support - if hasattr(instance_attr, "__wrapped__"): - instance_attr = instance_attr.__wrapped__ - # `Mock(wraps=...)` support - if isinstance(instance_attr, Mock): - # access the wrapped function - instance_attr = instance_attr._mock_wraps - # `partial` support - elif isinstance(instance_attr, partial): - instance_attr = instance_attr.func - if instance_attr is None: - return False - - parent_attr = getattr(parent, method_name, None) - if parent_attr is None: - raise ValueError("The parent should define the method") - - return instance_attr.__code__ != parent_attr.__code__ + return is_overridden(method_name, instance, parent) def get_torchvision_model(model_name: str, **kwargs: Any) -> nn.Module: diff --git a/tests/tests_app/utilities/test_app_helpers.py b/tests/tests_app/utilities/test_app_helpers.py index 44074859f68f5..a1c2be5c36f5a 100644 --- a/tests/tests_app/utilities/test_app_helpers.py +++ b/tests/tests_app/utilities/test_app_helpers.py @@ -1,5 +1,4 @@ from unittest import mock -from unittest.mock import Mock import pytest @@ -25,39 +24,18 @@ def run(self): def test_is_overridden(): - flow = Flow() - work = Work() - # edge cases assert not is_overridden("whatever", None) with pytest.raises(ValueError, match="Expected a parent"): is_overridden("whatever", object()) + flow = Flow() assert not is_overridden("whatever", flow) assert not is_overridden("whatever", flow, parent=Flow) - - class TestFlow(LightningFlow): - def run(self): - pass - - def foo(self): - pass - - def bar(self): - return 1 - - with pytest.raises(ValueError, match="The parent should define the method"): - is_overridden("foo", TestFlow()) - # normal usage assert is_overridden("run", flow) + work = Work() assert is_overridden("run", work) - # `Mock` support - mock = Mock(spec=Flow, wraps=flow) - assert is_overridden("run", mock) - mock = Mock(spec=LightningWork, wraps=work) - assert is_overridden("run", mock) - def test_simple_app_store(): diff --git a/tests/tests_pytorch/utilities/test_model_helpers.py b/tests/tests_pytorch/utilities/test_model_helpers.py index 7f9f5eb344575..be31ead725065 100644 --- a/tests/tests_pytorch/utilities/test_model_helpers.py +++ b/tests/tests_pytorch/utilities/test_model_helpers.py @@ -11,9 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
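With the `model_helpers.py` hunk above, `is_overridden` only resolves the default parent class and then defers to `lightning_utilities.core.overrides.is_overridden`. A minimal sketch of the behavior that is kept, illustrative only; `BoringModel` and the import paths are taken from the surrounding diffs:

.. code-block:: python

    from lightning_utilities.core.overrides import is_overridden

    from pytorch_lightning import LightningModule
    from pytorch_lightning.demos.boring_classes import BoringModel

    model = BoringModel()
    # BoringModel implements training_step itself, so it is reported as overridden.
    assert is_overridden("training_step", model, LightningModule)
    # A hook left at its LightningModule default is not.
    assert not is_overridden("on_train_start", model, LightningModule)
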
-from functools import partial, wraps -from unittest.mock import Mock - import pytest from pytorch_lightning import LightningDataModule @@ -22,58 +19,14 @@ def test_is_overridden(): - model = BoringModel() - datamodule = BoringDataModule() - # edge cases assert not is_overridden("whatever", None) with pytest.raises(ValueError, match="Expected a parent"): is_overridden("whatever", object()) + model = BoringModel() assert not is_overridden("whatever", model) assert not is_overridden("whatever", model, parent=LightningDataModule) - - class TestModel(BoringModel): - def foo(self): - pass - - def bar(self): - return 1 - - with pytest.raises(ValueError, match="The parent should define the method"): - is_overridden("foo", TestModel()) - # normal usage assert is_overridden("training_step", model) + datamodule = BoringDataModule() assert is_overridden("train_dataloader", datamodule) - - class WrappedModel(TestModel): - def __new__(cls, *args, **kwargs): - obj = super().__new__(cls) - obj.foo = cls.wrap(obj.foo) - obj.bar = cls.wrap(obj.bar) - return obj - - @staticmethod - def wrap(fn): - @wraps(fn) - def wrapper(): - fn() - - return wrapper - - def bar(self): - return 2 - - # `functools.wraps()` support - assert not is_overridden("foo", WrappedModel(), parent=TestModel) - assert is_overridden("bar", WrappedModel(), parent=TestModel) - - # `Mock` support - mock = Mock(spec=BoringModel, wraps=model) - assert is_overridden("training_step", mock) - mock = Mock(spec=BoringDataModule, wraps=datamodule) - assert is_overridden("train_dataloader", mock) - - # `partial` support - model.training_step = partial(model.training_step) - assert is_overridden("training_step", model) From d8fe0cf9b51cbef0157f2388079425453231a075 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 12 Sep 2022 13:37:49 +0000 Subject: [PATCH 102/193] Bump docker/build-push-action from 1.1.0 to 3 (#14651) * Bump docker/build-push-action from 1.1.0 to 3.1.1 Bumps [docker/build-push-action](https://github.com/docker/build-push-action) from 1.1.0 to 3.1.1. - [Release notes](https://github.com/docker/build-push-action/releases) - [Commits](https://github.com/docker/build-push-action/compare/v1.1.0...v3.1.1) --- updated-dependencies: - dependency-name: docker/build-push-action dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] * Revert "Bump docker/build-push-action from 1.1.0 to 3.1.1" This reverts commit 05f9bfb084fd00657d4396214938f448a3f9b143. 
* use v3 Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Akihiro Nitta --- .github/workflows/release-docker.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml index 67503ba2b2c0d..6999ca6dd4d61 100644 --- a/.github/workflows/release-docker.yml +++ b/.github/workflows/release-docker.yml @@ -29,7 +29,7 @@ jobs: run: echo "::set-output name=RELEASE_VERSION::$(echo ${GITHUB_REF##*/})" - name: Publish Releases to Docker - uses: docker/build-push-action@v1.1.0 + uses: docker/build-push-action@v3 with: repository: pytorchlightning/pytorch_lightning username: ${{ secrets.DOCKER_USERNAME }} @@ -46,7 +46,7 @@ jobs: timeout-minutes: 55 - name: Publish Latest to Docker - uses: docker/build-push-action@v1.1.0 + uses: docker/build-push-action@v3 # Only latest Python and PyTorch if: matrix.python_version == '3.9' && matrix.pytorch_version == '1.12' with: From 8f0a64dab6413ab495e0edce47d81afc6f14060c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 12 Sep 2022 16:15:42 +0200 Subject: [PATCH 103/193] Standalone Lite: Launchers (#14555) Co-authored-by: Jirka Borovec --- src/lightning_lite/strategies/__init__.py | 0 .../strategies/launchers/__init__.py | 0 .../strategies/launchers/base.py | 0 .../strategies/launchers/multiprocessing.py | 178 ++++++++++++++++++ .../strategies/launchers/subprocess_script.py | 167 ++++++++++++++++ .../strategies/launchers/xla.py | 121 ++++++++++++ src/lightning_lite/utilities/device_parser.py | 11 +- .../strategies/launchers/__init__.py | 2 - .../strategies/launchers/multiprocessing.py | 2 +- .../strategies/launchers/subprocess_script.py | 2 +- src/pytorch_lightning/strategies/strategy.py | 2 +- tests/tests_lite/conftest.py | 7 + tests/tests_lite/strategies/__init__.py | 0 .../strategies/launchers/__init__.py | 0 .../launchers/test_multiprocessing.py | 95 ++++++++++ .../launchers/test_subprocess_script.py | 78 ++++++++ .../strategies/launchers/test_xla.py | 39 ++++ 17 files changed, 689 insertions(+), 15 deletions(-) create mode 100644 src/lightning_lite/strategies/__init__.py create mode 100644 src/lightning_lite/strategies/launchers/__init__.py rename src/{pytorch_lightning => lightning_lite}/strategies/launchers/base.py (100%) create mode 100644 src/lightning_lite/strategies/launchers/multiprocessing.py create mode 100644 src/lightning_lite/strategies/launchers/subprocess_script.py create mode 100644 src/lightning_lite/strategies/launchers/xla.py create mode 100644 tests/tests_lite/strategies/__init__.py create mode 100644 tests/tests_lite/strategies/launchers/__init__.py create mode 100644 tests/tests_lite/strategies/launchers/test_multiprocessing.py create mode 100644 tests/tests_lite/strategies/launchers/test_subprocess_script.py create mode 100644 tests/tests_lite/strategies/launchers/test_xla.py diff --git a/src/lightning_lite/strategies/__init__.py b/src/lightning_lite/strategies/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/src/lightning_lite/strategies/launchers/__init__.py b/src/lightning_lite/strategies/launchers/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/src/pytorch_lightning/strategies/launchers/base.py b/src/lightning_lite/strategies/launchers/base.py similarity index 100% rename from src/pytorch_lightning/strategies/launchers/base.py rename to 
src/lightning_lite/strategies/launchers/base.py diff --git a/src/lightning_lite/strategies/launchers/multiprocessing.py b/src/lightning_lite/strategies/launchers/multiprocessing.py new file mode 100644 index 0000000000000..fc6dd5025fdf5 --- /dev/null +++ b/src/lightning_lite/strategies/launchers/multiprocessing.py @@ -0,0 +1,178 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from dataclasses import dataclass +from multiprocessing.queues import SimpleQueue +from typing import Any, Callable, Dict, Optional + +import torch +import torch.backends.cudnn +import torch.multiprocessing as mp +from typing_extensions import Literal + +from lightning_lite.strategies.launchers.base import _Launcher +from lightning_lite.utilities.apply_func import move_data_to_device +from lightning_lite.utilities.imports import _TORCH_GREATER_EQUAL_1_11 +from lightning_lite.utilities.seed import _collect_rng_states, _set_rng_states + + +class _MultiProcessingLauncher(_Launcher): + r"""Launches processes that run a given function in parallel, and joins them all at the end. + + The main process in which this launcher is invoked creates N so-called worker processes (using + :func:`torch.multiprocessing.start_processes`) that run the given function. + Worker processes have a rank that ranges from 0 to N - 1. + + Note: + - This launcher requires all objects to be pickleable. + - It is important that the entry point to the program/script is guarded by ``if __name__ == "__main__"``. + - With start method 'fork' the user must ensure that no CUDA context gets created in the main process before + the launcher is invoked. E.g., one should avoid creating cuda tensors or calling ``torch.cuda.*`` functions + before calling ``Trainer.fit``. + + Args: + strategy: A reference to the strategy that is used together with this launcher. + start_method: The method how to start the processes. + - 'spawn': The default start method. Requires all objects to be pickleable. + - 'fork': Preferrable for IPython/Jupyter environments where 'spawn' is not available. Not available on + the Windows platform for example. + - 'forkserver': Alternative implementation to 'fork'. + """ + + def __init__( + self, + # TODO(lite): Fix this type annotation once the strategy base class gets added to Lite + strategy: "Strategy", # type: ignore[name-defined] # noqa: F821 + start_method: Literal["spawn", "fork", "forkserver"] = "spawn", + ) -> None: + self._strategy = strategy + self._start_method = start_method + if start_method not in mp.get_all_start_methods(): + raise ValueError( + f"The start method '{self._start_method}' is not available on this platform. Available methods are:" + f" {', '.join(mp.get_all_start_methods())}" + ) + if start_method in ("fork", "forkserver") and _is_forking_disabled(): + raise ValueError( + "Forking is disabled in this environment by `PL_DISABLE_FORKING=1`. Choose a different start method." 
+ ) + + @property + def is_interactive_compatible(self) -> bool: + # The start method 'spawn' is not supported in interactive environments + # The start method 'fork' is the only one supported in Jupyter environments, with constraints around CUDA + # initialization. For more context, see https://github.com/Lightning-AI/lightning/issues/7550 + return self._start_method == "fork" + + def launch(self, function: Callable, *args: Any, **kwargs: Any) -> Any: + """Launches processes that run the given function in parallel. + + The function is allowed to have a return value. However, when all processes join, only the return value + of worker process 0 gets returned from this `launch` method in the main process. + + Arguments: + function: The entry point for all launched processes. + *args: Optional positional arguments to be passed to the given function. + **kwargs: Optional keyword arguments to be passed to the given function. + """ + # The default cluster environment in Lightning chooses a random free port number + # This needs to be done in the main process here before starting processes to ensure each rank will connect + # through the same port + os.environ["MASTER_PORT"] = str(self._strategy.cluster_environment.main_port) + context = mp.get_context(self._start_method) + return_queue = context.SimpleQueue() + + if self._start_method == "spawn": + global_states = _GlobalStateSnapshot.capture() + process_args = [function, args, kwargs, return_queue, global_states] + else: + process_args = [function, args, kwargs, return_queue] + + mp.start_processes( + self._wrapping_function, + args=process_args, + nprocs=self._strategy.num_processes, + start_method=self._start_method, + ) + return return_queue.get() + + def _wrapping_function( + self, + process_idx: int, + function: Callable, + args: Any, + kwargs: Any, + return_queue: SimpleQueue, + global_states: Optional["_GlobalStateSnapshot"] = None, + ) -> None: + if global_states: + global_states.restore() + # TODO(lite): Update worker setup once DDPSpawn strategy is in Lite + self._strategy._worker_setup(process_idx) + results = function(*args, **kwargs) + + if self._strategy.local_rank == 0: + return_queue.put(move_data_to_device(results, "cpu")) + + +@dataclass +class _GlobalStateSnapshot: + """Captures a hand-selected set of (global) variables in modules and provides a way to restore them. + + It facilitates and encapsulates the transfer of globals like PyTorch's deterministic flags or random generator state + across process boundaries when launching processes with :func:`torch.multiprocessing.spawn`. + + Example: + + .. 
code-block:: python + + # in main process + snapshot = _GlobalStateSnapshot.capture() + + # in worker process + snapshot.restore() + """ + + use_deterministic_algorithms: bool + use_deterministic_algorithms_warn_only: bool + cudnn_benchmark: bool + rng_states: Dict[str, Any] + + @classmethod + def capture(cls) -> "_GlobalStateSnapshot": + """Capture a few global states from torch, numpy, etc., that we want to restore in a spawned worker + process.""" + warn_only = torch.is_deterministic_algorithms_warn_only_enabled() if _TORCH_GREATER_EQUAL_1_11 else False + return cls( + use_deterministic_algorithms=torch.are_deterministic_algorithms_enabled(), + use_deterministic_algorithms_warn_only=warn_only, + cudnn_benchmark=torch.backends.cudnn.benchmark, + rng_states=_collect_rng_states(), + ) + + def restore(self) -> None: + """Restores all globals to the values captured in the :meth:`capture` method.""" + if _TORCH_GREATER_EQUAL_1_11: + torch.use_deterministic_algorithms( + self.use_deterministic_algorithms, warn_only=self.use_deterministic_algorithms_warn_only + ) + else: + torch.use_deterministic_algorithms(self.use_deterministic_algorithms) + torch.backends.cudnn.benchmark = self.cudnn_benchmark + _set_rng_states(self.rng_states) + + +def _is_forking_disabled() -> bool: + """Returns whether forking is disabled through the environment variable ``PL_DISABLE_FORK``.""" + return bool(int(os.environ.get("PL_DISABLE_FORK", "0"))) diff --git a/src/lightning_lite/strategies/launchers/subprocess_script.py b/src/lightning_lite/strategies/launchers/subprocess_script.py new file mode 100644 index 0000000000000..7f814e01e2b71 --- /dev/null +++ b/src/lightning_lite/strategies/launchers/subprocess_script.py @@ -0,0 +1,167 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import subprocess +import sys +from time import sleep +from typing import Any, Callable, Optional + +import __main__ +import numpy as np +from lightning_utilities.core.imports import RequirementCache + +from lightning_lite.strategies.launchers.base import _Launcher + +_HYDRA_AVAILABLE = RequirementCache("hydra") + + +class _SubprocessScriptLauncher(_Launcher): + r""" + A process laucher that invokes the current script as many times as desired in a single node. + + This launcher needs to be invoked on each node. + In its default behavior, the main process in each node then spawns N-1 child processes via :func:`subprocess.Popen`, + where N is the number of devices (e.g. GPU) per node. It is very similar to how :mod:`torch.distributed.run` + launches processes. + + For example, if the script gets invoked with the command + + .. code-block:: bash + + python train.py --devices 4 + + The launcher will create three additional subprocesses that get called like so: + + .. 
code-block:: bash + + LOCAL_RANK=1 python train.py --devices 4 + LOCAL_RANK=2 python train.py --devices 4 + LOCAL_RANK=3 python train.py --devices 4 + + It is implied that the main process which launched the others has ``LOCAL_RANK=0``. + Beside the local rank, the following other environment variables also get set, but unlike the local rank, these + get determined by the cluster environment: + + 1. `MASTER_ADDR`: The IP address of the main node. + 2. `MASTER_PORT`: The port number of the main node through which all processes communicate. + 3. `NODE_RANK`: The index of the node the current process is running on. Ranges from 0 to ``num_nodes - 1``. + 4. `WORLD_SIZE`: The total number of processes across all nodes, i.e., ``num_processes * num_nodes``. + + Arguments: + cluster_environment: A cluster environment that provides access to world size, node rank, etc. + num_processes: The number of processes to launch in the current node. + num_nodes: The total number of nodes that participate in this process group. + """ + + def __init__( + self, + # TODO(lite): Update type annotation once ClusterEnvironment has moved to Lite + cluster_environment: "ClusterEnvironment", # type: ignore[name-defined] # noqa: F821 + num_processes: int, + num_nodes: int, + ) -> None: + super().__init__() + self.cluster_environment = cluster_environment + self.num_processes = num_processes + self.num_nodes = num_nodes + + @property + def is_interactive_compatible(self) -> bool: + return False + + def launch(self, function: Callable, *args: Any, **kwargs: Any) -> Any: + """Creates new processes, then calls the given function. + + Arguments: + function: A callback function to execute after all processes have been created. + It is up to the implementation of this function to synchronize the processes, e.g., with barriers. + *args: Optional positional arguments to be passed to the given function. + **kwargs: Optional keyword arguments to be passed to the given function. 
+ """ + if not self.cluster_environment.creates_processes_externally: + self._call_children_scripts() + return function(*args, **kwargs) + + def _call_children_scripts(self) -> None: + # bookkeeping of spawned processes + self._check_can_spawn_children() + + # DDP Environment variables + os.environ["MASTER_ADDR"] = self.cluster_environment.main_address + os.environ["MASTER_PORT"] = str(self.cluster_environment.main_port) + + # allow the user to pass the node rank + os.environ["NODE_RANK"] = str(self.cluster_environment.node_rank()) + os.environ["LOCAL_RANK"] = str(self.cluster_environment.local_rank()) + + # Check if the current calling command looked like `python a/b/c.py` or `python -m a.b.c` + # See https://docs.python.org/3/reference/import.html#main-spec + if __main__.__spec__ is None: # pragma: no-cover + # Script called as `python a/b/c.py` + if _HYDRA_AVAILABLE: + # when user is using hydra find the absolute path + from hydra.utils import to_absolute_path + + to_abs_path = to_absolute_path + else: + to_abs_path = os.path.abspath + + # pull out the commands used to run the script and resolve the absolute file path + command = sys.argv + try: + full_path = to_abs_path(command[0]) + except Exception: + full_path = os.path.abspath(command[0]) + + command[0] = full_path + # use the same python interpreter and actually running + command = [sys.executable] + command + else: # Script called as `python -m a.b.c` + command = [sys.executable, "-m", __main__.__spec__.name] + sys.argv[1:] + + os.environ["WORLD_SIZE"] = f"{self.num_processes * self.num_nodes}" + + for local_rank in range(1, self.num_processes): + env_copy = os.environ.copy() + env_copy["LOCAL_RANK"] = f"{local_rank}" + + # remove env var if global seed not set + if os.environ.get("PL_GLOBAL_SEED") is None and "PL_GLOBAL_SEED" in env_copy: + del env_copy["PL_GLOBAL_SEED"] + + # start process + # if hydra is available and initialized, make sure to set the cwd correctly + cwd: Optional[str] = None + if _HYDRA_AVAILABLE: + from hydra.core.hydra_config import HydraConfig + from hydra.utils import get_original_cwd + + if HydraConfig.initialized(): + cwd = get_original_cwd() + os_cwd = f'"{os.getcwd()}"' + command += [f"hydra.run.dir={os_cwd}", f"hydra.job.name=train_ddp_process_{local_rank}"] + subprocess.Popen(command, env=env_copy, cwd=cwd) + + # starting all processes at once can cause issues + # with dataloaders delay between 1-10 seconds + delay = np.random.uniform(1, 5, 1)[0] + sleep(delay) + + def _check_can_spawn_children(self) -> None: + if self.cluster_environment.local_rank() != 0: + raise RuntimeError( + "Lightning attempted to launch new distributed processes with `local_rank > 0`. This should not happen." + " Possible reasons: 1) LOCAL_RANK environment variable was incorrectly modified by the user," + " 2) `ClusterEnvironment.creates_processes_externally` incorrectly implemented." + ) diff --git a/src/lightning_lite/strategies/launchers/xla.py b/src/lightning_lite/strategies/launchers/xla.py new file mode 100644 index 0000000000000..6580fd4a01d0e --- /dev/null +++ b/src/lightning_lite/strategies/launchers/xla.py @@ -0,0 +1,121 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import time +from functools import wraps +from multiprocessing.queues import SimpleQueue +from typing import Any, Callable, Optional, Tuple, TYPE_CHECKING + +import torch.multiprocessing as mp +from torch.multiprocessing import ProcessContext + +from lightning_lite.strategies.launchers.multiprocessing import _GlobalStateSnapshot, _MultiProcessingLauncher +from lightning_lite.utilities import _TPU_AVAILABLE +from lightning_lite.utilities.apply_func import move_data_to_device + +if _TPU_AVAILABLE: + import torch_xla.core.xla_model as xm + import torch_xla.distributed.xla_multiprocessing as xmp +else: + xm, xmp = None, None + +if TYPE_CHECKING: + from lightning_lite.strategies import Strategy + + +class _XLALauncher(_MultiProcessingLauncher): + r"""Launches processes that run a given function in parallel on XLA supported hardware, and joins them all at the + end. + + The main process in which this launcher is invoked creates N so-called worker processes (using the + `torch_xla` :func:`xmp.spawn`) that run the given function. + Worker processes have a rank that ranges from 0 to N - 1. + + Note: + - This launcher requires all objects to be pickleable. + - It is important that the entry point to the program/script is guarded by ``if __name__ == "__main__"``. + + Args: + strategy: A reference to the strategy that is used together with this launcher + """ + + def __init__(self, strategy: "Strategy") -> None: + super().__init__(strategy=strategy, start_method="fork") + + @property + def is_interactive_compatible(self) -> bool: + return True + + def launch(self, function: Callable, *args: Any, **kwargs: Any) -> Any: + """Launches processes that run the given function in parallel. + + The function is allowed to have a return value. However, when all processes join, only the return value + of worker process 0 gets returned from this `launch` method in the main process. + + Arguments: + function: The entry point for all launched processes. + *args: Optional positional arguments to be passed to the given function. + **kwargs: Optional keyword arguments to be passed to the given function. 
+ """ + context = mp.get_context(self._start_method) + return_queue = context.SimpleQueue() + _save_spawn( + self._wrapping_function, + args=(function, args, kwargs, return_queue), + nprocs=len(self._strategy.parallel_devices), + start_method=self._start_method, + ) + return return_queue.get() + + def _wrapping_function( + self, + process_idx: int, + function: Callable, + args: Any, + kwargs: Any, + return_queue: SimpleQueue, + global_states: Optional[_GlobalStateSnapshot] = None, + ) -> None: + # TODO(lite): Update worker setup once TPUSpawn strategy is in Lite + self._strategy._worker_setup(process_idx) + results = function(*args, **kwargs) + + if self._strategy.local_rank == 0: + return_queue.put(move_data_to_device(results, "cpu")) + + +def _save_spawn( + fn: Callable, + args: Tuple = (), + nprocs: Optional[int] = None, + join: bool = True, + daemon: bool = False, + start_method: str = "spawn", +) -> Optional[ProcessContext]: + """Wraps the :func:`torch_xla.distributed.xla_multiprocessing.spawn` with added teardown logic for the worker + processes.""" + + @wraps(fn) + def wrapped(rank: int, *_args: Any) -> None: + fn(rank, *_args) + + # Make all processes wait for each other before joining + # https://github.com/pytorch/xla/issues/1801#issuecomment-602799542 + xm.rendezvous("end-process") + + # Ensure that the rank 0 process is the one exiting last + # https://github.com/pytorch/xla/issues/2190#issuecomment-641665358 + if rank == 0: + time.sleep(1) + + return xmp.spawn(wrapped, args=args, nprocs=nprocs, join=join, daemon=daemon, start_method=start_method) diff --git a/src/lightning_lite/utilities/device_parser.py b/src/lightning_lite/utilities/device_parser.py index f0e5802d07c0e..6967f7bf0af16 100644 --- a/src/lightning_lite/utilities/device_parser.py +++ b/src/lightning_lite/utilities/device_parser.py @@ -1,13 +1,10 @@ import multiprocessing -import os from typing import Any, List, MutableSequence, Optional, Tuple, Union import torch from lightning_lite.plugins.environments.torchelastic_environment import TorchElasticEnvironment - -# TODO(lite): Fix the imports -# from lightning_lite.strategies.launchers.multiprocessing import _is_forking_disabled +from lightning_lite.strategies.launchers.multiprocessing import _is_forking_disabled from lightning_lite.utilities.exceptions import MisconfigurationException from lightning_lite.utilities.types import _DEVICE @@ -309,9 +306,3 @@ def is_cuda_available() -> bool: return torch.cuda.is_available() with multiprocessing.get_context("fork").Pool(1) as pool: return pool.apply(torch.cuda.is_available) - - -# TODO(lite): move this back to launchers/multiprocessing.py once launchers have moved -def _is_forking_disabled() -> bool: - """Returns whether forking is disabled through the environment variable ``PL_DISABLE_FORK``.""" - return bool(int(os.environ.get("PL_DISABLE_FORK", "0"))) diff --git a/src/pytorch_lightning/strategies/launchers/__init__.py b/src/pytorch_lightning/strategies/launchers/__init__.py index d75df88b2df28..1c106cc8ffb66 100644 --- a/src/pytorch_lightning/strategies/launchers/__init__.py +++ b/src/pytorch_lightning/strategies/launchers/__init__.py @@ -11,13 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from pytorch_lightning.strategies.launchers.base import _Launcher from pytorch_lightning.strategies.launchers.multiprocessing import _MultiProcessingLauncher from pytorch_lightning.strategies.launchers.subprocess_script import _SubprocessScriptLauncher from pytorch_lightning.strategies.launchers.xla import _XLALauncher __all__ = [ - "_Launcher", "_MultiProcessingLauncher", "_SubprocessScriptLauncher", "_XLALauncher", diff --git a/src/pytorch_lightning/strategies/launchers/multiprocessing.py b/src/pytorch_lightning/strategies/launchers/multiprocessing.py index fdc17f8b8d90f..be6a56b2e35dc 100644 --- a/src/pytorch_lightning/strategies/launchers/multiprocessing.py +++ b/src/pytorch_lightning/strategies/launchers/multiprocessing.py @@ -26,10 +26,10 @@ from typing_extensions import Literal import pytorch_lightning as pl +from lightning_lite.strategies.launchers.base import _Launcher from lightning_lite.utilities.apply_func import move_data_to_device from lightning_lite.utilities.seed import _collect_rng_states, _set_rng_states from lightning_lite.utilities.types import _PATH -from pytorch_lightning.strategies.launchers.base import _Launcher from pytorch_lightning.trainer.states import TrainerFn, TrainerState from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_11 from pytorch_lightning.utilities.rank_zero import rank_zero_debug diff --git a/src/pytorch_lightning/strategies/launchers/subprocess_script.py b/src/pytorch_lightning/strategies/launchers/subprocess_script.py index f9e565260f703..6713f636b98ef 100644 --- a/src/pytorch_lightning/strategies/launchers/subprocess_script.py +++ b/src/pytorch_lightning/strategies/launchers/subprocess_script.py @@ -23,7 +23,7 @@ import pytorch_lightning as pl from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment -from pytorch_lightning.strategies.launchers.base import _Launcher +from lightning_lite.strategies.launchers.base import _Launcher _HYDRA_AVAILABLE = RequirementCache("hydra") diff --git a/src/pytorch_lightning/strategies/strategy.py b/src/pytorch_lightning/strategies/strategy.py index 0f73b7b24e675..dc2a5b6397289 100644 --- a/src/pytorch_lightning/strategies/strategy.py +++ b/src/pytorch_lightning/strategies/strategy.py @@ -24,6 +24,7 @@ import pytorch_lightning as pl from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO +from lightning_lite.strategies.launchers.base import _Launcher from lightning_lite.utilities.apply_func import move_data_to_device from lightning_lite.utilities.distributed import ReduceOp from lightning_lite.utilities.optimizer import optimizer_to_device, optimizers_to_device @@ -32,7 +33,6 @@ from pytorch_lightning.plugins import TorchCheckpointIO from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin -from pytorch_lightning.strategies.launchers.base import _Launcher from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.types import ( LRSchedulerConfig, diff --git a/tests/tests_lite/conftest.py b/tests/tests_lite/conftest.py index 209d6869a18db..952d32e4a9c94 100644 --- a/tests/tests_lite/conftest.py +++ b/tests/tests_lite/conftest.py @@ -76,6 +76,13 @@ def teardown_process_group(): torch.distributed.destroy_process_group() +@pytest.fixture +def reset_deterministic_algorithm(): + """Ensures that torch determinism settings are reset before the next test runs.""" + yield + torch.use_deterministic_algorithms(False) + + @pytest.fixture def caplog(caplog): 
"""Workaround for https://github.com/pytest-dev/pytest/issues/3697. diff --git a/tests/tests_lite/strategies/__init__.py b/tests/tests_lite/strategies/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/tests_lite/strategies/launchers/__init__.py b/tests/tests_lite/strategies/launchers/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/tests_lite/strategies/launchers/test_multiprocessing.py b/tests/tests_lite/strategies/launchers/test_multiprocessing.py new file mode 100644 index 0000000000000..70b45763fe2df --- /dev/null +++ b/tests/tests_lite/strategies/launchers/test_multiprocessing.py @@ -0,0 +1,95 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from unittest import mock +from unittest.mock import ANY, Mock + +import pytest +import torch + +from lightning_lite.strategies.launchers.multiprocessing import _GlobalStateSnapshot, _MultiProcessingLauncher +from tests_pytorch.helpers.runif import RunIf + + +@RunIf(skip_windows=True) +@pytest.mark.parametrize("start_method", ["fork", "forkserver"]) +def test_multiprocessing_launcher_interactive_compatible(start_method): + launcher = _MultiProcessingLauncher(strategy=Mock(), start_method=start_method) + assert launcher.is_interactive_compatible == (start_method == "fork") + + +@mock.patch("lightning_lite.strategies.launchers.multiprocessing.mp.get_all_start_methods", return_value=[]) +def test_multiprocessing_launcher_forking_on_unsupported_platform(_): + with pytest.raises(ValueError, match="The start method 'fork' is not available on this platform"): + _MultiProcessingLauncher(strategy=Mock(), start_method="fork") + + +@RunIf(skip_windows=True) +@pytest.mark.parametrize("start_method", ["fork", "forkserver"]) +@mock.patch.dict(os.environ, {"PL_DISABLE_FORK": "1"}, clear=True) +def test_multiprocessing_launcher_disabled_forking(start_method): + with pytest.raises(ValueError, match="Forking is disabled in this environment"): + _MultiProcessingLauncher(strategy=Mock(), start_method=start_method) + + +@pytest.mark.parametrize("start_method", ["spawn", "fork"]) +@mock.patch("lightning_lite.strategies.launchers.multiprocessing.mp") +def test_multiprocessing_launcher_start_method(mp_mock, start_method): + mp_mock.get_all_start_methods.return_value = [start_method] + launcher = _MultiProcessingLauncher(strategy=Mock(), start_method=start_method) + launcher.launch(function=Mock()) + mp_mock.get_context.assert_called_with(start_method) + mp_mock.start_processes.assert_called_with( + ANY, + args=ANY, + nprocs=ANY, + start_method=start_method, + ) + + +@pytest.mark.parametrize("start_method", ["spawn", "fork"]) +@mock.patch("lightning_lite.strategies.launchers.multiprocessing.mp") +def test_multiprocessing_launcher_restore_globals(mp_mock, start_method): + """Test that we pass the global state snapshot to the worker function only if we are starting with 'spawn'.""" + mp_mock.get_all_start_methods.return_value = [start_method] + 
launcher = _MultiProcessingLauncher(strategy=Mock(), start_method=start_method) + launcher.launch(function=Mock()) + function_args = mp_mock.start_processes.call_args[1]["args"] + if start_method == "spawn": + assert len(function_args) == 5 + assert isinstance(function_args[4], _GlobalStateSnapshot) + else: + assert len(function_args) == 4 + + +@pytest.mark.usefixtures("reset_deterministic_algorithm") +def test_global_state_snapshot(): + """Test the capture() and restore() methods for the global state snapshot.""" + torch.use_deterministic_algorithms(True) + torch.backends.cudnn.benchmark = False + torch.manual_seed(123) + + # capture the state of globals + snapshot = _GlobalStateSnapshot.capture() + + # simulate there is a process boundary and flags get reset here + torch.use_deterministic_algorithms(False) + torch.backends.cudnn.benchmark = True + torch.manual_seed(321) + + # restore the state of globals + snapshot.restore() + assert torch.are_deterministic_algorithms_enabled() + assert not torch.backends.cudnn.benchmark + assert torch.initial_seed() == 123 diff --git a/tests/tests_lite/strategies/launchers/test_subprocess_script.py b/tests/tests_lite/strategies/launchers/test_subprocess_script.py new file mode 100644 index 0000000000000..c9af07343b454 --- /dev/null +++ b/tests/tests_lite/strategies/launchers/test_subprocess_script.py @@ -0,0 +1,78 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
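A simplified sketch of the capture/restore pattern exercised by the snapshot test above; it is not the actual ``_GlobalStateSnapshot`` implementation, only an illustration of recording process-global torch flags before a process boundary and re-applying them in the worker:

    from dataclasses import dataclass

    import torch


    @dataclass
    class _SnapshotSketch:
        deterministic: bool
        cudnn_benchmark: bool
        rng_state: torch.Tensor

        @classmethod
        def capture(cls) -> "_SnapshotSketch":
            # Record the process-global flags before crossing a process boundary.
            return cls(
                deterministic=torch.are_deterministic_algorithms_enabled(),
                cudnn_benchmark=torch.backends.cudnn.benchmark,
                rng_state=torch.get_rng_state(),
            )

        def restore(self) -> None:
            # Re-apply the recorded flags inside the worker process.
            torch.use_deterministic_algorithms(self.deterministic)
            torch.backends.cudnn.benchmark = self.cudnn_benchmark
            torch.set_rng_state(self.rng_state)
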
+import os +from unittest import mock +from unittest.mock import Mock + +import pytest + +from lightning_lite.strategies.launchers.subprocess_script import _SubprocessScriptLauncher + + +def test_subprocess_script_launcher_interactive_compatible(): + launcher = _SubprocessScriptLauncher(Mock(), num_processes=2, num_nodes=1) + assert not launcher.is_interactive_compatible + + +@mock.patch("lightning_lite.strategies.launchers.subprocess_script.subprocess.Popen") +def test_subprocess_script_launcher_error_launching_on_non_zero_rank(popen_mock): + cluster_env = Mock() + cluster_env.creates_processes_externally = False + cluster_env.local_rank.return_value = 1 + launcher = _SubprocessScriptLauncher(cluster_env, num_processes=2, num_nodes=1) + with pytest.raises(RuntimeError, match="attempted to launch new distributed processes with `local_rank > 0`"): + launcher.launch(Mock()) + + +@mock.patch("lightning_lite.strategies.launchers.subprocess_script.subprocess.Popen") +def test_subprocess_script_launcher_external_processes(popen_mock): + cluster_env = Mock() + cluster_env.creates_processes_externally = True + function = Mock() + launcher = _SubprocessScriptLauncher(cluster_env, num_processes=4, num_nodes=2) + launcher.launch(function, "positional-arg", keyword_arg=0) + function.assert_called_with("positional-arg", keyword_arg=0) + popen_mock.assert_not_called() + + +@mock.patch("lightning_lite.strategies.launchers.subprocess_script.sleep") +@mock.patch("lightning_lite.strategies.launchers.subprocess_script.subprocess.Popen") +def test_subprocess_script_launcher_launch_processes(popen_mock, _): + cluster_env = Mock() + cluster_env.creates_processes_externally = False + cluster_env.local_rank.return_value = 0 + cluster_env.main_address = "address" + cluster_env.main_port = 1234 + + function = Mock() + launcher = _SubprocessScriptLauncher(cluster_env, num_processes=4, num_nodes=2) + num_new_processes = launcher.num_processes - 1 + + # launches n-1 new processes, the current one will participate too + launcher.launch(function, "positional-arg", keyword_arg=0) + + calls = popen_mock.call_args_list + assert len(calls) == num_new_processes + + # world size in child processes + world_sizes = [int(calls[i][1]["env"]["WORLD_SIZE"]) for i in range(num_new_processes)] + assert world_sizes == [launcher.num_processes * launcher.num_nodes] * num_new_processes + + # local rank in child processes + local_ranks = [int(calls[i][1]["env"]["LOCAL_RANK"]) for i in range(num_new_processes)] + assert local_ranks == list(range(1, num_new_processes + 1)) + + # the current process + assert int(os.environ["WORLD_SIZE"]) == launcher.num_processes * launcher.num_nodes + assert int(os.environ["LOCAL_RANK"]) == 0 diff --git a/tests/tests_lite/strategies/launchers/test_xla.py b/tests/tests_lite/strategies/launchers/test_xla.py new file mode 100644 index 0000000000000..0136cb6a27beb --- /dev/null +++ b/tests/tests_lite/strategies/launchers/test_xla.py @@ -0,0 +1,39 @@ +from unittest import mock +from unittest.mock import ANY, Mock + +from tests_lite.helpers.runif import RunIf + +from lightning_lite.strategies.launchers.xla import _XLALauncher + + +@RunIf(skip_windows=True) +def test_xla_launcher_default_start_method(): + launcher = _XLALauncher(strategy=Mock()) + assert launcher._start_method == "fork" + + +@RunIf(skip_windows=True) +def test_xla_launcher_interactive_compatible(): + launcher = _XLALauncher(strategy=Mock()) + assert launcher.is_interactive_compatible + + +@RunIf(skip_windows=True) 
+@mock.patch("lightning_lite.strategies.launchers.xla.mp") +@mock.patch("lightning_lite.strategies.launchers.xla.xm") +@mock.patch("lightning_lite.strategies.launchers.xla.xmp") +def test_xla_launcher_xmp_spawn(xmp_mock, xm_mock, mp_mock): + strategy = Mock() + strategy.parallel_devices = [0, 1, 2, 3] + launcher = _XLALauncher(strategy=strategy) + function = Mock() + launcher.launch(function, "positional-arg", keyword_arg=0) + # mp_mock.get_context.assert_called_with(start_method) + xmp_mock.spawn.assert_called_with( + ANY, + args=(function, ("positional-arg",), {"keyword_arg": 0}, ANY), + nprocs=4, + join=True, + daemon=False, + start_method="fork", + ) From cf3428784f4f51862eb2c098a7ac1244397ebb24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Mon, 12 Sep 2022 16:39:40 +0200 Subject: [PATCH 104/193] Set `running_torchscript` recursively (#14657) * Set `running_torchscript` recursively * CHANGELOG --- src/pytorch_lightning/CHANGELOG.md | 9 +++++--- src/pytorch_lightning/core/module.py | 15 ++++++++++-- .../tests_pytorch/models/test_torchscript.py | 23 +++++++++++++++++++ 3 files changed, 42 insertions(+), 5 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index c28f9bd1498e4..5ffbc1214ade5 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -159,6 +159,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed an issue to keep downscaling the batch size in case there hasn't been even a single successful optimal batch size with `mode="power"` ([#14372](https://github.com/Lightning-AI/lightning/pull/14372)) +- Fixed compatibility when `torch.distributed` is not available ([#14454](https://github.com/Lightning-AI/lightning/pull/14454)) + + +- Fixed torchscript error with ensembles of LightningModules ([#14657](https://github.com/Lightning-AI/lightning/pull/14657)) + + ## [1.7.5] - 2022-09-06 ### Fixed @@ -182,9 +188,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed incorrect values after transferring data to an MPS device ([#14368](https://github.com/Lightning-AI/lightning/pull/14368)) -- Fixed compatibility when `torch.distributed` is not available ([#14454](https://github.com/Lightning-AI/lightning/pull/14454)) - - ## [1.7.3] - 2022-08-25 ### Fixed diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py index ab655adb4d656..800d4be1d6e6a 100644 --- a/src/pytorch_lightning/core/module.py +++ b/src/pytorch_lightning/core/module.py @@ -89,6 +89,7 @@ class LightningModule( "truncated_bptt_steps", "use_amp", "trainer", + "_running_torchscript", ] + _DeviceDtypeModuleMixin.__jit_unused_properties__ + HyperparametersMixin.__jit_unused_properties__ @@ -117,8 +118,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: self._param_requires_grad_state: Dict[str, bool] = {} self._metric_attributes: Optional[Dict[int, str]] = None self._should_prevent_trainer_and_dataloaders_deepcopy: bool = False - # TODO: remove in 1.8 - self._running_torchscript = False + self._running_torchscript_internal = False # workaround for https://github.com/pytorch/pytorch/issues/67146 self._register_sharded_tensor_state_dict_hooks_if_available() @@ -293,6 +293,17 @@ def loggers(self) -> List[Logger]: """Reference to the list of loggers in the Trainer.""" return self.trainer.loggers if self._trainer else [] + @property + def _running_torchscript(self) -> bool: + return self._running_torchscript_internal + + @_running_torchscript.setter + def _running_torchscript(self, value: bool) -> None: + for v in self.children(): + if isinstance(v, LightningModule): + v._running_torchscript_internal = value + self._running_torchscript_internal = value + def _call_batch_hook(self, hook_name: str, *args: Any) -> Any: if self._trainer: datahook_selector = self._trainer._data_connector._datahook_selector diff --git a/tests/tests_pytorch/models/test_torchscript.py b/tests/tests_pytorch/models/test_torchscript.py index fc63d661ab5e7..a7a1006542b0a 100644 --- a/tests/tests_pytorch/models/test_torchscript.py +++ b/tests/tests_pytorch/models/test_torchscript.py @@ -20,6 +20,7 @@ from fsspec.implementations.local import LocalFileSystem from lightning_lite.utilities.cloud_io import get_filesystem +from pytorch_lightning.core.module import LightningModule from pytorch_lightning.demos.boring_classes import BoringModel from tests_pytorch.helpers.advanced_models import BasicGAN, ParityModuleRNN from tests_pytorch.helpers.runif import RunIf @@ -170,3 +171,25 @@ def test_torchscript_with_no_input(tmpdir): with pytest.raises(ValueError, match="requires either `example_inputs` or `model.example_input_array`"): model.to_torchscript(method="trace") + + +def test_torchscript_script_recursively(): + class Child(LightningModule): + def __init__(self): + super().__init__() + self.model = torch.nn.Linear(1, 1) + + def forward(self, inputs): + return self.model(inputs) + + class Parent(LightningModule): + def __init__(self): + super().__init__() + self.model = Child() + + def forward(self, inputs): + return self.model(inputs) + + lm = Parent() + script = lm.to_torchscript(method="script") + assert isinstance(script, torch.jit.RecursiveScriptModule) From 86fd5b22d43d4db4261e559750f6f628945ac8e8 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 12 Sep 2022 16:47:24 +0200 Subject: [PATCH 105/193] (app) Make Logging DEBUG mode lazy (#14464) --- .azure/app-cloud-e2e.yml | 6 +- .gitignore | 1 + examples/app_boring/app_dynamic.py | 2 +- examples/app_commands_and_api/app.py | 2 
+- examples/app_drive/app.py | 2 +- examples/app_template_streamlit_ui/app.py | 2 +- examples/app_v0/app.py | 2 +- src/lightning_app/CHANGELOG.md | 3 + src/lightning_app/api/http_methods.py | 6 + src/lightning_app/cli/cmd_init.py | 5 +- src/lightning_app/cli/cmd_install.py | 4 +- src/lightning_app/cli/cmd_react_ui_init.py | 5 +- src/lightning_app/cli/lightning_cli.py | 8 +- .../cli/pl-app-template/core/callbacks.py | 5 +- src/lightning_app/components/python/popen.py | 5 +- src/lightning_app/components/python/tracer.py | 5 +- src/lightning_app/components/serve/serve.py | 4 +- src/lightning_app/components/training.py | 4 +- src/lightning_app/core/api.py | 5 +- src/lightning_app/core/app.py | 28 +++- src/lightning_app/core/constants.py | 2 +- src/lightning_app/core/queues.py | 4 +- .../frontend/panel/app_state_comm.py | 4 +- .../frontend/panel/app_state_watcher.py | 4 +- .../frontend/panel/panel_frontend.py | 4 +- src/lightning_app/runners/cloud.py | 6 +- src/lightning_app/runners/runtime.py | 4 +- src/lightning_app/source_code/copytree.py | 6 +- src/lightning_app/storage/copier.py | 5 +- src/lightning_app/storage/orchestrator.py | 4 +- src/lightning_app/storage/path.py | 4 +- src/lightning_app/storage/payload.py | 4 +- src/lightning_app/testing/testing.py | 158 ++++++++++++------ src/lightning_app/utilities/app_helpers.py | 32 +++- src/lightning_app/utilities/app_logs.py | 4 +- src/lightning_app/utilities/load_app.py | 5 +- src/lightning_app/utilities/log_helpers.py | 5 +- src/lightning_app/utilities/login.py | 6 +- src/lightning_app/utilities/network.py | 5 +- .../utilities/packaging/build_config.py | 6 +- .../utilities/packaging/lightning_utils.py | 3 +- src/lightning_app/utilities/proxies.py | 5 +- src/lightning_app/utilities/state.py | 5 +- tests/tests_app/core/test_lightning_app.py | 20 +++ .../collect_failures/app.py | 2 +- tests/tests_app_examples/conftest.py | 44 +++++ .../custom_work_dependencies/app.py | 2 +- tests/tests_app_examples/idle_timeout/app.py | 2 +- .../test_commands_and_api.py | 10 +- tests/tests_app_examples/test_v0_app.py | 1 + 50 files changed, 330 insertions(+), 140 deletions(-) create mode 100644 tests/tests_app_examples/conftest.py diff --git a/.azure/app-cloud-e2e.yml b/.azure/app-cloud-e2e.yml index eef8a8b8bfff8..1c6822cf2bb54 100644 --- a/.azure/app-cloud-e2e.yml +++ b/.azure/app-cloud-e2e.yml @@ -108,12 +108,15 @@ jobs: displayName: 'Install lightning' - bash: | + rm -rf examples/app_template_jupyterlab || true git clone https://github.com/Lightning-AI/LAI-lightning-template-jupyterlab-App examples/app_template_jupyterlab cp examples/app_template_jupyterlab/tests/test_template_jupyterlab.py tests/tests_app_examples/test_template_jupyterlab.py condition: eq(variables['name'], 'template_jupyterlab') displayName: 'Clone Template Jupyter Lab Repo' - - bash: git clone https://github.com/Lightning-AI/lightning-template-react examples/app_template_react_ui + - bash: | + rm -rf examples/app_template_react_ui || true + git clone https://github.com/Lightning-AI/lightning-template-react examples/app_template_react_ui condition: eq(variables['name'], 'template_react_ui') displayName: 'Clone Template React UI Repo' @@ -137,6 +140,7 @@ jobs: LIGHTNING_API_KEY: $(LIGHTNING_API_KEY_PROD) LIGHTNING_USERNAME: $(LIGHTNING_USERNAME) LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL_PROD) + LIGHTNING_DEBUG: '1' displayName: 'Run the tests' - publish: '$(Build.ArtifactStagingDirectory)/videos' diff --git a/.gitignore b/.gitignore index a7dc915b84284..c308ce2620c99 100644 --- a/.gitignore 
+++ b/.gitignore @@ -174,3 +174,4 @@ our_model.tar test.png saved_models data/ +.shared diff --git a/examples/app_boring/app_dynamic.py b/examples/app_boring/app_dynamic.py index 6e3fdfa3ccdee..836206efa259b 100644 --- a/examples/app_boring/app_dynamic.py +++ b/examples/app_boring/app_dynamic.py @@ -64,4 +64,4 @@ def configure_layout(self): return {"name": "Boring Tab", "content": self.dict["dst_w"].url + "/file" if "dst_w" in self.dict else ""} -app = L.LightningApp(BoringApp()) +app = L.LightningApp(BoringApp(), debug=True) diff --git a/examples/app_commands_and_api/app.py b/examples/app_commands_and_api/app.py index 0d15bc531bb38..057e137912b17 100644 --- a/examples/app_commands_and_api/app.py +++ b/examples/app_commands_and_api/app.py @@ -40,4 +40,4 @@ def configure_api(self): return [Post("/user/command_without_client", self.command_without_client)] -app = LightningApp(FlowCommands()) +app = LightningApp(FlowCommands(), debug=True) diff --git a/examples/app_drive/app.py b/examples/app_drive/app.py index 60004847933a9..0dd0257d47fa9 100644 --- a/examples/app_drive/app.py +++ b/examples/app_drive/app.py @@ -48,4 +48,4 @@ def run(self): self._exit("Application End!") -app = L.LightningApp(Flow()) +app = L.LightningApp(Flow(), debug=True) diff --git a/examples/app_template_streamlit_ui/app.py b/examples/app_template_streamlit_ui/app.py index 33aa3dd26f700..45bb775984cd3 100644 --- a/examples/app_template_streamlit_ui/app.py +++ b/examples/app_template_streamlit_ui/app.py @@ -45,4 +45,4 @@ def configure_layout(self): return [{"name": "StreamLitUI", "content": self.streamlit_ui}] -app = LightningApp(HelloWorld()) +app = LightningApp(HelloWorld(), debug=True) diff --git a/examples/app_v0/app.py b/examples/app_v0/app.py index 26345f5b43e46..e914722dada82 100644 --- a/examples/app_v0/app.py +++ b/examples/app_v0/app.py @@ -46,4 +46,4 @@ def configure_layout(self): return [tab1, tab2, tab3] -app = L.LightningApp(V0App()) +app = L.LightningApp(V0App(), debug=True) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index ac8c3b2c1d9c9..6fd47662f3a88 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -12,6 +12,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Adds `PanelFrontend` to easily create complex UI in Python ([#13531](https://github.com/Lightning-AI/lightning/pull/13531)) +### Fixed + +- Resolved `LightningApp(..., debug=True)` ([#14464](https://github.com/Lightning-AI/lightning/pull/14464)) ## [0.6.0] - 2022-08-23 diff --git a/src/lightning_app/api/http_methods.py b/src/lightning_app/api/http_methods.py index 02b6ec87f17d2..d74ac9c61b78f 100644 --- a/src/lightning_app/api/http_methods.py +++ b/src/lightning_app/api/http_methods.py @@ -10,6 +10,9 @@ from fastapi import FastAPI from lightning_app.api.request_types import APIRequest, CommandRequest +from lightning_app.utilities.app_helpers import Logger + +logger = Logger(__name__) def _signature_proxy_function(): @@ -51,6 +54,7 @@ def add_route(self, app: FastAPI, request_queue: Queue, responses_store: Dict[st async def _handle_request(*args, **kwargs): async def fn(*args, **kwargs): request_id = str(uuid4()).split("-")[0] + logger.debug(f"Processing request {request_id} for route: {self.route}") request_queue.put( request_cls( name=self.component_name, @@ -67,6 +71,8 @@ async def fn(*args, **kwargs): if (time.time() - t0) > self.timeout: raise Exception("The response was never received.") + logger.debug(f"Processed request {request_id} for route: {self.route}") + return responses_store.pop(request_id) return await asyncio.create_task(fn(*args, **kwargs)) diff --git a/src/lightning_app/cli/cmd_init.py b/src/lightning_app/cli/cmd_init.py index a7127cd6eb205..73469f90cc0ce 100644 --- a/src/lightning_app/cli/cmd_init.py +++ b/src/lightning_app/cli/cmd_init.py @@ -1,9 +1,10 @@ -import logging import os import re import shutil -logger = logging.getLogger(__name__) +from lightning_app.utilities.app_helpers import Logger + +logger = Logger(__name__) def app(app_name): diff --git a/src/lightning_app/cli/cmd_install.py b/src/lightning_app/cli/cmd_install.py index 8f0e45145e59a..a5325eabbcd56 100644 --- a/src/lightning_app/cli/cmd_install.py +++ b/src/lightning_app/cli/cmd_install.py @@ -1,4 +1,3 @@ -import logging import os import re import shutil @@ -9,8 +8,9 @@ from packaging.version import Version from lightning_app.core.constants import LIGHTNING_APPS_PUBLIC_REGISTRY, LIGHTNING_COMPONENT_PUBLIC_REGISTRY +from lightning_app.utilities.app_helpers import Logger -logger = logging.getLogger(__name__) +logger = Logger(__name__) def gallery_component(name, yes_arg, version_arg, cwd=None): diff --git a/src/lightning_app/cli/cmd_react_ui_init.py b/src/lightning_app/cli/cmd_react_ui_init.py index b60c788b7cd5a..9ac2a4f690e5d 100644 --- a/src/lightning_app/cli/cmd_react_ui_init.py +++ b/src/lightning_app/cli/cmd_react_ui_init.py @@ -1,10 +1,11 @@ -import logging import os import re import shutil import subprocess -logger = logging.getLogger(__name__) +from lightning_app.utilities.app_helpers import Logger + +logger = Logger(__name__) def react_ui(dest_dir=None): diff --git a/src/lightning_app/cli/lightning_cli.py b/src/lightning_app/cli/lightning_cli.py index 0ed23f6577097..9e11abafd62b6 100644 --- a/src/lightning_app/cli/lightning_cli.py +++ b/src/lightning_app/cli/lightning_cli.py @@ -1,4 +1,3 @@ -import logging import os import sys from pathlib import Path @@ -26,6 +25,7 @@ from lightning_app.core.constants import get_lightning_cloud_url from lightning_app.runners.runtime import dispatch from lightning_app.runners.runtime_type import RuntimeType +from lightning_app.utilities.app_helpers import Logger from lightning_app.utilities.app_logs import _app_logs_reader from 
lightning_app.utilities.cli_helpers import _arrow_time_callback, _format_input_env_variables from lightning_app.utilities.cloud import _get_project @@ -33,7 +33,7 @@ from lightning_app.utilities.login import Auth from lightning_app.utilities.network import LightningClient -logger = logging.getLogger(__name__) +logger = Logger(__name__) def get_app_url(runtime_type: RuntimeType, *args) -> str: @@ -405,7 +405,7 @@ def install(): def install_app(name, yes, version, overwrite: bool = False): if "github.com" in name: if version != "latest": - logger.warning( + logger.warn( f"The provided version {version} isn't the officially supported one. " f"The provided version will be ignored." ) @@ -428,7 +428,7 @@ def install_app(name, yes, version, overwrite: bool = False): def install_component(name, yes, version): if "github.com" in name: if version != "latest": - logger.warning( + logger.warn( f"The provided version {version} isn't the officially supported one. " f"The provided version will be ignored." ) diff --git a/src/lightning_app/cli/pl-app-template/core/callbacks.py b/src/lightning_app/cli/pl-app-template/core/callbacks.py index f324d10f1faa4..573672526cdbd 100644 --- a/src/lightning_app/cli/pl-app-template/core/callbacks.py +++ b/src/lightning_app/cli/pl-app-template/core/callbacks.py @@ -1,11 +1,11 @@ import inspect -import logging from typing import Any, Dict, TYPE_CHECKING, Union from core.state import ProgressBarState, TrainerState import pytorch_lightning as pl from lightning.app.storage import Path +from lightning_app.utilities.app_helpers import Logger from pytorch_lightning import Callback from pytorch_lightning.callbacks.progress.base import get_standard_metrics from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger @@ -14,7 +14,8 @@ if TYPE_CHECKING: from core.components.script_runner import ScriptRunner -_log = logging.getLogger(__name__) + +_log = Logger(__name__) class PLAppProgressTracker(Callback): diff --git a/src/lightning_app/components/python/popen.py b/src/lightning_app/components/python/popen.py index 7efc2b6d83c61..1453fd1be17ac 100644 --- a/src/lightning_app/components/python/popen.py +++ b/src/lightning_app/components/python/popen.py @@ -1,4 +1,3 @@ -import logging import os import signal import subprocess @@ -7,10 +6,10 @@ from typing import Dict, List, Optional, Union from lightning_app import LightningWork -from lightning_app.utilities.app_helpers import _collect_child_process_pids +from lightning_app.utilities.app_helpers import _collect_child_process_pids, Logger from lightning_app.utilities.tracer import Tracer -logger = logging.getLogger(__name__) +logger = Logger(__name__) class PopenPythonScript(LightningWork): diff --git a/src/lightning_app/components/python/tracer.py b/src/lightning_app/components/python/tracer.py index abc4609e044ef..b172f7bc4fcc8 100644 --- a/src/lightning_app/components/python/tracer.py +++ b/src/lightning_app/components/python/tracer.py @@ -1,4 +1,3 @@ -import logging import os import signal import sys @@ -8,11 +7,11 @@ from lightning_app import LightningWork from lightning_app.storage.drive import Drive from lightning_app.storage.payload import Payload -from lightning_app.utilities.app_helpers import _collect_child_process_pids +from lightning_app.utilities.app_helpers import _collect_child_process_pids, Logger from lightning_app.utilities.packaging.tarfile import clean_tarfile, extract_tarfile from lightning_app.utilities.tracer import Tracer -logger = logging.getLogger(__name__) +logger = Logger(__name__) class 
Code(TypedDict): diff --git a/src/lightning_app/components/serve/serve.py b/src/lightning_app/components/serve/serve.py index 818cbdf776f3a..935aca724268a 100644 --- a/src/lightning_app/components/serve/serve.py +++ b/src/lightning_app/components/serve/serve.py @@ -1,6 +1,5 @@ import abc import inspect -import logging import os import pydoc import subprocess @@ -15,8 +14,9 @@ from lightning_app import LightningWork from lightning_app.components.serve.types import _DESERIALIZER, _SERIALIZER +from lightning_app.utilities.app_helpers import Logger -logger = logging.getLogger(__name__) +logger = Logger(__name__) fastapi_service = FastAPI() diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py index 9773fe9670e52..78790f10a36d8 100644 --- a/src/lightning_app/components/training.py +++ b/src/lightning_app/components/training.py @@ -1,4 +1,3 @@ -import logging import os from typing import Any, Dict, List, Optional, Tuple, Type, Union @@ -6,8 +5,9 @@ from lightning_app import LightningFlow, structures from lightning_app.components.python import TracerPythonScript from lightning_app.storage.path import Path +from lightning_app.utilities.app_helpers import Logger -_logger = logging.getLogger(__name__) +_logger = Logger(__name__) class PyTorchLightningScriptRunner(TracerPythonScript): diff --git a/src/lightning_app/core/api.py b/src/lightning_app/core/api.py index 8b625713e0c2c..faf0aad061978 100644 --- a/src/lightning_app/core/api.py +++ b/src/lightning_app/core/api.py @@ -1,5 +1,4 @@ import asyncio -import logging import os import queue import sys @@ -24,7 +23,7 @@ from lightning_app.api.request_types import DeltaRequest from lightning_app.core.constants import FRONTEND_DIR from lightning_app.core.queues import RedisQueue -from lightning_app.utilities.app_helpers import InMemoryStateStore, StateStore +from lightning_app.utilities.app_helpers import InMemoryStateStore, Logger, StateStore from lightning_app.utilities.enum import OpenAPITags from lightning_app.utilities.imports import _is_redis_available, _is_starsessions_available @@ -58,7 +57,7 @@ class SessionMiddleware: # In the future, this would be abstracted to support horizontal scaling. 
responses_store = {} -logger = logging.getLogger(__name__) +logger = Logger(__name__) # This can be replaced with a consumer that publishes states in a kv-store diff --git a/src/lightning_app/core/app.py b/src/lightning_app/core/app.py index 65242a1ae0a2a..2bc0d7fc109d4 100644 --- a/src/lightning_app/core/app.py +++ b/src/lightning_app/core/app.py @@ -11,12 +11,18 @@ from deepdiff import DeepDiff, Delta import lightning_app +from lightning_app import _console from lightning_app.api.request_types import APIRequest, CommandRequest, DeltaRequest -from lightning_app.core.constants import FLOW_DURATION_SAMPLES, FLOW_DURATION_THRESHOLD, STATE_ACCUMULATE_WAIT +from lightning_app.core.constants import ( + DEBUG_ENABLED, + FLOW_DURATION_SAMPLES, + FLOW_DURATION_THRESHOLD, + STATE_ACCUMULATE_WAIT, +) from lightning_app.core.queues import BaseQueue, SingleProcessQueue from lightning_app.frontend import Frontend from lightning_app.storage.path import storage_root_dir -from lightning_app.utilities.app_helpers import _delta_to_app_state_delta, _LightningAppRef +from lightning_app.utilities.app_helpers import _delta_to_app_state_delta, _LightningAppRef, Logger from lightning_app.utilities.commands.base import _process_requests from lightning_app.utilities.component import _convert_paths_after_init from lightning_app.utilities.enum import AppStage, CacheCallsKeys @@ -30,7 +36,7 @@ if t.TYPE_CHECKING: from lightning_app.runners.backends.backend import Backend, WorkManager -logger = logging.getLogger(__name__) +logger = Logger(__name__) class LightningApp: @@ -116,8 +122,13 @@ def __init__( # is only available after all Flows and Works have been instantiated. _convert_paths_after_init(self.root) - if debug: - logging.getLogger().setLevel(logging.DEBUG) + # Lazily enable debugging. + if debug or DEBUG_ENABLED: + if not DEBUG_ENABLED: + os.environ["LIGHTNING_DEBUG"] = "2" + _console.setLevel(logging.DEBUG) + + logger.debug(f"ENV: {os.environ}") def get_component_by_name(self, component_name: str): """Returns the instance corresponding to the given component name.""" @@ -433,6 +444,8 @@ def _run(self) -> bool: self._has_updated = False + self._on_run_end() + return True def _update_layout(self) -> None: @@ -550,3 +563,8 @@ def on_run_once_end(self) -> None: # disable any flow schedules. 
for flow in self.flows: flow._disable_running_schedules() + + def _on_run_end(self): + if os.getenv("LIGHTNING_DEBUG") == "2": + del os.environ["LIGHTNING_DEBUG"] + _console.setLevel(logging.INFO) diff --git a/src/lightning_app/core/constants.py b/src/lightning_app/core/constants.py index 74a8dc17f141f..5caf497513837 100644 --- a/src/lightning_app/core/constants.py +++ b/src/lightning_app/core/constants.py @@ -30,7 +30,7 @@ LIGHTNING_CLOUD_PROJECT_ID = os.getenv("LIGHTNING_CLOUD_PROJECT_ID") LIGHTNING_CREDENTIAL_PATH = os.getenv("LIGHTNING_CREDENTIAL_PATH", str(Path.home() / ".lightning" / "credentials.json")) DOT_IGNORE_FILENAME = ".lightningignore" - +DEBUG_ENABLED = bool(int(os.getenv("LIGHTNING_DEBUG", "0"))) LIGHTNING_COMPONENT_PUBLIC_REGISTRY = "https://lightning.ai/v1/components" LIGHTNING_APPS_PUBLIC_REGISTRY = "https://lightning.ai/v1/apps" diff --git a/src/lightning_app/core/queues.py b/src/lightning_app/core/queues.py index 2b7295d7f327f..728320b5b7797 100644 --- a/src/lightning_app/core/queues.py +++ b/src/lightning_app/core/queues.py @@ -1,4 +1,3 @@ -import logging import multiprocessing import pickle import queue @@ -15,12 +14,13 @@ REDIS_WARNING_QUEUE_SIZE, STATE_UPDATE_TIMEOUT, ) +from lightning_app.utilities.app_helpers import Logger from lightning_app.utilities.imports import _is_redis_available, requires if _is_redis_available(): import redis -logger = logging.getLogger(__name__) +logger = Logger(__name__) READINESS_QUEUE_CONSTANT = "READINESS_QUEUE" diff --git a/src/lightning_app/frontend/panel/app_state_comm.py b/src/lightning_app/frontend/panel/app_state_comm.py index d9092628dd67f..851b955f6741d 100644 --- a/src/lightning_app/frontend/panel/app_state_comm.py +++ b/src/lightning_app/frontend/panel/app_state_comm.py @@ -4,7 +4,6 @@ from __future__ import annotations import asyncio -import logging import os from threading import Thread from typing import Callable @@ -12,8 +11,9 @@ import websockets from lightning_app.core.constants import APP_SERVER_PORT +from lightning_app.utilities.app_helpers import Logger -_logger = logging.getLogger(__name__) +_logger = Logger(__name__) _CALLBACKS = [] _THREAD: Thread = None diff --git a/src/lightning_app/frontend/panel/app_state_watcher.py b/src/lightning_app/frontend/panel/app_state_watcher.py index 49eee09ea80fb..b93a4c3eaf37c 100644 --- a/src/lightning_app/frontend/panel/app_state_watcher.py +++ b/src/lightning_app/frontend/panel/app_state_watcher.py @@ -7,15 +7,15 @@ """ from __future__ import annotations -import logging import os from lightning_app.frontend.panel.app_state_comm import watch_app_state from lightning_app.frontend.utils import _get_flow_state +from lightning_app.utilities.app_helpers import Logger from lightning_app.utilities.imports import _is_param_available, requires from lightning_app.utilities.state import AppState -_logger = logging.getLogger(__name__) +_logger = Logger(__name__) if _is_param_available(): diff --git a/src/lightning_app/frontend/panel/panel_frontend.py b/src/lightning_app/frontend/panel/panel_frontend.py index d89ed898751be..80e59786453da 100644 --- a/src/lightning_app/frontend/panel/panel_frontend.py +++ b/src/lightning_app/frontend/panel/panel_frontend.py @@ -2,7 +2,6 @@ from __future__ import annotations import inspect -import logging import os import pathlib import subprocess @@ -11,11 +10,12 @@ from lightning_app.frontend.frontend import Frontend from lightning_app.frontend.utils import _get_frontend_environment +from lightning_app.utilities.app_helpers import Logger from 
lightning_app.utilities.cloud import is_running_in_cloud from lightning_app.utilities.imports import requires from lightning_app.utilities.log import get_frontend_logfile -_logger = logging.getLogger("PanelFrontend") +_logger = Logger(__name__) def has_panel_autoreload() -> bool: diff --git a/src/lightning_app/runners/cloud.py b/src/lightning_app/runners/cloud.py index 2cd98ebe4cf68..59b342b166eb3 100644 --- a/src/lightning_app/runners/cloud.py +++ b/src/lightning_app/runners/cloud.py @@ -1,5 +1,4 @@ import fnmatch -import logging import os import random import string @@ -44,12 +43,13 @@ from lightning_app.runners.runtime import Runtime from lightning_app.source_code import LocalSourceCodeDir from lightning_app.storage import Drive +from lightning_app.utilities.app_helpers import Logger from lightning_app.utilities.cloud import _get_project from lightning_app.utilities.dependency_caching import get_hash from lightning_app.utilities.packaging.app_config import AppConfig, find_config_file from lightning_app.utilities.packaging.lightning_utils import _prepare_lightning_wheels_and_requirements -logger = logging.getLogger(__name__) +logger = Logger(__name__) @dataclass @@ -331,4 +331,4 @@ def _check_uploaded_folder(root: Path, repo: LocalSourceCodeDir) -> None: ) else: warning_msg += "\nYou can ignore some files or folders by adding them to `.lightningignore`." - logger.warning(warning_msg) + logger.warn(warning_msg) diff --git a/src/lightning_app/runners/runtime.py b/src/lightning_app/runners/runtime.py index 348e3ecca12a0..3de5a8556cdfb 100644 --- a/src/lightning_app/runners/runtime.py +++ b/src/lightning_app/runners/runtime.py @@ -1,4 +1,3 @@ -import logging import multiprocessing import sys from dataclasses import dataclass, field @@ -10,11 +9,12 @@ from lightning_app import LightningApp from lightning_app.core.constants import APP_SERVER_HOST, APP_SERVER_PORT from lightning_app.runners.backends import Backend, BackendType +from lightning_app.utilities.app_helpers import Logger from lightning_app.utilities.enum import AppStage, CacheCallsKeys, make_status, WorkStageStatus from lightning_app.utilities.load_app import load_app_from_file from lightning_app.utilities.proxies import WorkRunner -logger = logging.getLogger(__name__) +logger = Logger(__name__) def dispatch( diff --git a/src/lightning_app/source_code/copytree.py b/src/lightning_app/source_code/copytree.py index 5b85d1b54b3ed..12db97e80ff2e 100644 --- a/src/lightning_app/source_code/copytree.py +++ b/src/lightning_app/source_code/copytree.py @@ -1,13 +1,13 @@ import fnmatch -import logging import os from pathlib import Path from shutil import copy2, copystat, Error from typing import Callable, List, Set, Union from lightning_app.core.constants import DOT_IGNORE_FILENAME +from lightning_app.utilities.app_helpers import Logger -logger = logging.getLogger(__name__) +logger = Logger(__name__) def copytree( @@ -159,7 +159,7 @@ def _ignore_filename_spell_check(src: Path): possible_spelling_mistakes.extend([p.lstrip(".") for p in possible_spelling_mistakes]) for path in src.iterdir(): if path.is_file() and path.name in possible_spelling_mistakes: - logger.warning( + logger.warn( f"Lightning uses `{DOT_IGNORE_FILENAME}` as the ignore file but found {path.name} at " f"{path.parent} instead. If this was a mistake, please rename the file." 
) diff --git a/src/lightning_app/storage/copier.py b/src/lightning_app/storage/copier.py index f379207457bfd..8fed3d6a10f96 100644 --- a/src/lightning_app/storage/copier.py +++ b/src/lightning_app/storage/copier.py @@ -1,5 +1,4 @@ import concurrent.futures -import logging import pathlib import threading from threading import Thread @@ -12,9 +11,11 @@ from lightning_app.core.queues import BaseQueue from lightning_app.storage.path import filesystem from lightning_app.storage.requests import ExistsRequest, GetRequest +from lightning_app.utilities.app_helpers import Logger _PathRequest = Union[GetRequest, ExistsRequest] -_logger = logging.getLogger(__name__) + +_logger = Logger(__name__) num_workers = 8 diff --git a/src/lightning_app/storage/orchestrator.py b/src/lightning_app/storage/orchestrator.py index 3297666fa8421..c32fa9a7a1fbb 100644 --- a/src/lightning_app/storage/orchestrator.py +++ b/src/lightning_app/storage/orchestrator.py @@ -1,4 +1,3 @@ -import logging import threading import traceback from queue import Empty @@ -8,6 +7,7 @@ from lightning_app.core.queues import BaseQueue from lightning_app.storage.path import filesystem, path_to_work_artifact from lightning_app.storage.requests import ExistsRequest, ExistsResponse, GetRequest, GetResponse +from lightning_app.utilities.app_helpers import Logger from lightning_app.utilities.enum import WorkStageStatus if TYPE_CHECKING: @@ -16,7 +16,7 @@ _PathRequest = Union[GetRequest, ExistsRequest] _PathResponse = Union[ExistsResponse, GetResponse] -_logger = logging.getLogger(__name__) +_logger = Logger(__name__) class StorageOrchestrator(Thread): diff --git a/src/lightning_app/storage/path.py b/src/lightning_app/storage/path.py index 58060db308e25..35abc25191cb4 100644 --- a/src/lightning_app/storage/path.py +++ b/src/lightning_app/storage/path.py @@ -1,5 +1,4 @@ import hashlib -import logging import os import pathlib import shutil @@ -14,6 +13,7 @@ import lightning_app from lightning_app.core.queues import BaseQueue from lightning_app.storage.requests import ExistsRequest, ExistsResponse, GetRequest, GetResponse +from lightning_app.utilities.app_helpers import Logger from lightning_app.utilities.component import _is_flow_context from lightning_app.utilities.imports import _is_s3fs_available @@ -27,7 +27,7 @@ num_workers = 8 -_logger = logging.getLogger(__name__) +_logger = Logger(__name__) class Path(PathlibPath): diff --git a/src/lightning_app/storage/payload.py b/src/lightning_app/storage/payload.py index 082fea0fc9adc..0c7b111326890 100644 --- a/src/lightning_app/storage/payload.py +++ b/src/lightning_app/storage/payload.py @@ -1,5 +1,4 @@ import hashlib -import logging import pathlib import pickle from abc import ABC, abstractmethod @@ -10,9 +9,10 @@ from lightning_app.core.queues import BaseQueue from lightning_app.storage.path import filesystem, Path, shared_storage_path from lightning_app.storage.requests import ExistsRequest, ExistsResponse, GetRequest, GetResponse +from lightning_app.utilities.app_helpers import Logger from lightning_app.utilities.component import _is_flow_context -_logger = logging.getLogger(__name__) +_logger = Logger(__name__) class BasePayload(ABC): diff --git a/src/lightning_app/testing/testing.py b/src/lightning_app/testing/testing.py index 387592a4c178e..34ea7aa6b660c 100644 --- a/src/lightning_app/testing/testing.py +++ b/src/lightning_app/testing/testing.py @@ -1,6 +1,5 @@ import asyncio import json -import logging import os import shutil import subprocess @@ -9,6 +8,7 @@ import time import traceback from 
contextlib import contextmanager +from multiprocessing import Process from subprocess import Popen from time import sleep from typing import Any, Callable, Dict, Generator, List, Optional, Type @@ -28,6 +28,7 @@ from lightning_app.utilities.cloud import _get_project from lightning_app.utilities.enum import CacheCallsKeys from lightning_app.utilities.imports import _is_playwright_available, requires +from lightning_app.utilities.logs_socket_api import _LightningLogsSocketAPI from lightning_app.utilities.network import _configure_session, LightningClient from lightning_app.utilities.proxies import ProxyWorkRun @@ -36,7 +37,49 @@ from playwright.sync_api import HttpCredentials, sync_playwright -_logger = logging.getLogger(__name__) +def _on_error_callback(ws_app, *_): + print(traceback.format_exc()) + ws_app.close() + + +def print_logs(app_id: str): + client = LightningClient() + project = _get_project(client) + + works = client.lightningwork_service_list_lightningwork( + project_id=project.project_id, + app_id=app_id, + ).lightningworks + component_names = ["flow"] + [w.name for w in works] + + rich_colors = list(ANSI_COLOR_NAMES) + colors = {c: rich_colors[i + 1] for i, c in enumerate(component_names)} + + max_length = max(len(c.replace("root.", "")) for c in component_names) + identifiers = [] + + print("################### PRINTING LOGS ###################") + + logs_api_client = _LightningLogsSocketAPI(client.api_client) + + while True: + gen = _app_logs_reader( + logs_api_client=logs_api_client, + project_id=project.project_id, + app_id=app_id, + component_names=component_names, + follow=False, + on_error_callback=_on_error_callback, + ) + for log_event in gen: + message = log_event.message + identifier = f"{log_event.timestamp}{log_event.message}" + if identifier not in identifiers: + date = log_event.timestamp.strftime("%m/%d/%Y %H:%M:%S") + identifiers.append(identifier) + color = colors[log_event.component_name] + padding = (max_length - len(log_event.component_name)) * " " + print(f"[{color}]{log_event.component_name}{padding}[/{color}] {date} {message}") class LightningTestApp(LightningApp): @@ -178,12 +221,17 @@ def run_app_in_cloud(app_folder: str, app_name: str = "app.py", extra_args: [str # 2. Create the right application name. basename = app_folder.split("/")[-1] PR_NUMBER = os.getenv("PR_NUMBER", None) + TEST_APP_NAME = os.getenv("TEST_APP_NAME", basename) + os.environ["TEST_APP_NAME"] = TEST_APP_NAME + if PR_NUMBER: name = f"test-{PR_NUMBER}-{TEST_APP_NAME}-" + str(int(time.time())) else: name = f"test-{TEST_APP_NAME}-" + str(int(time.time())) + os.environ["LIGHTNING_APP_NAME"] = name + # 3. Disconnect from the App if any. Popen("lightning disconnect", shell=True).wait() @@ -191,6 +239,7 @@ def run_app_in_cloud(app_folder: str, app_name: str = "app.py", extra_args: [str with tempfile.TemporaryDirectory() as tmpdir: env_copy = os.environ.copy() env_copy["PACKAGE_LIGHTNING"] = "1" + env_copy["LIGHTNING_DEBUG"] = "1" shutil.copytree(app_folder, tmpdir, dirs_exist_ok=True) # TODO - add -no-cache to the command line. 
process = Popen( @@ -295,6 +344,26 @@ def run_app_in_cloud(app_folder: str, app_name: str = "app.py", extra_args: [str """ ) + client = LightningClient() + project = _get_project(client) + + lightning_apps = [ + app + for app in client.lightningapp_instance_service_list_lightningapp_instances( + project.project_id + ).lightningapps + if app.name == name + ] + + if not lightning_apps: + return True + + assert len(lightning_apps) == 1 + app_id = lightning_apps[0].id + + process = Process(target=print_logs, kwargs={"app_id": app_id}) + process.start() + while True: try: with admin_page.context.expect_page() as page_catcher: @@ -305,15 +374,12 @@ def run_app_in_cloud(app_folder: str, app_name: str = "app.py", extra_args: [str except (playwright._impl._api_types.Error, playwright._impl._api_types.TimeoutError): pass - client = LightningClient() - project = _get_project(client) - identifiers = [] - rich_colors = list(ANSI_COLOR_NAMES) + print(f"The Lightning Id Name : [bold magenta]{app_id}[/bold magenta]") + + logs_api_client = _LightningLogsSocketAPI(client.api_client) def fetch_logs(component_names: Optional[List[str]] = None) -> Generator: """This methods creates websockets connection in threads and returns the logs to the main thread.""" - app_id = admin_page.url.split("/")[-1] - if not component_names: works = client.lightningwork_service_list_lightningwork( project_id=project.project_id, @@ -321,64 +387,54 @@ def fetch_logs(component_names: Optional[List[str]] = None) -> Generator: ).lightningworks component_names = ["flow"] + [w.name for w in works] - def on_error_callback(ws_app, *_): - print(traceback.print_exc()) - ws_app.close() - - colors = {c: rich_colors[i + 1] for i, c in enumerate(component_names)} gen = _app_logs_reader( - client=client, + logs_api_client=logs_api_client, project_id=project.project_id, app_id=app_id, component_names=component_names, follow=False, - on_error_callback=on_error_callback, + on_error_callback=_on_error_callback, ) - max_length = max(len(c.replace("root.", "")) for c in component_names) for log_event in gen: - message = log_event.message - identifier = f"{log_event.timestamp}{log_event.message}" - if identifier not in identifiers: - date = log_event.timestamp.strftime("%m/%d/%Y %H:%M:%S") - identifiers.append(identifier) - color = colors[log_event.component_name] - padding = (max_length - len(log_event.component_name)) * " " - print(f"[{color}]{log_event.component_name}{padding}[/{color}] {date} {message}") - yield message - - # 7. 
Print your application ID - print( - f"The Lightning Id Name : [bold magenta]{str(view_page.url).split('.')[0].split('//')[-1]}[/bold magenta]" - ) + yield log_event.message try: yield admin_page, view_page, fetch_logs, name except KeyboardInterrupt: pass finally: - print("##################################################") - button = admin_page.locator('[data-cy="stop"]') - try: - button.wait_for(timeout=3 * 1000) - button.click() - except (playwright._impl._api_types.Error, playwright._impl._api_types.TimeoutError): - pass - context.close() - browser.close() - - list_lightningapps = client.lightningapp_instance_service_list_lightningapp_instances(project.project_id) - - for lightningapp in list_lightningapps.lightningapps: - if lightningapp.name != name: - continue + has_finished = False + while not has_finished: try: - res = client.lightningapp_instance_service_delete_lightningapp_instance( - project_id=project.project_id, - id=lightningapp.id, + button = admin_page.locator('[data-cy="stop"]') + try: + button.wait_for(timeout=3 * 1000) + button.click() + except (playwright._impl._api_types.Error, playwright._impl._api_types.TimeoutError): + pass + context.close() + browser.close() + + list_lightningapps = client.lightningapp_instance_service_list_lightningapp_instances( + project.project_id ) - assert res == {} - except ApiException as e: - print(f"Failed to delete {lightningapp.name}. Exception {e}") + + for lightningapp in list_lightningapps.lightningapps: + if lightningapp.name != name: + continue + try: + res = client.lightningapp_instance_service_delete_lightningapp_instance( + project_id=project.project_id, + id=lightningapp.id, + ) + assert res == {} + except ApiException as e: + print(f"Failed to delete {lightningapp.name}. Exception {e}") + + process.kill() + has_finished = True + except Exception: + pass Popen("lightning disconnect", shell=True).wait() diff --git a/src/lightning_app/utilities/app_helpers.py b/src/lightning_app/utilities/app_helpers.py index 0cc2d84ea58b7..b22e73016043e 100644 --- a/src/lightning_app/utilities/app_helpers.py +++ b/src/lightning_app/utilities/app_helpers.py @@ -25,7 +25,6 @@ from lightning_app.core.flow import LightningFlow from lightning_app.utilities.types import Component - logger = logging.getLogger(__name__) @@ -385,3 +384,34 @@ def default(self, obj: Any) -> Any: if callable(getattr(obj, "__json__", None)): return obj.__json__() return json.JSONEncoder.default(self, obj) + + +class Logger: + + """This class is used to improve the debugging experience.""" + + def __init__(self, name: str): + self.logger = logging.getLogger(name) + self.level = None + + def info(self, msg, *args, **kwargs): + self.logger.info(msg, *args, **kwargs) + + def warn(self, msg, *args, **kwargs): + self._set_level() + self.logger.warn(msg, *args, **kwargs) + + def debug(self, msg, *args, **kwargs): + self._set_level() + self.logger.debug(msg, *args, **kwargs) + + def error(self, msg, *args, **kwargs): + self._set_level() + self.logger.error(msg, *args, **kwargs) + + def _set_level(self): + """Lazily set the level once set by the users.""" + # Set on the first from either log, warn, debug or error call. 
+ if self.level is None: + self.level = logging.DEBUG if bool(int(os.getenv("LIGHTNING_DEBUG", "0"))) else logging.INFO + self.logger.setLevel(self.level) diff --git a/src/lightning_app/utilities/app_logs.py b/src/lightning_app/utilities/app_logs.py index 30533902cdeb8..0fbe359972852 100644 --- a/src/lightning_app/utilities/app_logs.py +++ b/src/lightning_app/utilities/app_logs.py @@ -10,7 +10,6 @@ from lightning_app.utilities.log_helpers import _error_callback, _OrderedLogEntry from lightning_app.utilities.logs_socket_api import _LightningLogsSocketAPI -from lightning_app.utilities.network import LightningClient @dataclass @@ -57,7 +56,7 @@ def callback(ws_app: WebSocketApp, msg: str): def _app_logs_reader( - client: LightningClient, + logs_api_client: _LightningLogsSocketAPI, project_id: str, app_id: str, component_names: List[str], @@ -66,7 +65,6 @@ def _app_logs_reader( ) -> Iterator[_LogEvent]: read_queue = queue.PriorityQueue() - logs_api_client = _LightningLogsSocketAPI(client.api_client) # We will use a socket per component log_sockets = [ diff --git a/src/lightning_app/utilities/load_app.py b/src/lightning_app/utilities/load_app.py index 0fff863bc43de..614944bc7e249 100644 --- a/src/lightning_app/utilities/load_app.py +++ b/src/lightning_app/utilities/load_app.py @@ -1,5 +1,4 @@ import inspect -import logging import os import sys import traceback @@ -12,7 +11,9 @@ if TYPE_CHECKING: from lightning_app import LightningApp, LightningFlow, LightningWork -logger = logging.getLogger(__name__) +from lightning_app.utilities.app_helpers import Logger + +logger = Logger(__name__) def load_app_from_file(filepath: str) -> "LightningApp": diff --git a/src/lightning_app/utilities/log_helpers.py b/src/lightning_app/utilities/log_helpers.py index 5938c443ae031..dadf49fc16c3e 100644 --- a/src/lightning_app/utilities/log_helpers.py +++ b/src/lightning_app/utilities/log_helpers.py @@ -1,11 +1,12 @@ -import logging from dataclasses import dataclass from datetime import datetime from json import JSONDecodeError from websocket import WebSocketApp -logger = logging.getLogger(__name__) +from lightning_app.utilities.app_helpers import Logger + +logger = Logger(__name__) # This is a superclass to inherit log entry classes from it: diff --git a/src/lightning_app/utilities/login.py b/src/lightning_app/utilities/login.py index bd70605738c35..4539ef805eafa 100644 --- a/src/lightning_app/utilities/login.py +++ b/src/lightning_app/utilities/login.py @@ -1,6 +1,5 @@ import base64 import json -import logging import os import pathlib from dataclasses import dataclass @@ -16,9 +15,10 @@ from starlette.responses import RedirectResponse from lightning_app.core.constants import get_lightning_cloud_url, LIGHTNING_CREDENTIAL_PATH +from lightning_app.utilities.app_helpers import Logger from lightning_app.utilities.network import find_free_network_port -logger = logging.getLogger(__name__) +logger = Logger(__name__) class Keys(Enum): @@ -165,7 +165,7 @@ async def save_token(request: Request, token="", key="", user_id: str = Query("" auth.save(token=token, username=user_id, user_id=user_id, api_key=key) logger.info("Authentication Successful") else: - logger.warning( + logger.warn( "Authentication Failed. This is most likely because you're using an older version of the CLI. 
\n" # noqa E501 "Please try to update the CLI or open an issue with this information \n" # E501 f"expected token in {request.query_params.items()}" diff --git a/src/lightning_app/utilities/network.py b/src/lightning_app/utilities/network.py index 050734723acc1..4ed4ea1591318 100644 --- a/src/lightning_app/utilities/network.py +++ b/src/lightning_app/utilities/network.py @@ -1,4 +1,3 @@ -import logging import socket import time from functools import wraps @@ -13,7 +12,9 @@ from requests.exceptions import ConnectionError, ConnectTimeout, ReadTimeout from urllib3.util.retry import Retry -logger = logging.getLogger(__name__) +from lightning_app.utilities.app_helpers import Logger + +logger = Logger(__name__) def find_free_network_port() -> int: diff --git a/src/lightning_app/utilities/packaging/build_config.py b/src/lightning_app/utilities/packaging/build_config.py index b776e202666de..c1dc95a07bc12 100644 --- a/src/lightning_app/utilities/packaging/build_config.py +++ b/src/lightning_app/utilities/packaging/build_config.py @@ -1,17 +1,17 @@ import inspect -import logging import os import re from dataclasses import asdict, dataclass from types import FrameType from typing import cast, List, Optional, TYPE_CHECKING, Union +from lightning_app.utilities.app_helpers import Logger + if TYPE_CHECKING: from lightning_app import LightningWork from lightning_app.utilities.packaging.cloud_compute import CloudCompute - -logger = logging.getLogger(__name__) +logger = Logger(__name__) def load_requirements( diff --git a/src/lightning_app/utilities/packaging/lightning_utils.py b/src/lightning_app/utilities/packaging/lightning_utils.py index 073d4d7ab613a..f0e87f63e674f 100644 --- a/src/lightning_app/utilities/packaging/lightning_utils.py +++ b/src/lightning_app/utilities/packaging/lightning_utils.py @@ -16,9 +16,10 @@ from lightning_app import _logger, _PROJECT_ROOT, _root_logger from lightning_app.__version__ import version from lightning_app.core.constants import PACKAGE_LIGHTNING +from lightning_app.utilities.app_helpers import Logger from lightning_app.utilities.git import check_github_repository, get_dir_name -logger = logging.getLogger(__name__) +logger = Logger(__name__) # FIXME(alecmerdler): Use GitHub release artifacts once the `lightning-ui` repo is public diff --git a/src/lightning_app/utilities/proxies.py b/src/lightning_app/utilities/proxies.py index a03fe45caa752..9691454bb2697 100644 --- a/src/lightning_app/utilities/proxies.py +++ b/src/lightning_app/utilities/proxies.py @@ -1,4 +1,3 @@ -import logging import os import pathlib import signal @@ -37,7 +36,9 @@ from lightning_app.core.queues import BaseQueue -logger = logging.getLogger(__name__) +from lightning_app.utilities.app_helpers import Logger + +logger = Logger(__name__) _state_observer_lock = threading.Lock() diff --git a/src/lightning_app/utilities/state.py b/src/lightning_app/utilities/state.py index 378c3e20ec14e..a882953ab0450 100644 --- a/src/lightning_app/utilities/state.py +++ b/src/lightning_app/utilities/state.py @@ -1,6 +1,5 @@ import enum import json -import logging import os from copy import deepcopy from typing import Any, Dict, List, Optional, Tuple, Union @@ -11,10 +10,10 @@ from lightning_app.core.constants import APP_SERVER_HOST, APP_SERVER_PORT from lightning_app.storage.drive import _maybe_create_drive -from lightning_app.utilities.app_helpers import AppStatePlugin, BaseStatePlugin +from lightning_app.utilities.app_helpers import AppStatePlugin, BaseStatePlugin, Logger from lightning_app.utilities.network import 
_configure_session -logger = logging.getLogger(__name__) +logger = Logger(__name__) # GLOBAL APP STATE _LAST_STATE = None diff --git a/tests/tests_app/core/test_lightning_app.py b/tests/tests_app/core/test_lightning_app.py index 3776481965be3..6b578166858a5 100644 --- a/tests/tests_app/core/test_lightning_app.py +++ b/tests/tests_app/core/test_lightning_app.py @@ -955,3 +955,23 @@ def test_non_updated_flow(caplog): MultiProcessRuntime(app, start_server=False).dispatch() assert caplog.messages == ["Hello World"] assert app.counter == 3 + + +def test_debug_mode_logging(): + """This test validates the DEBUG messages are collected when activated by the LightningApp(debug=True) and + cleanup once finished.""" + + from lightning_app.core.app import _console + + app = LightningApp(A4(), debug=True) + assert _console.level == logging.DEBUG + assert os.getenv("LIGHTNING_DEBUG") == "2" + + MultiProcessRuntime(app, start_server=False).dispatch() + + assert os.getenv("LIGHTNING_DEBUG") is None + assert _console.level == logging.INFO + + app = LightningApp(A4()) + assert _console.level == logging.INFO + MultiProcessRuntime(app, start_server=False).dispatch() diff --git a/tests/tests_app_examples/collect_failures/app.py b/tests/tests_app_examples/collect_failures/app.py index 89e302b2e6723..6675cff61dea9 100644 --- a/tests/tests_app_examples/collect_failures/app.py +++ b/tests/tests_app_examples/collect_failures/app.py @@ -43,4 +43,4 @@ def run(self): if __name__ == "__main__": - app = LightningApp(RootFlow()) + app = LightningApp(RootFlow(), debug=True) diff --git a/tests/tests_app_examples/conftest.py b/tests/tests_app_examples/conftest.py new file mode 100644 index 0000000000000..40d2db2e020d8 --- /dev/null +++ b/tests/tests_app_examples/conftest.py @@ -0,0 +1,44 @@ +import os + +from lightning_cloud.openapi.rest import ApiException + +from lightning_app.utilities.cloud import _get_project +from lightning_app.utilities.network import LightningClient + + +def pytest_timeout_cancel_timer(item): + """This hook deletes the Lightning App when timeout triggers.""" + + if item.name.endswith("_example_cloud"): + name = os.getenv("LIGHTNING_APP_NAME") + print(f"Timeout was triggered. Deleting the App {name}.") + + client = LightningClient() + project = _get_project(client) + + lightning_apps = [ + app + for app in client.lightningapp_instance_service_list_lightningapp_instances( + project.project_id + ).lightningapps + if app.name == name + ] + + if not lightning_apps: + return True + + assert len(lightning_apps) == 1 + + lightning_app = lightning_apps[0] + + try: + res = client.lightningapp_instance_service_delete_lightningapp_instance( + project_id=project.project_id, + id=lightning_app.id, + ) + assert res == {} + + except ApiException as e: + print(f"Failed to delete {name}. 
Exception {e}") + + return True diff --git a/tests/tests_app_examples/custom_work_dependencies/app.py b/tests/tests_app_examples/custom_work_dependencies/app.py index a821adf3fcffc..06e5f40d52aa6 100644 --- a/tests/tests_app_examples/custom_work_dependencies/app.py +++ b/tests/tests_app_examples/custom_work_dependencies/app.py @@ -49,4 +49,4 @@ def run(self): self._exit() -app = LightningApp(CustomWorkBuildConfigChecker()) +app = LightningApp(CustomWorkBuildConfigChecker(), debug=True) diff --git a/tests/tests_app_examples/idle_timeout/app.py b/tests/tests_app_examples/idle_timeout/app.py index 218c9e0174d08..e6086cd7907bd 100644 --- a/tests/tests_app_examples/idle_timeout/app.py +++ b/tests/tests_app_examples/idle_timeout/app.py @@ -68,4 +68,4 @@ def run(self): self._exit() -app = LightningApp(RootFlow()) +app = LightningApp(RootFlow(), debug=True) diff --git a/tests/tests_app_examples/test_commands_and_api.py b/tests/tests_app_examples/test_commands_and_api.py index 8fe3d024c8343..ab68374175d5c 100644 --- a/tests/tests_app_examples/test_commands_and_api.py +++ b/tests/tests_app_examples/test_commands_and_api.py @@ -9,6 +9,7 @@ from lightning_app.testing.testing import run_app_in_cloud +@pytest.mark.timeout(300) @pytest.mark.cloud def test_commands_and_api_example_cloud() -> None: with run_app_in_cloud(os.path.join(_PROJECT_ROOT, "examples/app_commands_and_api")) as ( @@ -21,16 +22,19 @@ def test_commands_and_api_example_cloud() -> None: app_id = admin_page.url.split("/")[-1] # 2: Connect to the App - Popen(f"lightning connect {app_id} -y", shell=True).wait() + Popen(f"python -m lightning connect {app_id} -y", shell=True).wait() # 3: Send the first command with the client - cmd = "lightning command with client --name=this" + cmd = "python -m lightning command with client --name=this" Popen(cmd, shell=True).wait() # 4: Send the second command without a client - cmd = "lightning command without client --name=is" + cmd = "python -m lightning command without client --name=is" Popen(cmd, shell=True).wait() + # This prevents some flakyness in the CI. Couldn't reproduce it locally. + sleep(5) + # 5: Send a request to the Rest API directly. 
base_url = view_page.url.replace("/view", "").replace("/child_flow", "") resp = requests.post(base_url + "/user/command_without_client?name=awesome") diff --git a/tests/tests_app_examples/test_v0_app.py b/tests/tests_app_examples/test_v0_app.py index 026c45a4e1ba1..f600b7eea3234 100644 --- a/tests/tests_app_examples/test_v0_app.py +++ b/tests/tests_app_examples/test_v0_app.py @@ -39,6 +39,7 @@ def check_content(button_name, text_content): locator = view_page.frame_locator("iframe").locator("div") locator.wait_for(timeout=3 * 1000) assert text_content in " ".join(locator.all_text_contents()) + print(f"Validated {button_name}") return True wait_for(view_page, check_content, "TAB_1", "Hello from component A") From 85ada5191d2ecd5eb57c8c135e73be273b372910 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 12 Sep 2022 16:47:44 +0200 Subject: [PATCH 106/193] Standalone Lite: Precision Plugins (#14547) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> --- src/lightning_lite/plugins/__init__.py | 14 +++ .../plugins/precision/__init__.py | 28 +++++ .../plugins/precision/deepspeed.py | 85 +++++++++++++++ .../plugins/precision/double.py | 36 +++++++ src/lightning_lite/plugins/precision/mixed.py | 26 +++++ .../plugins/precision/native_amp.py | 97 +++++++++++++++++ .../plugins/precision/precision.py | 100 ++++++++++++++++++ src/lightning_lite/plugins/precision/tpu.py | 34 ++++++ .../plugins/precision/tpu_bf16.py | 29 +++++ .../tests_lite/plugins/precision/__init__.py | 0 .../plugins/precision/test_deepspeed.py | 56 ++++++++++ .../plugins/precision/test_double.py | 25 +++++ .../plugins/precision/test_native_amp.py | 82 ++++++++++++++ .../plugins/precision/test_tpu_bf16.py | 23 ++++ 14 files changed, 635 insertions(+) create mode 100644 src/lightning_lite/plugins/precision/__init__.py create mode 100644 src/lightning_lite/plugins/precision/deepspeed.py create mode 100644 src/lightning_lite/plugins/precision/double.py create mode 100644 src/lightning_lite/plugins/precision/mixed.py create mode 100644 src/lightning_lite/plugins/precision/native_amp.py create mode 100644 src/lightning_lite/plugins/precision/precision.py create mode 100644 src/lightning_lite/plugins/precision/tpu.py create mode 100644 src/lightning_lite/plugins/precision/tpu_bf16.py create mode 100644 tests/tests_lite/plugins/precision/__init__.py create mode 100644 tests/tests_lite/plugins/precision/test_deepspeed.py create mode 100644 tests/tests_lite/plugins/precision/test_double.py create mode 100644 tests/tests_lite/plugins/precision/test_native_amp.py create mode 100644 tests/tests_lite/plugins/precision/test_tpu_bf16.py diff --git a/src/lightning_lite/plugins/__init__.py b/src/lightning_lite/plugins/__init__.py index ff7d31fbe7a91..3f716062e9fa7 100644 --- a/src/lightning_lite/plugins/__init__.py +++ b/src/lightning_lite/plugins/__init__.py @@ -16,10 +16,24 @@ from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO from lightning_lite.plugins.io.torch_plugin import TorchCheckpointIO from lightning_lite.plugins.io.xla_plugin import XLACheckpointIO +from lightning_lite.plugins.precision.deepspeed import DeepSpeedPrecision +from lightning_lite.plugins.precision.double import DoublePrecision +from lightning_lite.plugins.precision.mixed import MixedPrecision +from lightning_lite.plugins.precision.native_amp import NativeMixedPrecision +from 
lightning_lite.plugins.precision.precision import Precision +from lightning_lite.plugins.precision.tpu import TPUPrecision +from lightning_lite.plugins.precision.tpu_bf16 import TPUBf16Precision __all__ = [ "ClusterEnvironment", "CheckpointIO", "TorchCheckpointIO", "XLACheckpointIO", + "DeepSpeedPrecision", + "DoublePrecision", + "MixedPrecision", + "NativeMixedPrecision", + "Precision", + "TPUPrecision", + "TPUBf16Precision", ] diff --git a/src/lightning_lite/plugins/precision/__init__.py b/src/lightning_lite/plugins/precision/__init__.py new file mode 100644 index 0000000000000..28fdf80273bc8 --- /dev/null +++ b/src/lightning_lite/plugins/precision/__init__.py @@ -0,0 +1,28 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from lightning_lite.plugins.precision.deepspeed import DeepSpeedPrecision +from lightning_lite.plugins.precision.mixed import MixedPrecision +from lightning_lite.plugins.precision.native_amp import NativeMixedPrecision +from lightning_lite.plugins.precision.precision import Precision +from lightning_lite.plugins.precision.tpu import TPUPrecision +from lightning_lite.plugins.precision.tpu_bf16 import TPUBf16Precision + +__all__ = [ + "DeepSpeedPrecision", + "MixedPrecision", + "NativeMixedPrecision", + "Precision", + "TPUPrecision", + "TPUBf16Precision", +] diff --git a/src/lightning_lite/plugins/precision/deepspeed.py b/src/lightning_lite/plugins/precision/deepspeed.py new file mode 100644 index 0000000000000..8610121863195 --- /dev/null +++ b/src/lightning_lite/plugins/precision/deepspeed.py @@ -0,0 +1,85 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any, Optional, TYPE_CHECKING, Union + +from lightning_utilities.core.imports import RequirementCache +from torch import Tensor +from torch.optim import LBFGS, Optimizer + +from lightning_lite.plugins.precision.precision import Precision +from lightning_lite.utilities.enums import AMPType, PrecisionType +from lightning_lite.utilities.imports import _APEX_AVAILABLE + +_DEEPSPEED_AVAILABLE = RequirementCache("deepspeed") +if TYPE_CHECKING and _DEEPSPEED_AVAILABLE: + import deepspeed + + +class DeepSpeedPrecision(Precision): + """Precision plugin for DeepSpeed integration. + + Args: + precision: Double precision (64), full precision (32), half precision (16) or bfloat16 precision (bf16). + amp_type: The mixed precision backend to use ("native" or "apex"). + amp_level: The optimization level to use (O1, O2, etc...). 
By default it will be set to "O2" + if ``amp_type`` is set to "apex". + + Raises: + MisconfigurationException: + If using ``bfloat16`` precision and ``deepspeed None: + if amp_type == AMPType.APEX: + if not _APEX_AVAILABLE: + raise ImportError( + "You have asked for Apex AMP but `apex` is not installed." + " Install `apex` using this guide: https://github.com/NVIDIA/apex" + ) + + amp_level = amp_level or "O2" + + supported_precision = (PrecisionType.HALF, PrecisionType.FLOAT, PrecisionType.BFLOAT) + if precision not in supported_precision: + raise ValueError( + f"`precision={precision!r})` is not supported in DeepSpeed." + f" `precision` must be one of: {(x.value for x in supported_precision)}." + ) + + super().__init__() + self.precision = precision + self.amp_type = amp_type + self.amp_level = amp_level + + def backward(self, tensor: Tensor, model: Optional["deepspeed.DeepSpeedEngine"], *args: Any, **kwargs: Any) -> None: + """Performs back-propagation using DeepSpeed's engine.""" + if model is None: + raise ValueError("Please provide the model as input to `backward`.") + model.backward(tensor, *args, **kwargs) + + def optimizer_step( + self, + optimizer: Optimizer, + model: Optional["deepspeed.DeepSpeedEngine"] = None, + **kwargs: Any, + ) -> Any: + if isinstance(optimizer, LBFGS): + raise TypeError("DeepSpeed and the LBFGS optimizer are not compatible.") + if model is None: + raise TypeError("`optimizer_step()` requires a reference to the model.") + # DeepSpeed handles the optimizer step internally + return model.step(**kwargs) diff --git a/src/lightning_lite/plugins/precision/double.py b/src/lightning_lite/plugins/precision/double.py new file mode 100644 index 0000000000000..13f5909deac9d --- /dev/null +++ b/src/lightning_lite/plugins/precision/double.py @@ -0,0 +1,36 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from contextlib import contextmanager +from typing import Generator + +import torch + +from lightning_lite.plugins.precision import Precision + + +class DoublePrecision(Precision): + """Plugin for training with double (``torch.float64``) precision.""" + + precision: int = 64 + + @contextmanager + def forward_context(self) -> Generator[None, None, None]: + """A context manager to change the default tensor type. + + See: :meth:`torch.set_default_tensor_type` + """ + default_dtype = torch.get_default_dtype() + torch.set_default_dtype(torch.float64) + yield + torch.set_default_dtype(default_dtype) diff --git a/src/lightning_lite/plugins/precision/mixed.py b/src/lightning_lite/plugins/precision/mixed.py new file mode 100644 index 0000000000000..140096f98c6e1 --- /dev/null +++ b/src/lightning_lite/plugins/precision/mixed.py @@ -0,0 +1,26 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING, Union + +from lightning_lite.plugins.precision.precision import Precision + +if TYPE_CHECKING: + from lightning_lite.utilities import AMPType + + +class MixedPrecision(Precision): + """Base Class for mixed precision.""" + + backend: "AMPType" + precision: Union[str, int] = "mixed" diff --git a/src/lightning_lite/plugins/precision/native_amp.py b/src/lightning_lite/plugins/precision/native_amp.py new file mode 100644 index 0000000000000..bd54cdb846299 --- /dev/null +++ b/src/lightning_lite/plugins/precision/native_amp.py @@ -0,0 +1,97 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from contextlib import contextmanager +from typing import Any, Dict, Generator, Optional, Union + +import torch +from torch import Tensor +from torch.nn import Module +from torch.optim import LBFGS, Optimizer + +from lightning_lite.plugins.precision.mixed import MixedPrecision +from lightning_lite.utilities.enums import AMPType +from lightning_lite.utilities.imports import _TORCH_GREATER_EQUAL_1_10 + +if _TORCH_GREATER_EQUAL_1_10: + from torch import autocast as new_autocast +else: + from torch.cuda.amp import autocast as old_autocast + + +class NativeMixedPrecision(MixedPrecision): + """Plugin for Native Mixed Precision (AMP) training with ``torch.autocast``. + + Args: + precision: Whether to use ``torch.float16`` (``16``) or ``torch.bfloat16`` (``'bf16'``). + device: The device for ``torch.autocast``. + scaler: An optional :class:`torch.cuda.amp.GradScaler` to use. 
+ """ + + backend = AMPType.NATIVE + + def __init__( + self, precision: Union[str, int], device: str, scaler: Optional[torch.cuda.amp.GradScaler] = None + ) -> None: + super().__init__() + if precision == "bf16" and not _TORCH_GREATER_EQUAL_1_10: + raise ImportError("To use bfloat16 with native amp you must install torch greater or equal to 1.10.") + if scaler is None and precision == 16: + scaler = torch.cuda.amp.GradScaler() + if scaler is not None and precision == "bf16": + raise ValueError(f"`precision='bf16'` does not use a scaler, found {scaler}.") + self.precision = precision + self.device = device + self.scaler = scaler + + @contextmanager + def forward_context(self) -> Generator[None, None, None]: + with self._autocast_context_manager(): + yield + + def backward(self, tensor: Tensor, model: Optional[Module], *args: Any, **kwargs: Any) -> None: + if self.scaler is not None: + tensor = self.scaler.scale(tensor) + super().backward(tensor, model, *args, **kwargs) + + def optimizer_step( + self, + optimizer: Optimizer, + model: Optional[Module] = None, + **kwargs: Any, + ) -> Any: + if self.scaler is None: + # skip scaler logic, as bfloat16 does not require scaler + return super().optimizer_step(optimizer, model=model, **kwargs) + if isinstance(optimizer, LBFGS): + raise TypeError("Native AMP and the LBFGS optimizer are not compatible.") + # note: the scaler will skip the `optimizer.step` if nonfinite gradients are found + step_output = self.scaler.step(optimizer, **kwargs) + self.scaler.update() + return step_output + + def state_dict(self) -> Dict[str, Any]: + if self.scaler is not None: + return self.scaler.state_dict() + return {} + + def load_state_dict(self, state_dict: Dict[str, Any]) -> None: + if self.scaler is not None: + self.scaler.load_state_dict(state_dict) + + def _autocast_context_manager(self) -> Union["old_autocast", "new_autocast"]: + if _TORCH_GREATER_EQUAL_1_10: + # the dtype could be automatically inferred but we need to manually set it due to a bug upstream + # https://github.com/pytorch/pytorch/issues/67233 + return new_autocast(self.device, dtype=torch.bfloat16 if self.precision == "bf16" else torch.half) + return old_autocast() diff --git a/src/lightning_lite/plugins/precision/precision.py b/src/lightning_lite/plugins/precision/precision.py new file mode 100644 index 0000000000000..881db89e88f56 --- /dev/null +++ b/src/lightning_lite/plugins/precision/precision.py @@ -0,0 +1,100 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import contextlib +from typing import Any, Dict, Generator, Optional, Union + +from torch import Tensor +from torch.nn import Module +from torch.optim import Optimizer + +from lightning_lite.utilities.types import _PARAMETERS + + +class Precision: + """Base class for all plugins handling the precision-specific parts of the training. + + The class attribute precision must be overwritten in child classes. The default value reflects fp32 training. 
+ """ + + precision: Union[str, int] = 32 + + @contextlib.contextmanager + def forward_context(self) -> Generator[None, None, None]: + """A contextmanager for managing model forward/training_step/evaluation_step/predict_step.""" + yield + + def pre_backward(self, tensor: Tensor, module: Optional[Module]) -> None: + """Runs before precision plugin executes backward. + + Args: + tensor: The tensor that will be used for backpropagation + module: The module that was involved in producing the tensor and whose parameters need the gradients + """ + + def backward(self, tensor: Tensor, model: Optional[Module], *args: Any, **kwargs: Any) -> None: + """Performs the actual backpropagation. + + Args: + tensor: The tensor that will be used for backpropagation + model: The module that was involved in producing the tensor and whose parameters need the gradients + """ + tensor.backward(*args, **kwargs) + + def post_backward(self, tensor: Tensor, module: Optional[Module]) -> None: + """Runs after precision plugin executes backward. + + Args: + tensor: The tensor that will be used for backpropagation + module: The module that was involved in producing the tensor and whose parameters need the gradients + """ + + def optimizer_step( + self, + optimizer: Optimizer, + model: Optional[Module] = None, + **kwargs: Any, + ) -> Any: + """Hook to run the optimizer step.""" + return optimizer.step(**kwargs) + + def get_main_params(self, optimizer: Optimizer) -> _PARAMETERS: + """The main params of the model. + + Returns the plain model params here. Maybe different in other precision plugins. + """ + for group in optimizer.param_groups: + yield from group["params"] + + def state_dict(self) -> Dict[str, Any]: + """Called when saving a checkpoint, implement to generate precision plugin state_dict. + + Returns: + A dictionary containing precision plugin state. + """ + return {} + + def load_state_dict(self, state_dict: Dict[str, Any]) -> None: + """Called when loading a checkpoint, implement to reload precision plugin state given precision plugin + state_dict. + + Args: + state_dict: the precision plugin state returned by ``state_dict``. + """ + pass + + def teardown(self) -> None: + """This method is called to teardown the training process. + + It is the right place to release memory and free other resources. + """ diff --git a/src/lightning_lite/plugins/precision/tpu.py b/src/lightning_lite/plugins/precision/tpu.py new file mode 100644 index 0000000000000..480b491f2e120 --- /dev/null +++ b/src/lightning_lite/plugins/precision/tpu.py @@ -0,0 +1,34 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import Any, Optional + +from torch.nn import Module +from torch.optim import Optimizer + +from lightning_lite.plugins.precision.precision import Precision + + +class TPUPrecision(Precision): + """Precision plugin for TPU integration.""" + + def optimizer_step( + self, + optimizer: Optimizer, + model: Optional[Module] = None, + **kwargs: Any, + ) -> Any: + + import torch_xla.core.xla_model as xm + + return xm.optimizer_step(optimizer, optimizer_args=kwargs) diff --git a/src/lightning_lite/plugins/precision/tpu_bf16.py b/src/lightning_lite/plugins/precision/tpu_bf16.py new file mode 100644 index 0000000000000..d388a9ae175ac --- /dev/null +++ b/src/lightning_lite/plugins/precision/tpu_bf16.py @@ -0,0 +1,29 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +from lightning_lite.plugins.precision import TPUPrecision + + +class TPUBf16Precision(TPUPrecision): + """Plugin that enables bfloats on TPUs.""" + + precision: str = "bf16" + + def __init__(self) -> None: + super().__init__() + os.environ["XLA_USE_BF16"] = "1" + + def teardown(self) -> None: + os.environ.pop("XLA_USE_BF16", None) diff --git a/tests/tests_lite/plugins/precision/__init__.py b/tests/tests_lite/plugins/precision/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/tests_lite/plugins/precision/test_deepspeed.py b/tests/tests_lite/plugins/precision/test_deepspeed.py new file mode 100644 index 0000000000000..811e90c147807 --- /dev/null +++ b/tests/tests_lite/plugins/precision/test_deepspeed.py @@ -0,0 +1,56 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from unittest import mock +from unittest.mock import Mock + +import pytest + +from lightning_lite.plugins.precision.deepspeed import DeepSpeedPrecision + + +def test_invalid_precision_with_deepspeed_precision(): + with pytest.raises(ValueError, match="is not supported in DeepSpeed. 
`precision` must be one of"): + DeepSpeedPrecision(precision=64, amp_type="native") + + +def test_deepspeed_precision_apex_not_installed(monkeypatch): + import lightning_lite.plugins.precision.deepspeed as deepspeed_apex + + monkeypatch.setattr(deepspeed_apex, "_APEX_AVAILABLE", False) + with pytest.raises(ImportError, match="You have asked for Apex AMP but `apex` is not installed."): + DeepSpeedPrecision(precision=16, amp_type="apex") + + +@mock.patch("lightning_lite.plugins.precision.deepspeed._APEX_AVAILABLE", return_value=True) +def test_deepspeed_precision_apex_default_level(_): + precision_plugin = DeepSpeedPrecision(precision=16, amp_type="apex") + assert isinstance(precision_plugin, DeepSpeedPrecision) + assert precision_plugin.amp_level == "O2" + + +def test_deepspeed_precision_backward(): + precision_plugin = DeepSpeedPrecision(precision=32, amp_type="native") + tensor = Mock() + model = Mock() + precision_plugin.backward(tensor, model, "positional-arg", keyword="arg") + model.backward.assert_called_once_with(tensor, "positional-arg", keyword="arg") + + +def test_deepspeed_precision_optimizer_step(): + precision_plugin = DeepSpeedPrecision(precision=32, amp_type="native") + optimizer = Mock() + model = Mock() + precision_plugin.optimizer_step(optimizer, model=model, lr_kwargs=dict()) + model.step.assert_called_once_with(lr_kwargs=dict()) + optimizer.step.assert_not_called() diff --git a/tests/tests_lite/plugins/precision/test_double.py b/tests/tests_lite/plugins/precision/test_double.py new file mode 100644 index 0000000000000..cbbc88565d681 --- /dev/null +++ b/tests/tests_lite/plugins/precision/test_double.py @@ -0,0 +1,25 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +from lightning_lite.plugins.precision.double import DoublePrecision + + +def test_double_precision_forward_context(): + precision_plugin = DoublePrecision() + assert torch.get_default_dtype() == torch.float32 + with precision_plugin.forward_context(): + assert torch.get_default_dtype() == torch.float64 + assert torch.get_default_dtype() == torch.float32 diff --git a/tests/tests_lite/plugins/precision/test_native_amp.py b/tests/tests_lite/plugins/precision/test_native_amp.py new file mode 100644 index 0000000000000..9f23ffebbcde7 --- /dev/null +++ b/tests/tests_lite/plugins/precision/test_native_amp.py @@ -0,0 +1,82 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
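
Alongside `backward` and `optimizer_step`, the plugins expose `state_dict`/`load_state_dict` so a checkpoint can carry the AMP scaler state across runs. A small sketch of that round trip, assuming fp16 `NativeMixedPrecision` on a CUDA machine; the file name and checkpoint layout are illustrative only:

    import torch
    from lightning_lite.plugins.precision.native_amp import NativeMixedPrecision

    plugin = NativeMixedPrecision(precision=16, device="cuda")
    # ... training steps update the GradScaler's internal scale ...
    torch.save({"precision_plugin": plugin.state_dict()}, "checkpoint.pt")  # scaler state for fp16, {} otherwise

    restored = NativeMixedPrecision(precision=16, device="cuda")
    restored.load_state_dict(torch.load("checkpoint.pt")["precision_plugin"])
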
+from unittest import mock +from unittest.mock import Mock + +import pytest +import torch +from tests_lite.helpers.runif import RunIf + +from lightning_lite.plugins.precision.native_amp import NativeMixedPrecision + + +def test_native_amp_precision_default_scaler(): + precision_plugin = NativeMixedPrecision(precision=16, device=Mock()) + assert isinstance(precision_plugin.scaler, torch.cuda.amp.GradScaler) + + +@mock.patch("lightning_lite.plugins.precision.native_amp._TORCH_GREATER_EQUAL_1_10", True) +def test_native_amp_precision_scaler_with_bf16(): + with pytest.raises(ValueError, match="`precision='bf16'` does not use a scaler"): + NativeMixedPrecision(precision="bf16", device=Mock(), scaler=Mock()) + + precision_plugin = NativeMixedPrecision(precision="bf16", device=Mock()) + assert precision_plugin.scaler is None + + +@mock.patch("lightning_lite.plugins.precision.native_amp._TORCH_GREATER_EQUAL_1_10", False) +def test_native_amp_precision_bf16_min_torch(): + with pytest.raises(ImportError, match="you must install torch greater or equal to 1.10"): + NativeMixedPrecision(precision="bf16", device=Mock()) + + +@RunIf(min_torch="1.10") +def test_native_amp_precision_forward_context(): + precision_plugin = NativeMixedPrecision(precision="mixed", device="cuda") + assert torch.get_default_dtype() == torch.float32 + with precision_plugin.forward_context(): + assert torch.get_autocast_gpu_dtype() == torch.float16 + + +def test_native_amp_precision_backward(): + precision_plugin = NativeMixedPrecision(precision="mixed", device="cuda") + precision_plugin.scaler = Mock() + precision_plugin.scaler.scale = Mock(side_effect=(lambda x: x)) + tensor = Mock() + model = Mock() + precision_plugin.backward(tensor, model, "positional-arg", keyword="arg") + precision_plugin.scaler.scale.assert_called_once_with(tensor) + tensor.backward.assert_called_once_with("positional-arg", keyword="arg") + + +def test_native_amp_precision_optimizer_step_with_scaler(): + precision_plugin = NativeMixedPrecision(precision="mixed", device="cuda") + precision_plugin.scaler = Mock() + optimizer = Mock() + model = Mock() + + precision_plugin.optimizer_step(optimizer, model=model, keyword="arg") + precision_plugin.scaler.step.assert_called_once_with(optimizer, keyword="arg") + precision_plugin.scaler.update.assert_called_once() + + +@RunIf(min_torch="1.10") +def test_native_amp_precision_optimizer_step_without_scaler(): + precision_plugin = NativeMixedPrecision(precision="bf16", device="cuda") + assert precision_plugin.scaler is None + optimizer = Mock() + model = Mock() + + precision_plugin.optimizer_step(optimizer, model=model, keyword="arg") + optimizer.step.assert_called_once_with(keyword="arg") diff --git a/tests/tests_lite/plugins/precision/test_tpu_bf16.py b/tests/tests_lite/plugins/precision/test_tpu_bf16.py new file mode 100644 index 0000000000000..e7f3b9a0c075b --- /dev/null +++ b/tests/tests_lite/plugins/precision/test_tpu_bf16.py @@ -0,0 +1,23 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import os + +from lightning_lite.plugins import TPUBf16Precision + + +def test_teardown(): + plugin = TPUBf16Precision() + assert os.environ.get("XLA_USE_BF16") == "1" + plugin.teardown() + assert "XLA_USE_BF16" not in os.environ From dfa570ef9fa8c292b4d426e24b053618d521d1c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Mon, 12 Sep 2022 17:25:54 +0200 Subject: [PATCH 107/193] Run CircleCI with the HEAD sha, not the base (#14625) * Run CircleCI with the HEAD sha, not the base * Different solution --- .circleci/config.yml | 2 +- .github/workflows/ci-circleci.yml | 3 +++ dockers/tpu-tests/tpu_test_cases.jsonnet | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 57d318bc240b6..05c901eee0e82 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -50,8 +50,8 @@ references: run: name: Update jsonnet command: | - export PR_NUMBER=$(git ls-remote origin "pull/*/head" | grep -F -f <(git rev-parse HEAD) | awk -F'/' '{print $3}') export SHA=$(git rev-parse --short HEAD) + export PR_NUMBER=$(git ls-remote origin "pull/*/head" | grep -F -f $SHA | awk -F'/' '{print $3}') python -c "fname = 'dockers/tpu-tests/tpu_test_cases.jsonnet' ; data = open(fname).read().replace('{PYTORCH_VERSION}', '$XLA_VER') data = data.replace('{PYTHON_VERSION}', '$PYTHON_VER').replace('{PR_NUMBER}', '$PR_NUMBER').replace('{SHA}', '$SHA') ; open(fname, 'w').write(data)" cat dockers/tpu-tests/tpu_test_cases.jsonnet diff --git a/.github/workflows/ci-circleci.yml b/.github/workflows/ci-circleci.yml index 751c3f9edc5f2..d1ff85e45e0dd 100644 --- a/.github/workflows/ci-circleci.yml +++ b/.github/workflows/ci-circleci.yml @@ -23,6 +23,9 @@ jobs: trigger-circleci: runs-on: ubuntu-latest steps: + - uses: actions/checkout@v3 + with: + ref: ${{ github.event.pull_request.head.sha }} - uses: CircleCI-Public/trigger-circleci-pipeline-action@v1.0.5 env: CCI_TOKEN: ${{ secrets.CCI_TOKEN }} diff --git a/dockers/tpu-tests/tpu_test_cases.jsonnet b/dockers/tpu-tests/tpu_test_cases.jsonnet index 1f6bf4c41b324..51dec56c7e134 100644 --- a/dockers/tpu-tests/tpu_test_cases.jsonnet +++ b/dockers/tpu-tests/tpu_test_cases.jsonnet @@ -28,7 +28,7 @@ local tputests = base.BaseTest { cd lightning echo $PWD git ls-remote --refs origin - git fetch origin "refs/pull/{PR_NUMBER}/head:pr/{PR_NUMBER}" && git checkout "pr/{PR_NUMBER}" + git fetch origin "refs/pull/{PR_NUMBER}/head" git checkout {SHA} export PACKAGE_NAME=pytorch export FREEZE_REQUIREMENTS=1 From 3b39c7eb9caae75225d6c6f193248c4d3356ba29 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 12 Sep 2022 17:58:18 +0200 Subject: [PATCH 108/193] set next App dev (#14609) --- src/lightning_app/CHANGELOG.md | 29 +++++++++++++++++++++++++---- src/lightning_app/__version__.py | 2 +- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 6fd47662f3a88..a6a3d2e9c37c4 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -5,11 +5,26 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
-## [0.7.0] - 2022-MM-DD +## [UnReleased] - 2022-MM-DD ### Added -- Adds `PanelFrontend` to easily create complex UI in Python ([#13531](https://github.com/Lightning-AI/lightning/pull/13531)) +- + + +### Changed + +- + + +### Deprecated + +- + + +### Removed + +- ### Fixed @@ -17,10 +32,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Resolved `LightningApp(..., debug=True)` ([#14464](https://github.com/Lightning-AI/lightning/pull/14464)) -## [0.6.0] - 2022-08-23 +## [0.6.0] - 2022-09-08 ### Added +- Introduce lightning connect ([#14452](https://github.com/Lightning-AI/lightning/pull/14452)) +- Adds `PanelFrontend` to easily create complex UI in Python ([#13531](https://github.com/Lightning-AI/lightning/pull/13531)) - Add support for `Lightning App Commands` through the `configure_commands` hook on the Lightning Flow and the `ClientCommand` ([#13602](https://github.com/Lightning-AI/lightning/pull/13602)) - Add support for Lightning AI BYOC cluster management ([#13835](https://github.com/Lightning-AI/lightning/pull/13835)) - Add support to see Lightning AI BYOC cluster logs ([#14334](https://github.com/Lightning-AI/lightning/pull/14334)) @@ -30,16 +47,20 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Add support for printing application logs using CLI `lightning show logs [components]` ([#13634](https://github.com/Lightning-AI/lightning/pull/13634)) - Add support for `Lightning API` through the `configure_api` hook on the Lightning Flow and the `Post`, `Get`, `Delete`, `Put` HttpMethods ([#13945](https://github.com/Lightning-AI/lightning/pull/13945)) - Added a warning when `configure_layout` returns URLs configured with http instead of https ([#14233](https://github.com/Lightning-AI/lightning/pull/14233)) +- Add `--app_args` support from the CLI ([#13625](https://github.com/Lightning-AI/lightning/pull/13625)) ### Changed - Default values and parameter names for Lightning AI BYOC cluster management ([#14132](https://github.com/Lightning-AI/lightning/pull/14132)) - Run the flow only if the state has changed from the previous execution ([#14076](https://github.com/Lightning-AI/lightning/pull/14076)) +- Increased DeepDiff's verbose level to properly handle dict changes ([#13960](https://github.com/Lightning-AI/lightning/pull/13960)) +- Setup: added requirement freeze for next major version ([#14480](https://github.com/Lightning-AI/lightning/pull/14480)) ### Fixed - Unification of app template: moved `app.py` to root dir for `lightning init app ` template ([#13853](https://github.com/Lightning-AI/lightning/pull/13853)) -- Fixing an issue with `lightning --version` command ([#14433](https://github.com/Lightning-AI/lightning/pull/14433)) +- Fixed an issue with `lightning --version` command ([#14433](https://github.com/Lightning-AI/lightning/pull/14433)) +- Fixed imports of collections.abc for py3.10 ([#14345](https://github.com/Lightning-AI/lightning/pull/14345)) ## [0.5.7] - 2022-08-22 diff --git a/src/lightning_app/__version__.py b/src/lightning_app/__version__.py index af4963b4c66b2..2696386c149ac 100644 --- a/src/lightning_app/__version__.py +++ b/src/lightning_app/__version__.py @@ -1 +1 @@ -version = "0.6.0rc0" +version = "0.7.0dev" From d013bcc5bf23475578f9f3699f33824596174110 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 12 Sep 2022 18:00:14 +0200 Subject: [PATCH 109/193] Standalone Lite: Accelerators (#14578) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí Co-authored-by: Jirka Borovec --- src/lightning_lite/accelerators/__init__.py | 22 ++++++ .../accelerators/accelerator.py | 56 ++++++++++++++ src/lightning_lite/accelerators/cpu.py | 65 ++++++++++++++++ src/lightning_lite/accelerators/cuda.py | 64 ++++++++++++++++ src/lightning_lite/accelerators/mps.py | 74 +++++++++++++++++++ .../accelerators/registry.py | 11 +-- src/lightning_lite/accelerators/tpu.py | 59 +++++++++++++++ src/lightning_lite/utilities/device_parser.py | 6 +- src/lightning_lite/utilities/imports.py | 1 - .../accelerators/__init__.py | 6 +- .../accelerators/accelerator.py | 60 +++++---------- src/pytorch_lightning/accelerators/cpu.py | 10 ++- src/pytorch_lightning/accelerators/cuda.py | 17 ++--- src/pytorch_lightning/accelerators/hpu.py | 13 ++-- src/pytorch_lightning/accelerators/ipu.py | 11 ++- src/pytorch_lightning/accelerators/mps.py | 22 +++--- src/pytorch_lightning/accelerators/tpu.py | 16 ++-- src/pytorch_lightning/lite/lite.py | 2 +- src/pytorch_lightning/strategies/strategy.py | 2 +- .../connectors/accelerator_connector.py | 2 +- tests/tests_lite/accelerators/__init__.py | 0 tests/tests_lite/accelerators/test_cpu.py | 43 +++++++++++ tests/tests_lite/accelerators/test_cuda.py | 53 +++++++++++++ tests/tests_lite/accelerators/test_mps.py | 49 ++++++++++++ .../tests_lite/accelerators/test_registry.py | 73 ++++++++++++++++++ tests/tests_lite/accelerators/test_tpu.py | 40 ++++++++++ tests/tests_lite/helpers/runif.py | 20 +++-- .../utilities/test_device_parser.py | 7 ++ .../tests_pytorch/accelerators/test_common.py | 12 +++ tests/tests_pytorch/accelerators/test_cpu.py | 11 +++ .../accelerators/test_registry.py | 50 +------------ .../deprecated_api/test_remove_1-10.py | 6 ++ tests/tests_pytorch/helpers/runif.py | 6 +- .../connectors/test_accelerator_connector.py | 15 +++- 34 files changed, 744 insertions(+), 160 deletions(-) create mode 100644 src/lightning_lite/accelerators/__init__.py create mode 100644 src/lightning_lite/accelerators/accelerator.py create mode 100644 src/lightning_lite/accelerators/cpu.py create mode 100644 src/lightning_lite/accelerators/cuda.py create mode 100644 src/lightning_lite/accelerators/mps.py rename src/{pytorch_lightning => lightning_lite}/accelerators/registry.py (93%) create mode 100644 src/lightning_lite/accelerators/tpu.py create mode 100644 tests/tests_lite/accelerators/__init__.py create mode 100644 tests/tests_lite/accelerators/test_cpu.py create mode 100644 tests/tests_lite/accelerators/test_cuda.py create mode 100644 tests/tests_lite/accelerators/test_mps.py create mode 100644 tests/tests_lite/accelerators/test_registry.py create mode 100644 tests/tests_lite/accelerators/test_tpu.py diff --git a/src/lightning_lite/accelerators/__init__.py b/src/lightning_lite/accelerators/__init__.py new file mode 100644 index 0000000000000..f1abf60f142d9 --- /dev/null +++ b/src/lightning_lite/accelerators/__init__.py @@ -0,0 +1,22 @@ +# Copyright The PyTorch Lightning team. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from lightning_lite.accelerators.accelerator import Accelerator # noqa: F401 +from lightning_lite.accelerators.cpu import CPUAccelerator # noqa: F401 +from lightning_lite.accelerators.cuda import CUDAAccelerator # noqa: F401 +from lightning_lite.accelerators.mps import MPSAccelerator # noqa: F401 +from lightning_lite.accelerators.registry import _AcceleratorRegistry, call_register_accelerators +from lightning_lite.accelerators.tpu import TPUAccelerator # noqa: F401 + +_ACCELERATORS_BASE_MODULE = "lightning_lite.accelerators" +ACCELERATOR_REGISTRY = _AcceleratorRegistry() +call_register_accelerators(ACCELERATOR_REGISTRY, _ACCELERATORS_BASE_MODULE) diff --git a/src/lightning_lite/accelerators/accelerator.py b/src/lightning_lite/accelerators/accelerator.py new file mode 100644 index 0000000000000..741cc62b70f53 --- /dev/null +++ b/src/lightning_lite/accelerators/accelerator.py @@ -0,0 +1,56 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from abc import ABC, abstractmethod +from typing import Any, Dict + +import torch + + +class Accelerator(ABC): + """The Accelerator base class. + + An Accelerator is meant to deal with one type of hardware. + """ + + @abstractmethod + def setup_device(self, device: torch.device) -> None: + """Create and prepare the device for the current process.""" + + @abstractmethod + def teardown(self) -> None: + """Clean up any state created by the accelerator.""" + + @staticmethod + @abstractmethod + def parse_devices(devices: Any) -> Any: + """Accelerator device parsing logic.""" + + @staticmethod + @abstractmethod + def get_parallel_devices(devices: Any) -> Any: + """Gets parallel devices for the Accelerator.""" + + @staticmethod + @abstractmethod + def auto_device_count() -> int: + """Get the device count when set to auto.""" + + @staticmethod + @abstractmethod + def is_available() -> bool: + """Detect if the hardware is available.""" + + @classmethod + def register_accelerators(cls, accelerator_registry: Dict) -> None: + pass diff --git a/src/lightning_lite/accelerators/cpu.py b/src/lightning_lite/accelerators/cpu.py new file mode 100644 index 0000000000000..24b360179801b --- /dev/null +++ b/src/lightning_lite/accelerators/cpu.py @@ -0,0 +1,65 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
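
The concrete CPU, CUDA, MPS, and TPU implementations of this interface follow below; they all resolve a device specification through the same static hooks before an instance prepares the chosen device. A minimal sketch of that flow, assuming a machine where either accelerator may be present; the device count of `1` is an arbitrary example:

    from lightning_lite.accelerators.cpu import CPUAccelerator
    from lightning_lite.accelerators.cuda import CUDAAccelerator

    accelerator_cls = CUDAAccelerator if CUDAAccelerator.is_available() else CPUAccelerator
    devices = accelerator_cls.parse_devices(1)                     # [0] on CUDA, 1 on CPU
    parallel_devices = accelerator_cls.get_parallel_devices(devices)

    accelerator = accelerator_cls()
    accelerator.setup_device(parallel_devices[0])                  # torch.cuda.set_device(...) on CUDA, validation only on CPU
    # ... run the workload on the selected device ...
    accelerator.teardown()                                         # empties the CUDA cache on CUDA, no-op on CPU
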
+from typing import Dict, List, Union + +import torch + +from lightning_lite.accelerators.accelerator import Accelerator +from lightning_lite.utilities import device_parser + + +class CPUAccelerator(Accelerator): + """Accelerator for CPU devices.""" + + def setup_device(self, device: torch.device) -> None: + """ + Raises: + ValueError: + If the selected device is not CPU. + """ + if device.type != "cpu": + raise ValueError(f"Device should be CPU, got {device} instead.") + + def teardown(self) -> None: + pass + + @staticmethod + def parse_devices(devices: Union[int, str, List[int]]) -> int: + """Accelerator device parsing logic.""" + devices = device_parser.parse_cpu_cores(devices) + return devices + + @staticmethod + def get_parallel_devices(devices: Union[int, str, List[int]]) -> List[torch.device]: + """Gets parallel devices for the Accelerator.""" + devices = device_parser.parse_cpu_cores(devices) + return [torch.device("cpu")] * devices + + @staticmethod + def auto_device_count() -> int: + """Get the devices when set to auto.""" + return 1 + + @staticmethod + def is_available() -> bool: + """CPU is always available for execution.""" + return True + + @classmethod + def register_accelerators(cls, accelerator_registry: Dict) -> None: + accelerator_registry.register( + "cpu", + cls, + description=cls.__class__.__name__, + ) diff --git a/src/lightning_lite/accelerators/cuda.py b/src/lightning_lite/accelerators/cuda.py new file mode 100644 index 0000000000000..7e7947a361873 --- /dev/null +++ b/src/lightning_lite/accelerators/cuda.py @@ -0,0 +1,64 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Dict, List, Optional, Union + +import torch + +from lightning_lite.accelerators.accelerator import Accelerator +from lightning_lite.utilities import device_parser + + +class CUDAAccelerator(Accelerator): + """Accelerator for NVIDIA CUDA devices.""" + + def setup_device(self, device: torch.device) -> None: + """ + Raises: + ValueError: + If the selected device is not of type CUDA. 
+ """ + if device.type != "cuda": + raise ValueError(f"Device should be CUDA, got {device} instead.") + torch.cuda.set_device(device) + + def teardown(self) -> None: + # clean up memory + torch.cuda.empty_cache() + + @staticmethod + def parse_devices(devices: Union[int, str, List[int]]) -> Optional[List[int]]: + """Accelerator device parsing logic.""" + return device_parser.parse_gpu_ids(devices, include_cuda=True) + + @staticmethod + def get_parallel_devices(devices: List[int]) -> List[torch.device]: + """Gets parallel devices for the Accelerator.""" + return [torch.device("cuda", i) for i in devices] + + @staticmethod + def auto_device_count() -> int: + """Get the devices when set to auto.""" + return device_parser.num_cuda_devices() + + @staticmethod + def is_available() -> bool: + return device_parser.num_cuda_devices() > 0 + + @classmethod + def register_accelerators(cls, accelerator_registry: Dict) -> None: + accelerator_registry.register( + "cuda", + cls, + description=cls.__class__.__name__, + ) diff --git a/src/lightning_lite/accelerators/mps.py b/src/lightning_lite/accelerators/mps.py new file mode 100644 index 0000000000000..694cb135ddead --- /dev/null +++ b/src/lightning_lite/accelerators/mps.py @@ -0,0 +1,74 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import platform +from functools import lru_cache +from typing import Dict, List, Optional, Union + +import torch + +from lightning_lite.accelerators.accelerator import Accelerator +from lightning_lite.utilities import device_parser +from lightning_lite.utilities.imports import _TORCH_GREATER_EQUAL_1_12 + + +class MPSAccelerator(Accelerator): + """Accelerator for Metal Apple Silicon GPU devices.""" + + def setup_device(self, device: torch.device) -> None: + """ + Raises: + ValueError: + If the selected device is not MPS. 
+ """ + if device.type != "mps": + raise ValueError(f"Device should be MPS, got {device} instead.") + + def teardown(self) -> None: + pass + + @staticmethod + def parse_devices(devices: Union[int, str, List[int]]) -> Optional[List[int]]: + """Accelerator device parsing logic.""" + parsed_devices = device_parser.parse_gpu_ids(devices, include_mps=True) + return parsed_devices + + @staticmethod + def get_parallel_devices(devices: Union[int, str, List[int]]) -> List[torch.device]: + """Gets parallel devices for the Accelerator.""" + parsed_devices = MPSAccelerator.parse_devices(devices) + assert parsed_devices is not None + + return [torch.device("mps", i) for i in range(len(parsed_devices))] + + @staticmethod + def auto_device_count() -> int: + """Get the devices when set to auto.""" + return 1 + + @staticmethod + @lru_cache(1) + def is_available() -> bool: + """MPS is only available for certain torch builds starting at torch>=1.12, and is only enabled on a machine + with the ARM-based Apple Silicon processors.""" + return ( + _TORCH_GREATER_EQUAL_1_12 and torch.backends.mps.is_available() and platform.processor() in ("arm", "arm64") + ) + + @classmethod + def register_accelerators(cls, accelerator_registry: Dict) -> None: + accelerator_registry.register( + "mps", + cls, + description=cls.__class__.__name__, + ) diff --git a/src/pytorch_lightning/accelerators/registry.py b/src/lightning_lite/accelerators/registry.py similarity index 93% rename from src/pytorch_lightning/accelerators/registry.py rename to src/lightning_lite/accelerators/registry.py index 74a306df265ca..29e05a174fcde 100644 --- a/src/pytorch_lightning/accelerators/registry.py +++ b/src/lightning_lite/accelerators/registry.py @@ -15,9 +15,9 @@ from inspect import getmembers, isclass from typing import Any, Callable, Dict, List, Optional +from lightning_lite.accelerators.accelerator import Accelerator +from lightning_lite.utilities.exceptions import MisconfigurationException from lightning_lite.utilities.registry import _is_register_method_overridden -from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.utilities.exceptions import MisconfigurationException class _AcceleratorRegistry(dict): @@ -112,11 +112,8 @@ def __str__(self) -> str: return "Registered Accelerators: {}".format(", ".join(self.available_accelerators())) -AcceleratorRegistry = _AcceleratorRegistry() - - -def call_register_accelerators(base_module: str) -> None: +def call_register_accelerators(registry: _AcceleratorRegistry, base_module: str) -> None: module = importlib.import_module(base_module) for _, mod in getmembers(module, isclass): if issubclass(mod, Accelerator) and _is_register_method_overridden(mod, Accelerator, "register_accelerators"): - mod.register_accelerators(AcceleratorRegistry) + mod.register_accelerators(registry) diff --git a/src/lightning_lite/accelerators/tpu.py b/src/lightning_lite/accelerators/tpu.py new file mode 100644 index 0000000000000..4d124b25f291c --- /dev/null +++ b/src/lightning_lite/accelerators/tpu.py @@ -0,0 +1,59 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Dict, List, Optional, Union + +import torch + +from lightning_lite.accelerators.accelerator import Accelerator +from lightning_lite.utilities import device_parser +from lightning_lite.utilities.imports import _TPU_AVAILABLE + + +class TPUAccelerator(Accelerator): + """Accelerator for TPU devices.""" + + def setup_device(self, device: torch.device) -> None: + pass + + def teardown(self) -> None: + pass + + @staticmethod + def parse_devices(devices: Union[int, str, List[int]]) -> Optional[Union[int, List[int]]]: + """Accelerator device parsing logic.""" + return device_parser.parse_tpu_cores(devices) + + @staticmethod + def get_parallel_devices(devices: Union[int, List[int]]) -> List[int]: + """Gets parallel devices for the Accelerator.""" + if isinstance(devices, int): + return list(range(devices)) + return devices + + @staticmethod + def auto_device_count() -> int: + """Get the devices when set to auto.""" + return 8 + + @staticmethod + def is_available() -> bool: + return _TPU_AVAILABLE + + @classmethod + def register_accelerators(cls, accelerator_registry: Dict) -> None: + accelerator_registry.register( + "tpu", + cls, + description=cls.__class__.__name__, + ) diff --git a/src/lightning_lite/utilities/device_parser.py b/src/lightning_lite/utilities/device_parser.py index 6967f7bf0af16..04aa14ecdbcd2 100644 --- a/src/lightning_lite/utilities/device_parser.py +++ b/src/lightning_lite/utilities/device_parser.py @@ -213,9 +213,9 @@ def _get_all_available_mps_gpus() -> List[int]: A list of all available MPS GPUs """ # lazy import to avoid circular dependencies - # from lightning_lite.accelerators.mps import _MPS_AVAILABLE - _MPS_AVAILABLE = False # TODO(lite): revert this once MPS utils have moved - return [0] if _MPS_AVAILABLE else [] + from lightning_lite.accelerators.mps import MPSAccelerator + + return [0] if MPSAccelerator.is_available() else [] def _get_all_available_cuda_gpus() -> List[int]: diff --git a/src/lightning_lite/utilities/imports.py b/src/lightning_lite/utilities/imports.py index 34e7b5ac5f82f..70d8549368ea5 100644 --- a/src/lightning_lite/utilities/imports.py +++ b/src/lightning_lite/utilities/imports.py @@ -35,7 +35,6 @@ _HOROVOD_AVAILABLE = module_available("horovod.torch") _OMEGACONF_AVAILABLE = package_available("omegaconf") _POPTORCH_AVAILABLE = package_available("poptorch") -_PSUTIL_AVAILABLE = package_available("psutil") _XLA_AVAILABLE: bool = package_available("torch_xla") # TODO(lite): import this from the fairscale files once they move to lite package diff --git a/src/pytorch_lightning/accelerators/__init__.py b/src/pytorch_lightning/accelerators/__init__.py index 1bba4a42879bc..c56b22705a23a 100644 --- a/src/pytorch_lightning/accelerators/__init__.py +++ b/src/pytorch_lightning/accelerators/__init__.py @@ -10,6 +10,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
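[Editor's note] The lite `TPUAccelerator` above keeps device parsing free of any `torch_xla` import, so its static helpers work on machines without TPUs. A small illustration (not taken from the patch):

```python
# The static helpers do not touch torch_xla, so this runs on any machine.
from lightning_lite.accelerators.tpu import TPUAccelerator

assert TPUAccelerator.get_parallel_devices(8) == [0, 1, 2, 3, 4, 5, 6, 7]
assert TPUAccelerator.get_parallel_devices([1, 5]) == [1, 5]
assert TPUAccelerator.auto_device_count() == 8
```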
+from lightning_lite.accelerators.registry import _AcceleratorRegistry, call_register_accelerators from pytorch_lightning.accelerators.accelerator import Accelerator # noqa: F401 from pytorch_lightning.accelerators.cpu import CPUAccelerator # noqa: F401 from pytorch_lightning.accelerators.cuda import CUDAAccelerator # noqa: F401 @@ -17,9 +18,8 @@ from pytorch_lightning.accelerators.hpu import HPUAccelerator # noqa: F401 from pytorch_lightning.accelerators.ipu import IPUAccelerator # noqa: F401 from pytorch_lightning.accelerators.mps import MPSAccelerator # noqa: F401 -from pytorch_lightning.accelerators.registry import AcceleratorRegistry, call_register_accelerators # noqa: F401 from pytorch_lightning.accelerators.tpu import TPUAccelerator # noqa: F401 ACCELERATORS_BASE_MODULE = "pytorch_lightning.accelerators" - -call_register_accelerators(ACCELERATORS_BASE_MODULE) +AcceleratorRegistry = _AcceleratorRegistry() +call_register_accelerators(AcceleratorRegistry, ACCELERATORS_BASE_MODULE) diff --git a/src/pytorch_lightning/accelerators/accelerator.py b/src/pytorch_lightning/accelerators/accelerator.py index 9d941c846688b..448f8e87951be 100644 --- a/src/pytorch_lightning/accelerators/accelerator.py +++ b/src/pytorch_lightning/accelerators/accelerator.py @@ -11,32 +11,33 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from abc import ABC, abstractmethod -from typing import Any, Dict, Union +from abc import ABC +from typing import Any, Dict import torch +from lightning_utilities.core.rank_zero import rank_zero_deprecation import pytorch_lightning as pl +from lightning_lite.accelerators.accelerator import Accelerator as _Accelerator +from lightning_lite.utilities.types import _DEVICE -class Accelerator(ABC): - """The Accelerator Base Class. An Accelerator is meant to deal with one type of Hardware. +class Accelerator(_Accelerator, ABC): + """The Accelerator base class for Lightning PyTorch. - Currently there are accelerators for: - - - CPU - - GPU - - TPU - - IPU - - HPU + An Accelerator is meant to deal with one type of hardware. """ def setup_environment(self, root_device: torch.device) -> None: - """Setup any processes or distributed connections. - - This is called before the LightningModule/DataModule setup hook which allows the user to access the accelerator - environment before setup is complete. """ + .. deprecated:: v1.8.0 + This hook was deprecated in v1.8.0 and will be removed in v1.10.0. Please use ``setup_device()`` instead. + """ + rank_zero_deprecation( + "`Accelerator.setup_environment` has been deprecated in deprecated in v1.8.0 and will be removed in" + " v1.10.0. Please use ``setup_device()`` instead." + ) + self.setup_device(root_device) def setup(self, trainer: "pl.Trainer") -> None: """Setup plugins for the trainer fit and creates optimizers. @@ -45,7 +46,7 @@ def setup(self, trainer: "pl.Trainer") -> None: trainer: the trainer instance """ - def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: + def get_device_stats(self, device: _DEVICE) -> Dict[str, Any]: """Get stats for a given device. 
Args: @@ -55,30 +56,3 @@ def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: Dictionary of device stats """ raise NotImplementedError - - def teardown(self) -> None: - """Clean up any state created by the accelerator.""" - - @staticmethod - @abstractmethod - def parse_devices(devices: Any) -> Any: - """Accelerator device parsing logic.""" - - @staticmethod - @abstractmethod - def get_parallel_devices(devices: Any) -> Any: - """Gets parallel devices for the Accelerator.""" - - @staticmethod - @abstractmethod - def auto_device_count() -> int: - """Get the device count when set to auto.""" - - @staticmethod - @abstractmethod - def is_available() -> bool: - """Detect if the hardware is available.""" - - @classmethod - def register_accelerators(cls, accelerator_registry: Dict) -> None: - pass diff --git a/src/pytorch_lightning/accelerators/cpu.py b/src/pytorch_lightning/accelerators/cpu.py index 00eeac15ff641..4369233350ad7 100644 --- a/src/pytorch_lightning/accelerators/cpu.py +++ b/src/pytorch_lightning/accelerators/cpu.py @@ -25,20 +25,22 @@ class CPUAccelerator(Accelerator): """Accelerator for CPU devices.""" - def setup_environment(self, root_device: torch.device) -> None: + def setup_device(self, device: torch.device) -> None: """ Raises: MisconfigurationException: If the selected device is not CPU. """ - super().setup_environment(root_device) - if root_device.type != "cpu": - raise MisconfigurationException(f"Device should be CPU, got {root_device} instead.") + if device.type != "cpu": + raise MisconfigurationException(f"Device should be CPU, got {device} instead.") def get_device_stats(self, device: _DEVICE) -> Dict[str, Any]: """Get CPU stats from ``psutil`` package.""" return get_cpu_stats() + def teardown(self) -> None: + pass + @staticmethod def parse_devices(devices: Union[int, str, List[int]]) -> int: """Accelerator device parsing logic.""" diff --git a/src/pytorch_lightning/accelerators/cuda.py b/src/pytorch_lightning/accelerators/cuda.py index e5f939c69ac1c..03b7eadf55cd5 100644 --- a/src/pytorch_lightning/accelerators/cuda.py +++ b/src/pytorch_lightning/accelerators/cuda.py @@ -31,16 +31,15 @@ class CUDAAccelerator(Accelerator): """Accelerator for NVIDIA CUDA devices.""" - def setup_environment(self, root_device: torch.device) -> None: + def setup_device(self, device: torch.device) -> None: """ Raises: MisconfigurationException: If the selected device is not GPU. 
""" - super().setup_environment(root_device) - if root_device.type != "cuda": - raise MisconfigurationException(f"Device should be GPU, got {root_device} instead") - torch.cuda.set_device(root_device) + if device.type != "cuda": + raise MisconfigurationException(f"Device should be GPU, got {device} instead") + torch.cuda.set_device(device) def setup(self, trainer: "pl.Trainer") -> None: # TODO refactor input from trainer to local_rank @four4fish @@ -71,6 +70,10 @@ def get_device_stats(self, device: _DEVICE) -> Dict[str, Any]: """ return torch.cuda.memory_stats(device) + def teardown(self) -> None: + # clean up memory + torch.cuda.empty_cache() + @staticmethod def parse_devices(devices: Union[int, str, List[int]]) -> Optional[List[int]]: """Accelerator device parsing logic.""" @@ -98,10 +101,6 @@ def register_accelerators(cls, accelerator_registry: Dict) -> None: description=f"{cls.__class__.__name__}", ) - def teardown(self) -> None: - # clean up memory - torch.cuda.empty_cache() - def get_nvidia_gpu_stats(device: _DEVICE) -> Dict[str, float]: # pragma: no-cover """Get GPU stats including memory, fan speed, and temperature from nvidia-smi. diff --git a/src/pytorch_lightning/accelerators/hpu.py b/src/pytorch_lightning/accelerators/hpu.py index c85e81756c2a9..3d18a0ad556cf 100644 --- a/src/pytorch_lightning/accelerators/hpu.py +++ b/src/pytorch_lightning/accelerators/hpu.py @@ -16,6 +16,7 @@ import torch +from lightning_lite.utilities.types import _DEVICE from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.utilities.device_parser import parse_hpus from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -29,17 +30,16 @@ class HPUAccelerator(Accelerator): """Accelerator for HPU devices.""" - def setup_environment(self, root_device: torch.device) -> None: + def setup_device(self, device: torch.device) -> None: """ Raises: MisconfigurationException: If the selected device is not HPU. """ - super().setup_environment(root_device) - if root_device.type != "hpu": - raise MisconfigurationException(f"Device should be HPU, got {root_device} instead.") + if device.type != "hpu": + raise MisconfigurationException(f"Device should be HPU, got {device} instead.") - def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: + def get_device_stats(self, device: _DEVICE) -> Dict[str, Any]: """Returns a map of the following metrics with their values: - Limit: amount of total memory on HPU device. @@ -59,6 +59,9 @@ def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: rank_zero_debug("HPU `get_device_stats` failed") return {} + def teardown(self) -> None: + pass + @staticmethod def parse_devices(devices: Union[int, str, List[int]]) -> Optional[int]: """Accelerator device parsing logic.""" diff --git a/src/pytorch_lightning/accelerators/ipu.py b/src/pytorch_lightning/accelerators/ipu.py index b09fd33c29227..f9e5a13bbb03b 100644 --- a/src/pytorch_lightning/accelerators/ipu.py +++ b/src/pytorch_lightning/accelerators/ipu.py @@ -11,10 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Any, Dict, List, Union +from typing import Any, Dict, List import torch +from lightning_lite.utilities.types import _DEVICE from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.utilities.imports import _IPU_AVAILABLE @@ -22,10 +23,16 @@ class IPUAccelerator(Accelerator): """Accelerator for IPUs.""" - def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: + def setup_device(self, device: torch.device) -> None: + pass + + def get_device_stats(self, device: _DEVICE) -> Dict[str, Any]: """IPU device stats aren't supported yet.""" return {} + def teardown(self) -> None: + pass + @staticmethod def parse_devices(devices: int) -> int: """Accelerator device parsing logic.""" diff --git a/src/pytorch_lightning/accelerators/mps.py b/src/pytorch_lightning/accelerators/mps.py index 5610ba1549da9..6fa6f423fbed7 100644 --- a/src/pytorch_lightning/accelerators/mps.py +++ b/src/pytorch_lightning/accelerators/mps.py @@ -11,41 +11,37 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import platform from typing import Any, Dict, List, Optional, Union import torch +from lightning_lite.accelerators.mps import MPSAccelerator as _MPSAccelerator from lightning_lite.utilities import device_parser from lightning_lite.utilities.types import _DEVICE from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _PSUTIL_AVAILABLE, _TORCH_GREATER_EQUAL_1_12 - -# For using the `MPSAccelerator`, user's machine should have `torch>=1.12`, Metal programming framework and -# the ARM-based Apple Silicon processors. -_MPS_AVAILABLE = ( - _TORCH_GREATER_EQUAL_1_12 and torch.backends.mps.is_available() and platform.processor() in ("arm", "arm64") -) +from pytorch_lightning.utilities.imports import _PSUTIL_AVAILABLE class MPSAccelerator(Accelerator): """Accelerator for Metal Apple Silicon GPU devices.""" - def setup_environment(self, root_device: torch.device) -> None: + def setup_device(self, device: torch.device) -> None: """ Raises: MisconfigurationException: If the selected device is not MPS. 
""" - super().setup_environment(root_device) - if root_device.type != "mps": - raise MisconfigurationException(f"Device should be MPS, got {root_device} instead.") + if device.type != "mps": + raise MisconfigurationException(f"Device should be MPS, got {device} instead.") def get_device_stats(self, device: _DEVICE) -> Dict[str, Any]: """Get M1 (cpu + gpu) stats from ``psutil`` package.""" return get_device_stats() + def teardown(self) -> None: + pass + @staticmethod def parse_devices(devices: Union[int, str, List[int]]) -> Optional[List[int]]: """Accelerator device parsing logic.""" @@ -68,7 +64,7 @@ def auto_device_count() -> int: @staticmethod def is_available() -> bool: """MPS is only available for certain torch builds starting at torch>=1.12.""" - return _MPS_AVAILABLE + return _MPSAccelerator.is_available() @classmethod def register_accelerators(cls, accelerator_registry: Dict) -> None: diff --git a/src/pytorch_lightning/accelerators/tpu.py b/src/pytorch_lightning/accelerators/tpu.py index 89170e4c924ad..8637de9095dd2 100644 --- a/src/pytorch_lightning/accelerators/tpu.py +++ b/src/pytorch_lightning/accelerators/tpu.py @@ -16,17 +16,18 @@ import torch from lightning_lite.utilities import device_parser +from lightning_lite.utilities.types import _DEVICE from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.utilities.imports import _TPU_AVAILABLE, _XLA_AVAILABLE - -if _XLA_AVAILABLE: - import torch_xla.core.xla_model as xm +from pytorch_lightning.utilities.imports import _TPU_AVAILABLE class TPUAccelerator(Accelerator): """Accelerator for TPU devices.""" - def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: + def setup_device(self, device: torch.device) -> None: + pass + + def get_device_stats(self, device: _DEVICE) -> Dict[str, Any]: """Gets stats for the given TPU device. Args: @@ -35,6 +36,8 @@ def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: Returns: A dictionary mapping the metrics (free memory and peak memory) to their values. 
""" + import torch_xla.core.xla_model as xm + memory_info = xm.get_memory_info(device) free_memory = memory_info["kb_free"] peak_memory = memory_info["kb_total"] - free_memory @@ -44,6 +47,9 @@ def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: } return device_stats + def teardown(self) -> None: + pass + @staticmethod def parse_devices(devices: Union[int, str, List[int]]) -> Optional[Union[int, List[int]]]: """Accelerator device parsing logic.""" diff --git a/src/pytorch_lightning/lite/lite.py b/src/pytorch_lightning/lite/lite.py index c301f71d441b8..331495e04ce06 100644 --- a/src/pytorch_lightning/lite/lite.py +++ b/src/pytorch_lightning/lite/lite.py @@ -307,7 +307,7 @@ def to_device(self, obj: Union[nn.Module, Tensor, Any]) -> Union[nn.Module, Tens if isinstance(obj, nn.Module): if self.device.type == "cuda": # need to call this manually here again in case we spawned with DDPSpawnStrategy - # TODO: refactor to let plugin handle this cleanly + # TODO: refactor to let accelerator handle this cleanly (see Accelerator.setup_device) torch.cuda.set_device(self.device) return obj.to(self.device) return move_data_to_device(obj, device=self.device) diff --git a/src/pytorch_lightning/strategies/strategy.py b/src/pytorch_lightning/strategies/strategy.py index dc2a5b6397289..2b85bbb88cc9d 100644 --- a/src/pytorch_lightning/strategies/strategy.py +++ b/src/pytorch_lightning/strategies/strategy.py @@ -128,7 +128,7 @@ def setup_environment(self) -> None: environment before setup is complete. """ assert self.accelerator is not None - self.accelerator.setup_environment(self.root_device) + self.accelerator.setup_device(self.root_device) def setup_optimizers(self, trainer: "pl.Trainer") -> None: """Creates optimizers and schedulers. diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index 77f29d3c159e5..c7432ff298c88 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -29,13 +29,13 @@ TorchElasticEnvironment, ) from lightning_lite.utilities import _StrategyType, AMPType, device_parser, LightningEnum +from pytorch_lightning.accelerators import AcceleratorRegistry from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.accelerators.cuda import CUDAAccelerator from pytorch_lightning.accelerators.hpu import HPUAccelerator from pytorch_lightning.accelerators.ipu import IPUAccelerator from pytorch_lightning.accelerators.mps import MPSAccelerator -from pytorch_lightning.accelerators.registry import AcceleratorRegistry from pytorch_lightning.accelerators.tpu import TPUAccelerator from pytorch_lightning.plugins import ( ApexMixedPrecisionPlugin, diff --git a/tests/tests_lite/accelerators/__init__.py b/tests/tests_lite/accelerators/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/tests_lite/accelerators/test_cpu.py b/tests/tests_lite/accelerators/test_cpu.py new file mode 100644 index 0000000000000..f6ccba3560b96 --- /dev/null +++ b/tests/tests_lite/accelerators/test_cpu.py @@ -0,0 +1,43 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +import torch + +from lightning_lite.accelerators.cpu import CPUAccelerator + + +def test_auto_device_count(): + assert CPUAccelerator.auto_device_count() == 1 + + +def test_availability(): + assert CPUAccelerator.is_available() + + +def test_init_device_with_wrong_device_type(): + with pytest.raises(ValueError, match="Device should be CPU"): + CPUAccelerator().setup_device(torch.device("cuda")) + + +@pytest.mark.parametrize( + "devices,expected", + [ + (1, [torch.device("cpu")]), + (2, [torch.device("cpu")] * 2), + ("3", [torch.device("cpu")] * 3), + ], +) +def test_get_parallel_devices(devices, expected): + assert CPUAccelerator.get_parallel_devices(devices) == expected diff --git a/tests/tests_lite/accelerators/test_cuda.py b/tests/tests_lite/accelerators/test_cuda.py new file mode 100644 index 0000000000000..85106ed5c8c5b --- /dev/null +++ b/tests/tests_lite/accelerators/test_cuda.py @@ -0,0 +1,53 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from unittest import mock + +import pytest +import torch +from tests_lite.helpers.runif import RunIf + +from lightning_lite.accelerators.cuda import CUDAAccelerator + + +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) +def test_auto_device_count(_): + assert CUDAAccelerator.auto_device_count() == 2 + + +@RunIf(min_cuda_gpus=1) +def test_gpu_availability(): + assert CUDAAccelerator.is_available() + + +def test_init_device_with_wrong_device_type(): + with pytest.raises(ValueError, match="Device should be CUDA"): + CUDAAccelerator().setup_device(torch.device("cpu")) + + +@pytest.mark.parametrize( + "devices,expected", + [ + ([], []), + ([1], [torch.device("cuda", 1)]), + ([3, 1], [torch.device("cuda", 3), torch.device("cuda", 1)]), + ], +) +def test_get_parallel_devices(devices, expected): + assert CUDAAccelerator.get_parallel_devices(devices) == expected + + +@mock.patch("torch.cuda.set_device") +def test_set_cuda_device(set_device_mock): + CUDAAccelerator().setup_device(torch.device("cuda", 1)) + set_device_mock.assert_called_once_with(torch.device("cuda", 1)) diff --git a/tests/tests_lite/accelerators/test_mps.py b/tests/tests_lite/accelerators/test_mps.py new file mode 100644 index 0000000000000..f6148ff7ff3f8 --- /dev/null +++ b/tests/tests_lite/accelerators/test_mps.py @@ -0,0 +1,49 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pytest +import torch +from tests_lite.helpers.runif import RunIf + +from lightning_lite.accelerators.mps import MPSAccelerator + +_MAYBE_MPS = "mps" if MPSAccelerator.is_available() else "cpu" # torch.device(mps) only works on torch>=1.12 + + +def test_auto_device_count(): + assert MPSAccelerator.auto_device_count() == 1 + + +@RunIf(mps=True) +def test_mps_availability(): + assert MPSAccelerator.is_available() + + +def test_init_device_with_wrong_device_type(): + with pytest.raises(ValueError, match="Device should be MPS"): + MPSAccelerator().setup_device(torch.device("cpu")) + + +@RunIf(mps=True) +@pytest.mark.parametrize( + "devices,expected", + [ + (1, [torch.device(_MAYBE_MPS, 0)]), + (2, [torch.device(_MAYBE_MPS, 0), torch.device(_MAYBE_MPS, 1)]), + ([0], [torch.device(_MAYBE_MPS, 0)]), + # TODO(lite): This case passes with the implementation from PL, but looks like a bug + ([0, 2], [torch.device(_MAYBE_MPS, 0), torch.device(_MAYBE_MPS, 1)]), + ], +) +def test_get_parallel_devices(devices, expected): + assert MPSAccelerator.get_parallel_devices(devices) == expected diff --git a/tests/tests_lite/accelerators/test_registry.py b/tests/tests_lite/accelerators/test_registry.py new file mode 100644 index 0000000000000..de8824eb38dbc --- /dev/null +++ b/tests/tests_lite/accelerators/test_registry.py @@ -0,0 +1,73 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import Any, Dict + +import torch + +from lightning_lite.accelerators import Accelerator, ACCELERATOR_REGISTRY + + +def test_accelerator_registry_with_new_accelerator(): + accelerator_name = "custom_accelerator" + accelerator_description = "Custom Accelerator" + + class CustomAccelerator(Accelerator): + def __init__(self, param1, param2): + self.param1 = param1 + self.param2 = param2 + super().__init__() + + def setup_device(self, device: torch.device) -> None: + pass + + def get_device_stats(self, device: torch.device) -> Dict[str, Any]: + pass + + def teardown(self) -> None: + pass + + @staticmethod + def parse_devices(devices): + return devices + + @staticmethod + def get_parallel_devices(devices): + return ["foo"] * devices + + @staticmethod + def auto_device_count(): + return 3 + + @staticmethod + def is_available(): + return True + + ACCELERATOR_REGISTRY.register( + accelerator_name, CustomAccelerator, description=accelerator_description, param1="abc", param2=123 + ) + + assert accelerator_name in ACCELERATOR_REGISTRY + + assert ACCELERATOR_REGISTRY[accelerator_name]["description"] == accelerator_description + assert ACCELERATOR_REGISTRY[accelerator_name]["init_params"] == {"param1": "abc", "param2": 123} + assert ACCELERATOR_REGISTRY[accelerator_name]["accelerator_name"] == accelerator_name + + assert isinstance(ACCELERATOR_REGISTRY.get(accelerator_name), CustomAccelerator) + + ACCELERATOR_REGISTRY.remove(accelerator_name) + assert accelerator_name not in ACCELERATOR_REGISTRY + + +def test_available_accelerators_in_registry(): + assert ACCELERATOR_REGISTRY.available_accelerators() == ["cpu", "cuda", "mps", "tpu"] diff --git a/tests/tests_lite/accelerators/test_tpu.py b/tests/tests_lite/accelerators/test_tpu.py new file mode 100644 index 0000000000000..044a0c9951009 --- /dev/null +++ b/tests/tests_lite/accelerators/test_tpu.py @@ -0,0 +1,40 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
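[Editor's note] Beyond the custom-accelerator round trip exercised in the registry test above, the same registry can instantiate the built-in accelerators by name, which is the lookup that string accelerator flags ultimately go through. A small sketch:

```python
# Sketch: instantiating a registered accelerator by name from the lite registry.
from lightning_lite.accelerators import ACCELERATOR_REGISTRY
from lightning_lite.accelerators.cpu import CPUAccelerator

assert "cpu" in ACCELERATOR_REGISTRY.available_accelerators()
cpu_accelerator = ACCELERATOR_REGISTRY.get("cpu")  # returns an instance, as in the test above
assert isinstance(cpu_accelerator, CPUAccelerator)
```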
+# See the License for the specific language governing permissions and +# limitations under the License +import pytest +from tests_lite.helpers.runif import RunIf + +from lightning_lite.accelerators.tpu import TPUAccelerator + + +def test_auto_device_count(): + assert TPUAccelerator.auto_device_count() == 8 + + +@RunIf(tpu=True) +def test_availability(): + assert TPUAccelerator.is_available() + + +@pytest.mark.parametrize( + "devices,expected", + [ + (0, []), # TODO(lite): This should raise an exception + (1, [0]), + (2, [0, 1]), + (3, [0, 1, 2]), + ("anything-else", "anything-else"), # TODO(lite): This should raise an exception + ], +) +def test_get_parallel_devices(devices, expected): + assert TPUAccelerator.get_parallel_devices(devices) == expected diff --git a/tests/tests_lite/helpers/runif.py b/tests/tests_lite/helpers/runif.py index 7dd9aaf72962c..00a31d0f48b4e 100644 --- a/tests/tests_lite/helpers/runif.py +++ b/tests/tests_lite/helpers/runif.py @@ -20,7 +20,8 @@ from packaging.version import Version from pkg_resources import get_distribution -from lightning_lite.utilities.imports import _FAIRSCALE_AVAILABLE, _PSUTIL_AVAILABLE, _TPU_AVAILABLE +from lightning_lite.accelerators.mps import MPSAccelerator +from lightning_lite.utilities.imports import _FAIRSCALE_AVAILABLE, _TPU_AVAILABLE class RunIf: @@ -40,10 +41,10 @@ def __new__( max_torch: Optional[str] = None, min_python: Optional[str] = None, tpu: bool = False, + mps: Optional[bool] = None, skip_windows: bool = False, standalone: bool = False, fairscale: bool = False, - psutil: bool = False, **kwargs, ): """ @@ -54,11 +55,12 @@ def __new__( max_torch: Require that PyTorch is less than this version. min_python: Require that Python is greater or equal than this version. tpu: Require that TPU is available. + mps: If True: Require that MPS (Apple Silicon) is available, + if False: Explicitly Require that MPS is not available skip_windows: Skip for Windows platform. standalone: Mark the test as standalone, our CI will run it in a separate process. This requires that the ``PL_RUN_STANDALONE_TESTS=1`` environment variable is set. fairscale: Require that facebookresearch/fairscale is installed. - psutil: Require that psutil is installed. **kwargs: Any :class:`pytest.mark.skipif` keyword arguments. 
""" conditions = [] @@ -95,6 +97,14 @@ def __new__( # used in conftest.py::pytest_collection_modifyitems kwargs["tpu"] = True + if mps is not None: + if mps: + conditions.append(not MPSAccelerator.is_available()) + reasons.append("MPS") + else: + conditions.append(MPSAccelerator.is_available()) + reasons.append("not MPS") + if standalone: env_flag = os.getenv("PL_RUN_STANDALONE_TESTS", "0") conditions.append(env_flag != "1") @@ -110,10 +120,6 @@ def __new__( conditions.append(not _FAIRSCALE_AVAILABLE) reasons.append("Fairscale") - if psutil: - conditions.append(not _PSUTIL_AVAILABLE) - reasons.append("psutil") - reasons = [rs for cond, rs in zip(conditions, reasons) if cond] return pytest.mark.skipif( *args, condition=any(conditions), reason=f"Requires: [{' + '.join(reasons)}]", **kwargs diff --git a/tests/tests_lite/utilities/test_device_parser.py b/tests/tests_lite/utilities/test_device_parser.py index 0f005d5ce3a37..09e35fb61d51c 100644 --- a/tests/tests_lite/utilities/test_device_parser.py +++ b/tests/tests_lite/utilities/test_device_parser.py @@ -112,3 +112,10 @@ def test_num_cuda_devices_without_forking(*_): implementation for determining cuda availability.""" assert device_parser.is_cuda_available() assert device_parser.num_cuda_devices() == 2 + + +@pytest.mark.parametrize("devices", ([3], -1)) +def test_invalid_devices_with_cpu_accelerator(devices): + """Test invalid device flag raises MisconfigurationException.""" + with pytest.raises(MisconfigurationException, match="should be an int > 0"): + device_parser.parse_cpu_cores(devices) diff --git a/tests/tests_pytorch/accelerators/test_common.py b/tests/tests_pytorch/accelerators/test_common.py index 05fb76f1cc572..3eeda536e4c72 100644 --- a/tests/tests_pytorch/accelerators/test_common.py +++ b/tests/tests_pytorch/accelerators/test_common.py @@ -11,8 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from typing import Any, Dict from unittest import mock +import torch + from pytorch_lightning import Trainer from pytorch_lightning.accelerators import Accelerator, CPUAccelerator, CUDAAccelerator, IPUAccelerator, TPUAccelerator from pytorch_lightning.strategies import DDPStrategy @@ -28,6 +31,15 @@ def test_auto_device_count(_): def test_pluggable_accelerator(): class TestAccelerator(Accelerator): + def setup_device(self, device: torch.device) -> None: + pass + + def get_device_stats(self, device: torch.device) -> Dict[str, Any]: + pass + + def teardown(self) -> None: + pass + @staticmethod def parse_devices(devices): return devices diff --git a/tests/tests_pytorch/accelerators/test_cpu.py b/tests/tests_pytorch/accelerators/test_cpu.py index 4453b7add086f..717acff318633 100644 --- a/tests/tests_pytorch/accelerators/test_cpu.py +++ b/tests/tests_pytorch/accelerators/test_cpu.py @@ -1,6 +1,7 @@ import os from pathlib import Path from typing import Any, Dict, Union +from unittest.mock import Mock import pytest import torch @@ -13,6 +14,7 @@ from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin from pytorch_lightning.strategies import SingleDeviceStrategy from pytorch_lightning.utilities.exceptions import MisconfigurationException +from tests_pytorch.helpers.runif import RunIf def test_restore_checkpoint_after_pre_setup_default(): @@ -27,6 +29,15 @@ def test_availability(): assert CPUAccelerator.is_available() +@RunIf(psutil=True) +def test_get_device_stats(tmpdir): + gpu_stats = CPUAccelerator().get_device_stats(Mock()) + fields = ["cpu_vm_percent", "cpu_percent", "cpu_swap_percent"] + + for f in fields: + assert any(f in h for h in gpu_stats.keys()) + + @pytest.mark.parametrize("restore_after_pre_setup", [True, False]) def test_restore_checkpoint_after_pre_setup(tmpdir, restore_after_pre_setup): """Test to ensure that if restore_checkpoint_after_setup is True, then we only load the state after pre- diff --git a/tests/tests_pytorch/accelerators/test_registry.py b/tests/tests_pytorch/accelerators/test_registry.py index 004723c19eeb6..d9ac7dec0b1bd 100644 --- a/tests/tests_pytorch/accelerators/test_registry.py +++ b/tests/tests_pytorch/accelerators/test_registry.py @@ -11,55 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
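[Editor's note] The pluggable-accelerator test above shows what the refactor asks of user-defined accelerators: they now have to implement `setup_device`, `get_device_stats` and `teardown` as well. Once they do, an instance can still be handed to the `Trainer` directly; sketched roughly below, assuming it sits in the same test module and reuses the `TestAccelerator` class defined there:

```python
# Rough sketch, reusing the TestAccelerator class from the test above.
from pytorch_lightning import Trainer
from pytorch_lightning.strategies import DDPStrategy

trainer = Trainer(accelerator=TestAccelerator(), devices=2, strategy=DDPStrategy())
assert isinstance(trainer.accelerator, TestAccelerator)
```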
-from pytorch_lightning import Trainer -from pytorch_lightning.accelerators import Accelerator, AcceleratorRegistry - - -def test_accelerator_registry_with_new_accelerator(): - - accelerator_name = "custom_accelerator" - accelerator_description = "Custom Accelerator" - - class CustomAccelerator(Accelerator): - def __init__(self, param1, param2): - self.param1 = param1 - self.param2 = param2 - super().__init__() - - @staticmethod - def parse_devices(devices): - return devices - - @staticmethod - def get_parallel_devices(devices): - return ["foo"] * devices - - @staticmethod - def auto_device_count(): - return 3 - - @staticmethod - def is_available(): - return True - - AcceleratorRegistry.register( - accelerator_name, CustomAccelerator, description=accelerator_description, param1="abc", param2=123 - ) - - assert accelerator_name in AcceleratorRegistry - - assert AcceleratorRegistry[accelerator_name]["description"] == accelerator_description - assert AcceleratorRegistry[accelerator_name]["init_params"] == {"param1": "abc", "param2": 123} - assert AcceleratorRegistry[accelerator_name]["accelerator_name"] == accelerator_name - - assert isinstance(AcceleratorRegistry.get(accelerator_name), CustomAccelerator) - - trainer = Trainer(accelerator=accelerator_name, devices="auto") - assert isinstance(trainer.accelerator, CustomAccelerator) - assert trainer.strategy.parallel_devices == ["foo"] * 3 - - AcceleratorRegistry.remove(accelerator_name) - assert accelerator_name not in AcceleratorRegistry +from pytorch_lightning.accelerators import AcceleratorRegistry def test_available_accelerators_in_registry(): diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-10.py b/tests/tests_pytorch/deprecated_api/test_remove_1-10.py index a48c6a7884083..3eebceaadd40a 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-10.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-10.py @@ -20,6 +20,7 @@ from torch.utils.data import DataLoader from pytorch_lightning import Trainer +from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.core.mixins.device_dtype_mixin import DeviceDtypeModuleMixin from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset from pytorch_lightning.overrides import LightningDistributedModule, LightningParallelModule @@ -250,3 +251,8 @@ def test_v1_10_deprecated_seed_utilities(): with pytest.deprecated_call(match="seed.pl_worker_init_function` has been deprecated in v1.8.0"): pl_worker_init_function(0) + + +def test_v1_10_deprecated_accelerator_setup_environment_method(): + with pytest.deprecated_call(match="`Accelerator.setup_environment` has been deprecated in deprecated in v1.8.0"): + CPUAccelerator().setup_environment(torch.device("cpu")) diff --git a/tests/tests_pytorch/helpers/runif.py b/tests/tests_pytorch/helpers/runif.py index afd61976550eb..62cb93ed38919 100644 --- a/tests/tests_pytorch/helpers/runif.py +++ b/tests/tests_pytorch/helpers/runif.py @@ -20,7 +20,7 @@ from packaging.version import Version from pkg_resources import get_distribution -from pytorch_lightning.accelerators.mps import _MPS_AVAILABLE +from pytorch_lightning.accelerators.mps import MPSAccelerator from pytorch_lightning.callbacks.progress.rich_progress import _RICH_AVAILABLE from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.strategies.bagua import _BAGUA_AVAILABLE @@ -189,10 +189,10 @@ def __new__( if mps is not None: if mps: - conditions.append(not _MPS_AVAILABLE) + conditions.append(not 
MPSAccelerator.is_available()) reasons.append("MPS") else: - conditions.append(_MPS_AVAILABLE) + conditions.append(MPSAccelerator.is_available()) reasons.append("not MPS") if horovod: diff --git a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py index 1a420e9269fae..562e7c4df2e4c 100644 --- a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py @@ -13,6 +13,7 @@ # limitations under the License import os +from typing import Any, Dict from unittest import mock from unittest.mock import Mock @@ -139,6 +140,15 @@ def creates_processes_externally(self) -> bool: @mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True) def test_custom_accelerator(device_count_mock, setup_distributed_mock): class Accel(Accelerator): + def setup_device(self, device: torch.device) -> None: + pass + + def get_device_stats(self, device: torch.device) -> Dict[str, Any]: + pass + + def teardown(self) -> None: + pass + @staticmethod def parse_devices(devices): return devices @@ -777,9 +787,8 @@ def test_gpu_accelerator_backend_choice_cuda(_): assert isinstance(trainer.accelerator, CUDAAccelerator) -# TODO(lite): remove skip once MPS utils have moved -@pytest.mark.skip(reason="Utils in Lite rely on MPS accelerator file, but refactor is not yet finished") -@mock.patch("pytorch_lightning.accelerators.mps._MPS_AVAILABLE", return_value=True) +@mock.patch("lightning_lite.accelerators.mps.MPSAccelerator.is_available", return_value=True) +@mock.patch("lightning_lite.utilities.device_parser._get_all_available_mps_gpus", return_value=[0]) @mock.patch("torch.device", return_value="mps") # necessary because torch doesn't allow creation of mps devices def test_gpu_accelerator_backend_choice_mps(*_): trainer = Trainer(accelerator="gpu") From b679fc29222595bb6081665d192585df3574f49c Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 12 Sep 2022 18:54:14 +0200 Subject: [PATCH 110/193] (app) Resolve a bug where the state changes isn't detected properly (#14465) Co-authored-by: Mansy Co-authored-by: Jirka Borovec --- src/lightning_app/CHANGELOG.md | 3 ++ src/lightning_app/core/app.py | 7 +++- tests/tests_app/core/test_lightning_app.py | 42 ++++++++++++++++++++++ tests/tests_app_examples/test_v0_app.py | 2 +- 4 files changed, 52 insertions(+), 2 deletions(-) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index a6a3d2e9c37c4..e77561974fdef 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -21,6 +21,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- +### Fixed + +- Resolved a bug where the state change detection using DeepDiff won't worked with Path, Drive objects ([#14465](https://github.com/Lightning-AI/lightning/pull/14465)) ### Removed diff --git a/src/lightning_app/core/app.py b/src/lightning_app/core/app.py index 2bc0d7fc109d4..fad3d33b53c81 100644 --- a/src/lightning_app/core/app.py +++ b/src/lightning_app/core/app.py @@ -9,6 +9,7 @@ from time import time from deepdiff import DeepDiff, Delta +from lightning_utilities.core.apply_func import apply_to_collection import lightning_app from lightning_app import _console @@ -21,6 +22,7 @@ ) from lightning_app.core.queues import BaseQueue, SingleProcessQueue from lightning_app.frontend import Frontend +from lightning_app.storage import Drive, Path from lightning_app.storage.path import storage_root_dir from lightning_app.utilities.app_helpers import _delta_to_app_state_delta, _LightningAppRef, Logger from lightning_app.utilities.commands.base import _process_requests @@ -319,9 +321,12 @@ def maybe_apply_changes(self) -> bool: deltas = self._collect_deltas_from_ui_and_work_queues() if not deltas: + # Path and Drive aren't processed by DeepDiff, so we need to convert them to dict. + last_state = apply_to_collection(self.last_state, (Path, Drive), lambda x: x.to_dict()) + state = apply_to_collection(self.state, (Path, Drive), lambda x: x.to_dict()) # When no deltas are received from the Rest API or work queues, # we need to check if the flow modified the state and populate changes. - deep_diff = DeepDiff(self.last_state, self.state, verbose_level=2) + deep_diff = DeepDiff(last_state, state, verbose_level=2) if deep_diff: # TODO: Resolve changes with ``CacheMissException``. # new_state = self.populate_changes(self.last_state, self.state) diff --git a/tests/tests_app/core/test_lightning_app.py b/tests/tests_app/core/test_lightning_app.py index 6b578166858a5..55af3a1aeb1b3 100644 --- a/tests/tests_app/core/test_lightning_app.py +++ b/tests/tests_app/core/test_lightning_app.py @@ -20,6 +20,7 @@ from lightning_app.core.queues import BaseQueue, MultiProcessQueue, RedisQueue, SingleProcessQueue from lightning_app.frontend import StreamlitFrontend from lightning_app.runners import MultiProcessRuntime, SingleProcessRuntime +from lightning_app.storage import Path from lightning_app.storage.path import storage_root_dir from lightning_app.testing.helpers import RunIf from lightning_app.testing.testing import LightningTestApp @@ -975,3 +976,44 @@ def test_debug_mode_logging(): app = LightningApp(A4()) assert _console.level == logging.INFO MultiProcessRuntime(app, start_server=False).dispatch() + + +class WorkPath(LightningWork): + def __init__(self): + super().__init__() + self.path = None + + def run(self): + self.path = Path(__file__) + + +class FlowPath(LightningFlow): + def __init__(self): + super().__init__() + self.w = WorkPath() + + def run(self): + self.w.run() + + +class TestLightningHasUpdatedApp(LightningApp): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.counter = 0 + + def run_once(self): + res = super().run_once() + + if self.root.w.has_succeeded: + self.counter += 1 + + # TODO: Resolve bug where it should work with self.counter == 2 + if self.counter > 5: + assert not self._has_updated + return True + return res + + +def test_lightning_app_has_updated(): + app = TestLightningHasUpdatedApp(FlowPath()) + MultiProcessRuntime(app, start_server=False).dispatch() diff --git a/tests/tests_app_examples/test_v0_app.py 
b/tests/tests_app_examples/test_v0_app.py index f600b7eea3234..b0b89eb99a17e 100644 --- a/tests/tests_app_examples/test_v0_app.py +++ b/tests/tests_app_examples/test_v0_app.py @@ -11,7 +11,7 @@ class LightningAppTestInt(LightningTestApp): def run_once(self) -> Tuple[bool, float]: - if self.root.counter > 1: + if self.root.counter == 1: print("V0 App End") self.stage = AppStage.STOPPING return True, 0.0 From cd671243ab760ad6e97298dd7896225288669cf5 Mon Sep 17 00:00:00 2001 From: Mauricio Villegas Date: Mon, 12 Sep 2022 19:10:34 +0200 Subject: [PATCH 111/193] Fix mypy errors in pytorch_lightning/cli.py (#14653) --- requirements/pytorch/extra.txt | 2 +- src/pytorch_lightning/cli.py | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/requirements/pytorch/extra.txt b/requirements/pytorch/extra.txt index b547e7a62f2ed..5c38a286ef21d 100644 --- a/requirements/pytorch/extra.txt +++ b/requirements/pytorch/extra.txt @@ -5,7 +5,7 @@ matplotlib>3.1, <3.5.3 omegaconf>=2.0.5, <2.3.0 hydra-core>=1.0.5, <1.3.0 -jsonargparse[signatures]>=4.12.0, <=4.12.0 +jsonargparse[signatures]>=4.12.0, <4.14.0 gcsfs>=2021.5.0, <2022.8.0 rich>=10.14.0, !=10.15.0.a, <13.0.0 protobuf<=3.20.1 # strict # an extra is updating protobuf, this pin prevents TensorBoard failure diff --git a/src/pytorch_lightning/cli.py b/src/pytorch_lightning/cli.py index 82156c6b4ab90..875f9e00660ff 100644 --- a/src/pytorch_lightning/cli.py +++ b/src/pytorch_lightning/cli.py @@ -145,7 +145,7 @@ def add_optimizer_args( assert all(issubclass(o, Optimizer) for o in optimizer_class) else: assert issubclass(optimizer_class, Optimizer) - kwargs = {"instantiate": False, "fail_untyped": False, "skip": {"params"}} + kwargs: Dict[str, Any] = {"instantiate": False, "fail_untyped": False, "skip": {"params"}} if isinstance(optimizer_class, tuple): self.add_subclass_arguments(optimizer_class, nested_key, **kwargs) else: @@ -170,7 +170,7 @@ def add_lr_scheduler_args( assert all(issubclass(o, LRSchedulerTypeTuple) for o in lr_scheduler_class) else: assert issubclass(lr_scheduler_class, LRSchedulerTypeTuple) - kwargs = {"instantiate": False, "fail_untyped": False, "skip": {"optimizer"}} + kwargs: Dict[str, Any] = {"instantiate": False, "fail_untyped": False, "skip": {"optimizer"}} if isinstance(lr_scheduler_class, tuple): self.add_subclass_arguments(lr_scheduler_class, nested_key, **kwargs) else: @@ -436,6 +436,7 @@ def subcommands() -> Dict[str, Set[str]]: def _add_subcommands(self, parser: LightningArgumentParser, **kwargs: Any) -> None: """Adds subcommands to the input parser.""" + self._subcommand_parsers: Dict[str, LightningArgumentParser] = {} parser_subcommands = parser.add_subcommands() # the user might have passed a builder function trainer_class = ( @@ -444,6 +445,7 @@ def _add_subcommands(self, parser: LightningArgumentParser, **kwargs: Any) -> No # register all subcommands in separate subcommand parsers under the main parser for subcommand in self.subcommands(): subcommand_parser = self._prepare_subcommand_parser(trainer_class, subcommand, **kwargs.get(subcommand, {})) + self._subcommand_parsers[subcommand] = subcommand_parser fn = getattr(trainer_class, subcommand) # extract the first line description in the docstring for the subcommand help message description = _get_short_description(fn) @@ -528,8 +530,7 @@ def _parser(self, subcommand: Optional[str]) -> LightningArgumentParser: if subcommand is None: return self.parser # return the subcommand parser for the subcommand passed - action_subcommand = 
self.parser._subcommands_action - return action_subcommand._name_parser_map[subcommand] + return self._subcommand_parsers[subcommand] @staticmethod def configure_optimizers( @@ -611,7 +612,7 @@ def get_automatic( # override the existing method self.model.configure_optimizers = MethodType(fn, self.model) - def _get(self, config: Dict[str, Any], key: str, default: Optional[Any] = None) -> Any: + def _get(self, config: Namespace, key: str, default: Optional[Any] = None) -> Any: """Utility to get a config value which might be inside a subcommand.""" return config.get(str(self.subcommand), config).get(key, default) From f49c2e3ab6c9b0532272b26b09f9c7faec305168 Mon Sep 17 00:00:00 2001 From: donlapark <10988155+donlapark@users.noreply.github.com> Date: Tue, 13 Sep 2022 00:13:58 +0700 Subject: [PATCH 112/193] fixes mypy errors in trainer/supporters.py (#14633) * fixes mypy errors in trainer/supporters.py * Fxes mypy error when accessing "__init__" directly * add an assertion in lr_finder.py * Make init calls `reset` in `TensorRunningAccum` * Fixes formatting * Add `self.window_length` to `__init__` Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- pyproject.toml | 1 - src/pytorch_lightning/trainer/supporters.py | 68 ++++++++++++--------- src/pytorch_lightning/tuner/lr_finder.py | 4 +- 3 files changed, 41 insertions(+), 32 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 166447dd655f6..5a8f632481127 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,7 +54,6 @@ module = [ "pytorch_lightning.callbacks.progress.rich_progress", "pytorch_lightning.profilers.base", "pytorch_lightning.profilers.pytorch", - "pytorch_lightning.trainer.supporters", "pytorch_lightning.trainer.trainer", "pytorch_lightning.tuner.batch_size_scaling", "pytorch_lightning.utilities.data", diff --git a/src/pytorch_lightning/trainer/supporters.py b/src/pytorch_lightning/trainer/supporters.py index e183bdcc644d6..454143416f735 100644 --- a/src/pytorch_lightning/trainer/supporters.py +++ b/src/pytorch_lightning/trainer/supporters.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from collections.abc import Sized from dataclasses import asdict, dataclass, field from typing import Any, Callable, Dict, Iterable, Iterator, List, Mapping, Optional, Sequence, Union @@ -53,23 +54,24 @@ class TensorRunningAccum: def __init__(self, window_length: int): self.window_length = window_length - self.memory = None - self.current_idx: int = 0 - self.last_idx: Optional[int] = None - self.rotated: bool = False + self.reset(window_length) def reset(self, window_length: Optional[int] = None) -> None: """Empty the accumulator.""" - if window_length is None: - window_length = self.window_length - self.__init__(window_length) + if window_length is not None: + self.window_length = window_length + self.memory: Optional[torch.Tensor] = None + self.current_idx: int = 0 + self.last_idx: Optional[int] = None + self.rotated: bool = False - def last(self): + def last(self) -> Optional[torch.Tensor]: """Get the last added element.""" if self.last_idx is not None: + assert isinstance(self.memory, torch.Tensor) return self.memory[self.last_idx].float() - def append(self, x): + def append(self, x: torch.Tensor) -> None: """Add an element to the accumulator.""" if self.memory is None: # tradeoff memory for speed by keeping the memory on device @@ -88,20 +90,21 @@ def append(self, x): if self.current_idx == 0: self.rotated = True - def mean(self): + def mean(self) -> Optional[torch.Tensor]: """Get mean value from stored elements.""" return self._agg_memory("mean") - def max(self): + def max(self) -> Optional[torch.Tensor]: """Get maximal value from stored elements.""" return self._agg_memory("max") - def min(self): + def min(self) -> Optional[torch.Tensor]: """Get minimal value from stored elements.""" return self._agg_memory("min") - def _agg_memory(self, how: str): + def _agg_memory(self, how: str) -> Optional[torch.Tensor]: if self.last_idx is not None: + assert isinstance(self.memory, torch.Tensor) if self.rotated: return getattr(self.memory.float(), how)() return getattr(self.memory[: self.current_idx].float(), how)() @@ -139,7 +142,7 @@ def done(self) -> bool: class CycleIterator: """Iterator for restarting a dataloader if it runs out of samples.""" - def __init__(self, loader: Any, length: Optional[int] = None, state: SharedCycleIteratorState = None): + def __init__(self, loader: Any, length: Optional[Union[int, float]] = None, state: SharedCycleIteratorState = None): """ Args: loader: the loader to restart for cyclic (and optionally infinite) sampling @@ -184,6 +187,8 @@ def __next__(self) -> Any: Raises: StopIteration: if more then :attr:`length` batches have been returned """ + assert isinstance(self._loader_iter, Iterator) + # Note: if self.length is `inf`, then the iterator will never stop if self.counter >= self.__len__() or self.state.done: raise StopIteration @@ -257,13 +262,13 @@ def _calc_num_data(self, datasets: Union[Sequence, Mapping], mode: str) -> Union Returns: length: the length of `CombinedDataset` """ - if mode not in CombinedDataset.COMPUTE_FUNCS.keys(): + if mode not in self.COMPUTE_FUNCS.keys(): raise MisconfigurationException(f"Invalid Mode: {mode}") # extract the lengths all_lengths = self._get_len_recursive(datasets) - compute_func = CombinedDataset.COMPUTE_FUNCS[mode] + compute_func = self.COMPUTE_FUNCS[mode] if isinstance(all_lengths, (int, float)): length = all_lengths @@ -272,8 +277,9 @@ def _calc_num_data(self, datasets: Union[Sequence, Mapping], mode: str) -> Union return length - def _get_len_recursive(self, data) -> int: + def _get_len_recursive(self, data: 
Any) -> Union[int, float, List, Dict]: if isinstance(data, Dataset): + assert isinstance(data, Sized) return len(data) if isinstance(data, (float, int)): @@ -290,13 +296,13 @@ def _get_len_recursive(self, data) -> int: return self._get_len(data) @staticmethod - def _get_len(dataset) -> int: + def _get_len(dataset: Any) -> Union[int, float]: try: return len(dataset) except (TypeError, NotImplementedError): return float("inf") - def __len__(self) -> int: + def __len__(self) -> Union[int, float]: """Return the minimum length of the datasets.""" return self._calc_num_data(self.datasets, self.mode) @@ -348,8 +354,8 @@ def __init__(self, loaders: Any, mode: str = "min_size"): if self.mode == "max_size_cycle": self._wrap_loaders_max_size_cycle() - self._loaders_iter_state_dict = None - self._iterator = None # assigned in __iter__ + self._loaders_iter_state_dict: Optional[Dict] = None + self._iterator: Optional[Iterator] = None # assigned in __iter__ @staticmethod def _state_dict_fn(iterator: Optional[Iterator], has_completed: int) -> Dict: @@ -384,7 +390,7 @@ def state_dict(self, has_completed: bool = False) -> Dict: has_completed=has_completed, ) - def load_state_dict(self, state_dict) -> None: + def load_state_dict(self, state_dict: Dict) -> None: # store the samplers state. # They would be reloaded once the `CombinedIterator` as been created # and the workers are created. @@ -482,10 +488,10 @@ def __iter__(self) -> Any: # prevent `NotImplementedError` from PyTorch: # https://github.com/pytorch/pytorch/blob/v1.9.0/torch/utils/data/dataloader.py#L541 - def __getstate__patch__(*_): + def __getstate__patch__(*_: Any) -> Dict: return {} - _BaseDataLoaderIter.__getstate__ = __getstate__patch__ + _BaseDataLoaderIter.__getstate__ = __getstate__patch__ # type: ignore[assignment] iterator = CombinedLoaderIterator(self.loaders) # handle fault tolerant restart logic. self.on_restart(iterator) @@ -493,7 +499,7 @@ def __getstate__patch__(*_): return iterator @staticmethod - def _calc_num_batches(loaders: Any, mode="min_size") -> Union[int, float]: + def _calc_num_batches(loaders: Any, mode: str = "min_size") -> Union[int, float]: """Compute the length (aka the number of batches) of `CombinedLoader`. Args: @@ -509,16 +515,16 @@ def _calc_num_batches(loaders: Any, mode="min_size") -> Union[int, float]: return all_lengths return _nested_calc_num_data(all_lengths, max if mode == "max_size_cycle" else min) - def __len__(self) -> int: + def __len__(self) -> Union[int, float]: return self._calc_num_batches(self.loaders, mode=self.mode) @staticmethod - def _shutdown_workers_and_reset_iterator(dataloader) -> None: + def _shutdown_workers_and_reset_iterator(dataloader: DataLoader) -> None: if hasattr(dataloader, "_iterator") and isinstance(dataloader._iterator, _MultiProcessingDataLoaderIter): dataloader._iterator._shutdown_workers() dataloader._iterator = None - def reset(self): + def reset(self) -> None: if self._iterator: self._iterator._loader_iters = None if self.loaders is not None: @@ -535,7 +541,7 @@ def __init__(self, loaders: Any): loaders: the loaders to sample from. 
Can be all kind of collection """ self.loaders = loaders - self._loader_iters = None + self._loader_iters: Any = None @property def loader_iters(self) -> Any: @@ -584,7 +590,9 @@ def create_loader_iters( return apply_to_collection(loaders, Iterable, iter, wrong_dtype=(Sequence, Mapping)) -def _nested_calc_num_data(data: Union[Mapping, Sequence], compute_func: Callable): +def _nested_calc_num_data( + data: Union[Mapping, Sequence], compute_func: Callable[[List[Union[int, float]]], Union[int, float]] +) -> Union[int, float]: if isinstance(data, (float, int)): return data diff --git a/src/pytorch_lightning/tuner/lr_finder.py b/src/pytorch_lightning/tuner/lr_finder.py index 6b6d5771a4751..d8e5e6fc4a79b 100644 --- a/src/pytorch_lightning/tuner/lr_finder.py +++ b/src/pytorch_lightning/tuner/lr_finder.py @@ -356,7 +356,9 @@ def on_train_batch_end( if self.progress_bar: self.progress_bar.update() - current_loss = trainer.fit_loop.running_loss.last().item() + loss_tensor = trainer.fit_loop.running_loss.last() + assert loss_tensor is not None + current_loss = loss_tensor.item() current_step = trainer.global_step # Avg loss (loss with momentum) + smoothing From 4f3c47294e0aea3cbd2aac3593b0b8f7b43728a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 12 Sep 2022 19:35:18 +0200 Subject: [PATCH 113/193] Add troubleshooting section to MPS docs (#14642) Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> --- docs/source-pytorch/accelerators/mps_basic.rst | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/docs/source-pytorch/accelerators/mps_basic.rst b/docs/source-pytorch/accelerators/mps_basic.rst index 15e6ab929ba14..5db866a531e13 100644 --- a/docs/source-pytorch/accelerators/mps_basic.rst +++ b/docs/source-pytorch/accelerators/mps_basic.rst @@ -34,7 +34,7 @@ Run on Apple silicon gpus ------------------------- Enable the following Trainer arguments to run on Apple silicon gpus (MPS devices). -.. code:: +.. code-block:: python trainer = Trainer(accelerator="mps", devices=1) @@ -46,3 +46,18 @@ Enable the following Trainer arguments to run on Apple silicon gpus (MPS devices What does MPS stand for? ------------------------ MPS is short for `Metal Performance Shaders `_ which is the technology used in the back for gpu communication and computing. + +---- + +Troubleshooting +--------------- + + +If Lightning can't detect the Apple Silicon hardware, it will raise this exception: + +.. code:: + + MisconfigurationException: MPSAccelerator can not run on your system since the accelerator is not available. + +If you are seeing this despite running on an ARM-enabled Mac, the most likely cause is that your Python is being emulated and thinks it is running on an Intel CPU. +To solve this, re-install your python executable (and if using environment managers like conda, you have to reinstall these as well) by downloading the Apple M1/M2 build (not Intel!), for example `here `_. From 9769f57fc621baa38f117487a135367e890e8c67 Mon Sep 17 00:00:00 2001 From: Noha Alon Date: Mon, 12 Sep 2022 21:00:46 +0300 Subject: [PATCH 114/193] Removes timeout from streamlit e2e test (#14667) * Removes timeout from streamlit e2e test We have a timeout on the app view which waits for the button but it causes a refresh on the page which causes playwright to miss the button on each refresh. 
we can remove the timeout altogether since we have a time limit on the test itself in the CI setup Co-authored-by: thomas chaton Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- tests/tests_app_examples/test_template_streamlit_ui.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tests_app_examples/test_template_streamlit_ui.py b/tests/tests_app_examples/test_template_streamlit_ui.py index e2c33305298f7..30d6c38070988 100644 --- a/tests/tests_app_examples/test_template_streamlit_ui.py +++ b/tests/tests_app_examples/test_template_streamlit_ui.py @@ -19,7 +19,7 @@ def test_template_streamlit_ui_example_cloud() -> None: def click_button(*_, **__): button = view_page.frame_locator("iframe").locator('button:has-text("Should print to the terminal ?")') - button.wait_for(timeout=5 * 1000) + if button.all_text_contents() == ["Should print to the terminal ?"]: button.click() return True From 92309c215339da9c1913b782dccf7b73a5c14532 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 13 Sep 2022 03:13:27 +0900 Subject: [PATCH 115/193] Remove skipping logic in PL CI (#14565) Drop skipping logic --- .github/workflows/ci-pytorch-test-conda.yml | 25 +---------- .github/workflows/ci-pytorch-test-full.yml | 46 ++++----------------- 2 files changed, 10 insertions(+), 61 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-conda.yml b/.github/workflows/ci-pytorch-test-conda.yml index c49361153eb4d..f7ade50623b55 100644 --- a/.github/workflows/ci-pytorch-test-conda.yml +++ b/.github/workflows/ci-pytorch-test-conda.yml @@ -39,24 +39,7 @@ jobs: id: changed-files uses: tj-actions/changed-files@v29.0.3 - - name: Decide if the test should be skipped - id: skip - shell: bash -l {0} - run: | - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' - echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt - MATCHES=$(cat changed_files.txt | grep -E $FILTER) - echo $MATCHES - if [ -z "$MATCHES" ]; then - echo "Skip" - echo "::set-output name=continue::0" - else - echo "Continue" - echo "::set-output name=continue::1" - fi - - name: Update base dependencies - if: ${{ (steps.skip.outputs.continue == '1') }} env: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 @@ -70,12 +53,10 @@ jobs: run: pip install "Pillow<9.0" # It messes with torchvision - name: DocTests - if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: ./src run: pytest pytorch_lightning --cov=pytorch_lightning - name: Update all dependencies - if: ${{ (steps.skip.outputs.continue == '1') }} env: HOROVOD_BUILD_ARCH_FLAGS: "-mfma" HOROVOD_WITHOUT_MXNET: 1 @@ -95,11 +76,9 @@ jobs: python requirements/pytorch/check-avail-extras.py - name: Pull legacy checkpoints - if: ${{ (steps.skip.outputs.continue == '1') }} run: bash .actions/pull_legacy_checkpoints.sh - name: Testing PyTorch - if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: tests/tests_pytorch run: coverage run --source pytorch_lightning -m pytest -v --timeout 150 --durations=50 --junitxml=results-${{ runner.os }}-torch${{ matrix.pytorch-version }}.xml @@ -111,7 +90,7 @@ jobs: if: failure() - name: Statistics - if: ${{ success() && (steps.skip.outputs.continue == '1') }} + if: success() working-directory: tests/tests_pytorch run: | coverage report @@ -119,7 +98,7 @@ jobs: - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 - if: ${{ success() && (steps.skip.outputs.continue == '1') }} + if: success() # see: 
https://github.com/actions/toolkit/issues/399 continue-on-error: true with: diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index e4c5ecd9cc0c1..6962c2a952b40 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -41,63 +41,42 @@ jobs: id: changed-files uses: tj-actions/changed-files@v29.0.3 - - name: Decide if the test should be skipped - id: skip - shell: bash -l {0} - run: | - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' - echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt - MATCHES=$(cat changed_files.txt | grep -E $FILTER) - echo $MATCHES - if [ -z "$MATCHES" ]; then - echo "Skip" - echo "::set-output name=continue::0" - else - echo "Continue" - echo "::set-output name=continue::1" - fi - - name: Set up Python ${{ matrix.python-version }} - if: ${{ (steps.skip.outputs.continue == '1') }} uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Reset caching - if: ${{ (steps.skip.outputs.continue == '1') }} run: python -c "import time; days = time.time() / 60 / 60 / 24; print(f'TIME_PERIOD=d{int(days / 2) * 2}')" >> $GITHUB_ENV - name: basic setup - if: ${{ (steps.skip.outputs.continue == '1') }} run: | pip --version pip install -q -r .actions/requirements.txt # Github Actions: Run step on specific OS: https://stackoverflow.com/a/57948488/4521646 - name: Setup macOS - if: ${{ (runner.os == 'macOS') && (steps.skip.outputs.continue == '1') }} + if: runner.os == 'macOS' run: | brew install openmpi libuv # Horovod on macOS requires OpenMPI, Gloo not currently supported - name: Setup Windows - if: ${{ (runner.os == 'windows') && (steps.skip.outputs.continue == '1') }} + if: runner.os == 'windows' run: | python .actions/assistant.py requirements_prune_pkgs horovod - name: Set min. 
dependencies - if: ${{ (matrix.requires == 'oldest') && (steps.skip.outputs.continue == '1') }} + if: matrix.requires == 'oldest' run: | python .actions/assistant.py replace_oldest_ver # Note: This uses an internal pip API and may not always work # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow - name: Get pip cache dir - if: ${{ (steps.skip.outputs.continue == '1') }} id: pip-cache run: echo "::set-output name=dir::$(pip cache dir)" - name: pip cache - if: ${{ (steps.skip.outputs.continue == '1') }} uses: actions/cache@v3 with: path: ${{ steps.pip-cache.outputs.dir }} @@ -106,11 +85,9 @@ jobs: ${{ runner.os }}-pip-td${{ env.TIME_PERIOD }}-py${{ matrix.python-version }}-${{ matrix.release }}-${{ matrix.requires }}- - name: Pull legacy checkpoints - if: ${{ (steps.skip.outputs.continue == '1') }} run: bash .actions/pull_legacy_checkpoints.sh - name: Install dependencies - if: ${{ (steps.skip.outputs.continue == '1') }} env: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 @@ -122,12 +99,10 @@ jobs: shell: bash - name: DocTests - if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: ./src run: pytest pytorch_lightning --cov=pytorch_lightning - name: Install extra dependencies - if: ${{ (steps.skip.outputs.continue == '1') }} run: | # adjust versions according installed Torch version python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt @@ -136,7 +111,7 @@ jobs: shell: bash - name: Reinstall Horovod if necessary - if: ${{ (runner.os != 'windows') && (steps.skip.outputs.continue == '1') }} + if: runner.os != 'windows' env: HOROVOD_BUILD_ARCH_FLAGS: "-mfma" HOROVOD_WITHOUT_MXNET: 1 @@ -153,50 +128,45 @@ jobs: shell: bash - name: Cache datasets - if: ${{ (steps.skip.outputs.continue == '1') }} uses: actions/cache@v3 with: path: Datasets key: pl-dataset - name: Sanity check - if: ${{ (steps.skip.outputs.continue == '1') }} run: python requirements/pytorch/check-avail-extras.py - name: Testing Warnings # the stacklevel can only be set on >=3.7 - if: ${{ (steps.skip.outputs.continue == '1') && ( matrix.python-version != '3.7' ) }} + if: ${{ matrix.python-version != '3.7' }} working-directory: tests/tests_pytorch # needs to run outside of `pytest` run: python utilities/test_warnings.py - name: Testing PyTorch - if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: tests/tests_pytorch # NOTE: do not include coverage report here, see: https://github.com/nedbat/coveragepy/issues/1003 run: coverage run --source pytorch_lightning -m pytest -v --durations=50 --junitxml=results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml - name: Upload pytest results - if: ${{ (failure()) && (steps.skip.outputs.continue == '1') }} + if: failure() uses: actions/upload-artifact@v3 with: name: unittest-results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }} path: tests/tests_pytorch/results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml - name: Prepare Examples - if: ${{ (steps.skip.outputs.continue == '1') }} run: | # adjust versions according installed Torch version python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt pip install -r requirements/pytorch/examples.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade - name: Run Examples - if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: ./examples run: python -m pytest 
test_pl_examples.py -v --durations=10 - name: Statistics - if: ${{ (success()) && (steps.skip.outputs.continue == '1') }} + if: success() working-directory: tests/tests_pytorch run: | coverage report @@ -204,7 +174,7 @@ jobs: - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 - if: ${{ (always()) && (steps.skip.outputs.continue == '1') }} + if: always() # see: https://github.com/actions/toolkit/issues/399 continue-on-error: true with: From a7f38370814fce6bfe07f280070b477dd233a657 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 12 Sep 2022 23:48:35 +0530 Subject: [PATCH 116/193] Update docutils requirement from <0.19,>=0.16 to >=0.16,<0.20 in /requirements (#14664) Update docutils requirement in /requirements Updates the requirements on [docutils](https://docutils.sourceforge.io/) to permit the latest version. --- updated-dependencies: - dependency-name: docutils dependency-type: direct:production ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements/docs.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/docs.txt b/requirements/docs.txt index 1b00471602c60..99663b1234837 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -2,7 +2,7 @@ sphinx>=4.0, <5.0 myst-parser>=0.15, <0.17 nbsphinx>=0.8.5, <=0.8.9 pandoc>=1.0, <=2.2 -docutils>=0.16, <0.19 +docutils>=0.16, <0.20 sphinxcontrib-fulltoc>=1.0, <=1.2.0 sphinxcontrib-mockautodoc sphinx-autodoc-typehints>=1.11, <1.15 # strict; v1.15 failing on master (#11405) From 3d540efe4fa16c8adede16cb0761eda3b3acc58f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 12 Sep 2022 23:48:51 +0530 Subject: [PATCH 117/193] Update psutil requirement from <=5.9.1 to <5.9.3 in /requirements (#14665) Updates the requirements on [psutil](https://github.com/giampaolo/psutil) to permit the latest version. - [Release notes](https://github.com/giampaolo/psutil/releases) - [Changelog](https://github.com/giampaolo/psutil/blob/master/HISTORY.rst) - [Commits](https://github.com/giampaolo/psutil/compare/release-0.1.0...release-5.9.2) --- updated-dependencies: - dependency-name: psutil dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements/pytorch/test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/pytorch/test.txt b/requirements/pytorch/test.txt index d6488574e357d..22bd62bef9311 100644 --- a/requirements/pytorch/test.txt +++ b/requirements/pytorch/test.txt @@ -11,7 +11,7 @@ mypy==0.971 cloudpickle>=1.3, <=2.1.0 scikit-learn>0.22.1, <1.1.3 onnxruntime<1.13.0 -psutil<=5.9.1 # for `DeviceStatsMonitor` +psutil<5.9.3 # for `DeviceStatsMonitor` pandas>1.0, <=1.4.3 # needed in benchmarks fastapi<0.83.0 uvicorn<=0.18.2 From 1680a768197fee78d30dd26dd3feef83c10db6bc Mon Sep 17 00:00:00 2001 From: Mauricio Villegas Date: Mon, 12 Sep 2022 20:25:29 +0200 Subject: [PATCH 118/193] Removed from_argparse_args tests in test_cli.py (#14597) --- tests/tests_pytorch/test_cli.py | 155 -------------------------------- 1 file changed, 155 deletions(-) diff --git a/tests/tests_pytorch/test_cli.py b/tests/tests_pytorch/test_cli.py index f9ca39cb883bc..5d4ff6daebbd3 100644 --- a/tests/tests_pytorch/test_cli.py +++ b/tests/tests_pytorch/test_cli.py @@ -14,9 +14,6 @@ import inspect import json import os -import pickle -import sys -from argparse import Namespace from contextlib import contextmanager, ExitStack, redirect_stdout from io import StringIO from typing import Callable, List, Optional, Union @@ -46,7 +43,6 @@ from pytorch_lightning.loggers.wandb import _WANDB_AVAILABLE from pytorch_lightning.strategies import DDPStrategy from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities import _TPU_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _TORCHVISION_AVAILABLE from tests_pytorch.helpers.runif import RunIf @@ -67,42 +63,6 @@ def mock_subclasses(baseclass, *subclasses): yield None -@mock.patch("argparse.ArgumentParser.parse_args") -def test_default_args(mock_argparse): - """Tests default argument parser for Trainer.""" - mock_argparse.return_value = Namespace(**Trainer.default_attributes()) - - parser = LightningArgumentParser(add_help=False, parse_as_dict=False) - args = parser.parse_args([]) - - args.max_epochs = 5 - trainer = Trainer.from_argparse_args(args) - - assert isinstance(trainer, Trainer) - assert trainer.max_epochs == 5 - - -@pytest.mark.parametrize("cli_args", [["--accumulate_grad_batches=22"], []]) -def test_add_argparse_args_redefined(cli_args): - """Redefines some default Trainer arguments via the cli and tests the Trainer initialization correctness.""" - parser = LightningArgumentParser(add_help=False, parse_as_dict=False) - parser.add_lightning_class_args(Trainer, None) - - args = parser.parse_args(cli_args) - - # make sure we can pickle args - pickle.dumps(args) - - # Check few deprecated args are not in namespace: - for depr_name in ("gradient_clip", "nb_gpu_nodes", "max_nb_epochs"): - assert depr_name not in args - - trainer = Trainer.from_argparse_args(args=args) - pickle.dumps(trainer) - - assert isinstance(trainer, Trainer) - - @pytest.mark.parametrize("cli_args", [["--callbacks=1", "--logger"], ["--foo", "--bar=1"]]) def test_add_argparse_args_redefined_error(cli_args, monkeypatch): """Asserts error raised in case of passing not default cli arguments.""" @@ -122,121 +82,6 @@ def _raise(): parser.parse_args(cli_args) -@pytest.mark.parametrize( - ["cli_args", "expected"], - [ - ("--auto_lr_find=True 
--auto_scale_batch_size=power", dict(auto_lr_find=True, auto_scale_batch_size="power")), - ( - "--auto_lr_find any_string --auto_scale_batch_size ON", - dict(auto_lr_find="any_string", auto_scale_batch_size=True), - ), - ("--auto_lr_find=Yes --auto_scale_batch_size=On", dict(auto_lr_find=True, auto_scale_batch_size=True)), - ("--auto_lr_find Off --auto_scale_batch_size No", dict(auto_lr_find=False, auto_scale_batch_size=False)), - ("--auto_lr_find TRUE --auto_scale_batch_size FALSE", dict(auto_lr_find=True, auto_scale_batch_size=False)), - ("--tpu_cores=8", dict(tpu_cores=8)), - ("--tpu_cores=1,", dict(tpu_cores="1,")), - ("--limit_train_batches=100", dict(limit_train_batches=100)), - ("--limit_train_batches 0.8", dict(limit_train_batches=0.8)), - ("--enable_model_summary FALSE", dict(enable_model_summary=False)), - ( - "", - dict( - # These parameters are marked as Optional[...] in Trainer.__init__, - # with None as default. They should not be changed by the argparse - # interface. - min_steps=None, - accelerator=None, - profiler=None, - ), - ), - ], -) -def test_parse_args_parsing(cli_args, expected): - """Test parsing simple types and None optionals not modified.""" - cli_args = cli_args.split(" ") if cli_args else [] - with mock.patch("sys.argv", ["any.py"] + cli_args): - parser = LightningArgumentParser(add_help=False, parse_as_dict=False) - parser.add_lightning_class_args(Trainer, None) - args = parser.parse_args() - - for k, v in expected.items(): - assert getattr(args, k) == v - if "tpu_cores" not in expected or _TPU_AVAILABLE: - assert Trainer.from_argparse_args(args) - - -@pytest.mark.parametrize( - ["cli_args", "expected", "instantiate"], - [ - (["--gpus", "[0, 2]"], dict(gpus=[0, 2]), False), - (["--tpu_cores=[1,3]"], dict(tpu_cores=[1, 3]), False), - (['--accumulate_grad_batches={"5":3,"10":20}'], dict(accumulate_grad_batches={5: 3, 10: 20}), True), - ], -) -def test_parse_args_parsing_complex_types(cli_args, expected, instantiate): - """Test parsing complex types.""" - with mock.patch("sys.argv", ["any.py"] + cli_args): - parser = LightningArgumentParser(add_help=False, parse_as_dict=False) - parser.add_lightning_class_args(Trainer, None) - args = parser.parse_args() - - for k, v in expected.items(): - assert getattr(args, k) == v - if instantiate: - assert Trainer.from_argparse_args(args) - - -@pytest.mark.parametrize( - ["cli_args", "expected_gpu"], - [ - ("--accelerator gpu --devices 1", [0]), - ("--accelerator gpu --devices 0,", [0]), - ("--accelerator gpu --devices 1,", [1]), - ("--accelerator gpu --devices 0,1", [0, 1]), - ], -) -def test_parse_args_parsing_gpus(monkeypatch, cli_args, expected_gpu): - """Test parsing of gpus and instantiation of Trainer.""" - monkeypatch.setattr("lightning_lite.utilities.device_parser.num_cuda_devices", lambda: 2) - monkeypatch.setattr("lightning_lite.utilities.device_parser.is_cuda_available", lambda: True) - cli_args = cli_args.split(" ") if cli_args else [] - with mock.patch("sys.argv", ["any.py"] + cli_args): - parser = LightningArgumentParser(add_help=False, parse_as_dict=False) - parser.add_lightning_class_args(Trainer, None) - args = parser.parse_args() - - trainer = Trainer.from_argparse_args(args) - assert trainer.device_ids == expected_gpu - - -@pytest.mark.skipif( - sys.version_info < (3, 7), - reason="signature inspection while mocking is not working in Python < 3.7 despite autospec", -) -@pytest.mark.parametrize( - ["cli_args", "extra_args"], - [ - ({}, {}), - (dict(logger=False), {}), - (dict(logger=False), 
dict(logger=True)), - (dict(logger=False), dict(enable_checkpointing=True)), - ], -) -def test_init_from_argparse_args(cli_args, extra_args): - unknown_args = dict(unknown_arg=0) - - # unknown args in the argparser/namespace should be ignored - with mock.patch("pytorch_lightning.Trainer.__init__", autospec=True, return_value=None) as init: - trainer = Trainer.from_argparse_args(Namespace(**cli_args, **unknown_args), **extra_args) - expected = dict(cli_args) - expected.update(extra_args) # extra args should override any cli arg - init.assert_called_with(trainer, **expected) - - # passing in unknown manual args should throw an error - with pytest.raises(TypeError, match=r"__init__\(\) got an unexpected keyword argument 'unknown_arg'"): - Trainer.from_argparse_args(Namespace(**cli_args), **extra_args, **unknown_args) - - class Model(LightningModule): def __init__(self, model_param: int): super().__init__() From 80e2f097e0f16d2077bb44402d8ecc7942fcb197 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 12 Sep 2022 20:29:23 +0200 Subject: [PATCH 119/193] (app): Add load_state_dict and state_dict (#14100) Co-authored-by: manskx Co-authored-by: Jirka Borovec Co-authored-by: Jirka --- src/lightning_app/CHANGELOG.md | 2 +- src/lightning_app/core/flow.py | 91 ++++++++++++++- src/lightning_app/core/work.py | 4 - src/lightning_app/utilities/app_helpers.py | 63 +++++++++++ src/lightning_app/utilities/introspection.py | 2 +- tests/tests_app/core/test_lightning_flow.py | 113 ++++++++++++++++++- 6 files changed, 266 insertions(+), 9 deletions(-) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index e77561974fdef..b5cdb5b37b136 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -9,7 +9,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added -- +- Add `load_state_dict` and `state_dict` ([#14100](https://github.com/Lightning-AI/lightning/pull/14100)) ### Changed diff --git a/src/lightning_app/core/flow.py b/src/lightning_app/core/flow.py index 41c46cd868307..3d265c49e2472 100644 --- a/src/lightning_app/core/flow.py +++ b/src/lightning_app/core/flow.py @@ -309,15 +309,24 @@ def get_all_children(self): self.get_all_children_(children) return children - def set_state(self, provided_state: Dict) -> None: + def set_state(self, provided_state: Dict, recurse: bool = True) -> None: """Method to set the state to this LightningFlow, its children and - :class:`~lightning_app.core.work.LightningWork`.""" + :class:`~lightning_app.core.work.LightningWork`. + + Arguments: + provided_state: The state to be reloaded + recurse: Whether to apply the state down children. + """ for k, v in provided_state["vars"].items(): if isinstance(v, Dict): v = _maybe_create_drive(self.name, v) setattr(self, k, v) self._changes = provided_state["changes"] self._calls.update(provided_state["calls"]) + + if not recurse: + return + for child, state in provided_state["flows"].items(): getattr(self, child).set_state(state) for work, state in provided_state["works"].items(): @@ -667,3 +676,81 @@ def configure_api(self): under the ``/docs`` route. 
""" raise NotImplementedError + + def state_dict(self): + """Returns the current flow state but not its children.""" + return { + "vars": _sanitize_state({el: getattr(self, el) for el in self._state}), + "calls": self._calls.copy(), + "changes": {}, + "flows": {}, + "works": {}, + "structures": {}, + } + + def load_state_dict( + self, + flow_state: Dict[str, Any], + children_states: Dict[str, Any], + strict: bool = True, + ) -> None: + """Reloads the state of this flow and its children. + + .. code-block:: python + + import lightning as L + + + class Work(L.LightningWork): + def __init__(self): + super().__init__() + self.counter = 0 + + def run(self): + self.counter += 1 + + + class Flow(L.LightningFlow): + def run(self): + # dynamically create a work. + if not getattr(self, "w", None): + self.w = WorkReload() + + self.w.run() + + def load_state_dict(self, flow_state, children_states, strict) -> None: + # 1: Re-instantiate the dynamic work + self.w = Work() + + # 2: Make any states modification / migration. + ... + + # 3: Call the parent ``load_state_dict`` to + # recursively reload the states. + super().load_state_dict( + flow_state, + children_states, + strict, + ) + + Arguments: + flow_state: The state of the current flow. + children_states: The state of the dynamic children of this flow. + strict: Whether to raise an exception if a dynamic + children hasn't been re-created. + """ + self.set_state(flow_state, recurse=False) + direct_children_states = {k: v for k, v in children_states.items() if "." not in k} + for child_name, state in direct_children_states.items(): + child = getattr(self, child_name, None) + if isinstance(child, LightningFlow): + lower_children_states = { + k.replace(child_name + ".", ""): v + for k, v in children_states.items() + if k.startswith(child_name) and k != child_name + } + child.load_state_dict(state, lower_children_states, strict=strict) + elif isinstance(child, LightningWork): + child.set_state(state) + elif strict: + raise ValueError(f"The component {child_name} wasn't instantiated for the component {self.name}") diff --git a/src/lightning_app/core/work.py b/src/lightning_app/core/work.py index 99d8c3611a09d..054fe44cfcb89 100644 --- a/src/lightning_app/core/work.py +++ b/src/lightning_app/core/work.py @@ -544,10 +544,6 @@ def _aggregate_status_timeout(self, statuses: List[Dict]) -> WorkStatus: status = {**timeout_statuses[-1], "timestamp": statuses[0]["timestamp"]} return WorkStatus(**status, count=len(timeout_statuses)) - def load_state_dict(self, state): - # TODO (tchaton) Implement logic for state reloading. - pass - def on_exit(self): """Override this hook to add your logic when the work is exiting.""" pass diff --git a/src/lightning_app/utilities/app_helpers.py b/src/lightning_app/utilities/app_helpers.py index b22e73016043e..10f07e86cb0c7 100644 --- a/src/lightning_app/utilities/app_helpers.py +++ b/src/lightning_app/utilities/app_helpers.py @@ -415,3 +415,66 @@ def _set_level(self): if self.level is None: self.level = logging.DEBUG if bool(int(os.getenv("LIGHTNING_DEBUG", "0"))) else logging.INFO self.logger.setLevel(self.level) + + +def _state_dict(flow: "LightningFlow"): + state = {} + flows = [flow] + list(flow.flows.values()) + for f in flows: + state[f.name] = f.state_dict() + for w in flow.works(): + state[w.name] = w.state + return state + + +def _load_state_dict(root_flow: "LightningFlow", state: Dict[str, Any], strict: bool = True) -> None: + """This function is used to reload the state assuming dynamic components creation. 
+ + When a component isn't found but its state exists, its state is passed up to its closest existing parent. + + Arguments: + root_flow: The flow at the top of the component tree. + state: The collected state dict. + strict: Whether to validate all components have been re-created. + """ + # 1: Reload the state of the existing works + for w in root_flow.works(): + w.set_state(state.pop(w.name)) + + # 2: Collect the existing flows + flows = [root_flow] + list(root_flow.flows.values()) + flow_map = {f.name: f for f in flows} + + # 3: Find the state of the all dynamic components + dynamic_components = {k: v for k, v in state.items() if k not in flow_map} + + # 4: Propagate the state of the dynamic components to their closest parents + dynamic_children_state = {} + for name, component_state in dynamic_components.items(): + affiliation = name.split(".") + for idx in range(0, len(affiliation)): + parent_name = ".".join(affiliation[:-idx]) + has_matched = False + for flow_name, flow in flow_map.items(): + if flow_name == parent_name: + if flow_name not in dynamic_children_state: + dynamic_children_state[flow_name] = {} + + dynamic_children_state[flow_name].update({name.replace(parent_name + ".", ""): component_state}) + has_matched = True + break + if has_matched: + break + + # 5: Reload the flow states + for flow_name, flow in flow_map.items(): + flow.load_state_dict(state.pop(flow_name), dynamic_children_state.get(flow_name, {}), strict=strict) + + # 6: Verify all dynamic components has been re-created. + if strict: + components_names = ( + [root_flow.name] + [f.name for f in root_flow.flows.values()] + [w.name for w in root_flow.works()] + ) + for component_name in dynamic_components: + if component_name not in components_names: + raise Exception(f"The component {component_name} was re-created during state reloading.") diff --git a/src/lightning_app/utilities/introspection.py b/src/lightning_app/utilities/introspection.py index d31d0f896f642..856f6d6ea84a8 100644 --- a/src/lightning_app/utilities/introspection.py +++ b/src/lightning_app/utilities/introspection.py @@ -394,4 +394,4 @@ def _is_init_context(component: Union["LightningFlow", "LightningWork"]) -> bool def _is_run_context(component: Union["LightningFlow", "LightningWork"]) -> bool: """Checks whether the call to a component originates from within the context of the component's ``run`` method.""" - return _is_method_context(component, "run") + return _is_method_context(component, "run") or _is_method_context(component, "load_state_dict") diff --git a/tests/tests_app/core/test_lightning_flow.py b/tests/tests_app/core/test_lightning_flow.py index 4c0eb23ea014c..489def49fcdba 100644 --- a/tests/tests_app/core/test_lightning_flow.py +++ b/tests/tests_app/core/test_lightning_flow.py @@ -16,7 +16,12 @@ from lightning_app.storage import Path from lightning_app.storage.path import storage_root_dir from lightning_app.testing.helpers import EmptyFlow, EmptyWork -from lightning_app.utilities.app_helpers import _delta_to_app_state_delta, _LightningAppRef +from lightning_app.utilities.app_helpers import ( + _delta_to_app_state_delta, + _LightningAppRef, + _load_state_dict, + _state_dict, +) from lightning_app.utilities.enum import CacheCallsKeys from lightning_app.utilities.exceptions import ExitAppException @@ -633,3 +638,109 @@ def run(self): assert len(self._calls["scheduling"]) == 8 Flow().run() + + +class WorkReload(LightningWork): + def __init__(self): + super().__init__(cache_calls=False) + self.counter = 0 + + def run(self): + self.counter 
+= 1 + + +class FlowReload(LightningFlow): + def __init__(self): + super().__init__() + self.counter = 0 + + def run(self): + if not getattr(self, "w", None): + self.w = WorkReload() + + self.counter += 1 + self.w.run() + + def load_state_dict(self, flow_state, children_states, strict) -> None: + self.w = WorkReload() + super().load_state_dict(flow_state, children_states, strict=strict) + + +class FlowReload2(LightningFlow): + def __init__(self, random_value: str): + super().__init__() + self.random_value = random_value + self.counter = 0 + + def run(self): + if not getattr(self, "w", None): + self.w = WorkReload() + self.w.run() + self.counter += 1 + + def load_state_dict(self, flow_state, children_states, strict) -> None: + self.w = WorkReload() + super().load_state_dict(flow_state, children_states, strict=strict) + + +class RootFlowReload(LightningFlow): + def __init__(self): + super().__init__() + self.flow = FlowReload() + self.counter = 0 + + def run(self): + if not getattr(self, "flow_2", None): + self.flow_2 = FlowReload2("something") + self.flow.run() + self.flow_2.run() + self.counter += 1 + + def load_state_dict(self, flow_state, children_states, strict) -> None: + self.flow_2 = FlowReload2(children_states["flow_2"]["vars"]["random_value"]) + super().load_state_dict(flow_state, children_states, strict=strict) + + +class RootFlowReload2(RootFlowReload): + def load_state_dict(self, flow_state, children_states, strict) -> None: + LightningFlow.load_state_dict(self, flow_state, children_states, strict=strict) + + +def test_lightning_flow_reload(): + flow = RootFlowReload() + + assert flow.counter == 0 + assert flow.flow.counter == 0 + + flow.run() + + assert flow.flow.w.counter == 1 + assert flow.counter == 1 + assert flow.flow.counter == 1 + assert flow.flow_2.counter == 1 + assert flow.flow_2.w.counter == 1 + + state = _state_dict(flow) + flow = RootFlowReload() + _load_state_dict(flow, state) + + assert flow.flow.w.counter == 1 + assert flow.counter == 1 + assert flow.flow.counter == 1 + assert flow.flow_2.counter == 1 + assert flow.flow_2.w.counter == 1 + + flow.run() + + assert flow.flow.w.counter == 2 + assert flow.counter == 2 + assert flow.flow.counter == 2 + assert flow.flow_2.counter == 2 + assert flow.flow_2.w.counter == 2 + + flow = RootFlowReload2() + flow.run() + state = _state_dict(flow) + flow = RootFlowReload2() + with pytest.raises(ValueError, match="The component flow_2 wasn't instantiated for the component root"): + _load_state_dict(flow, state) From 36f1949d25816560ab82a331cf831ea696036b16 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 12 Sep 2022 19:01:49 +0000 Subject: [PATCH 120/193] Bump tj-actions/changed-files from 29.0.3 to 29.0.4 (#14650) Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 29.0.3 to 29.0.4. - [Release notes](https://github.com/tj-actions/changed-files/releases) - [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md) - [Commits](https://github.com/tj-actions/changed-files/compare/v29.0.3...v29.0.4) --- updated-dependencies: - dependency-name: tj-actions/changed-files dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/ci-pr-gatekeeper.yml | 2 +- .github/workflows/ci-pytorch-test-conda.yml | 2 +- .github/workflows/ci-pytorch-test-full.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci-pr-gatekeeper.yml b/.github/workflows/ci-pr-gatekeeper.yml index 5c235f151b59b..f13aa98f87dca 100644 --- a/.github/workflows/ci-pr-gatekeeper.yml +++ b/.github/workflows/ci-pr-gatekeeper.yml @@ -20,7 +20,7 @@ jobs: fetch-depth: "2" # To retrieve the preceding commit. - name: Get changed files using defaults id: changed-files - uses: tj-actions/changed-files@v29.0.3 + uses: tj-actions/changed-files@v29.0.4 - name: Determine changes id: touched run: | diff --git a/.github/workflows/ci-pytorch-test-conda.yml b/.github/workflows/ci-pytorch-test-conda.yml index f7ade50623b55..e3b4582e05b1d 100644 --- a/.github/workflows/ci-pytorch-test-conda.yml +++ b/.github/workflows/ci-pytorch-test-conda.yml @@ -37,7 +37,7 @@ jobs: - name: Get changed files id: changed-files - uses: tj-actions/changed-files@v29.0.3 + uses: tj-actions/changed-files@v29.0.4 - name: Update base dependencies env: diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 6962c2a952b40..d17e23dba15a5 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -39,7 +39,7 @@ jobs: - name: Get changed files id: changed-files - uses: tj-actions/changed-files@v29.0.3 + uses: tj-actions/changed-files@v29.0.4 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 From 925edbca077b1c222bfc6a3beeb007e5dee53a3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 12 Sep 2022 21:02:38 +0200 Subject: [PATCH 121/193] Remove the deprecated `weights_save_path` Trainer argument (#14424) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- docs/source-pytorch/common/trainer.rst | 38 ------------ src/pytorch_lightning/CHANGELOG.md | 3 + .../callbacks/model_checkpoint.py | 14 ++--- .../trainer/connectors/callback_connector.py | 10 +--- .../connectors/checkpoint_connector.py | 3 +- .../trainer/connectors/signal_connector.py | 4 +- src/pytorch_lightning/trainer/trainer.py | 37 ------------ .../checkpointing/test_model_checkpoint.py | 31 ---------- .../deprecated_api/test_remove_1-8.py | 7 --- tests/tests_pytorch/loggers/test_all.py | 59 ------------------- 10 files changed, 11 insertions(+), 195 deletions(-) diff --git a/docs/source-pytorch/common/trainer.rst b/docs/source-pytorch/common/trainer.rst index 049bbf94181af..cc8b57a182988 100644 --- a/docs/source-pytorch/common/trainer.rst +++ b/docs/source-pytorch/common/trainer.rst @@ -1507,44 +1507,6 @@ Can specify as float or int. total_fit_batches = total_train_batches + total_val_batches -weights_save_path -^^^^^^^^^^^^^^^^^ - - -.. warning:: `weights_save_path` has been deprecated in v1.6 and will be removed in v1.8. Please pass - ``dirpath`` directly to the :class:`~pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint` - callback. - - -.. raw:: html - - - -| - -Directory of where to save weights if specified. - -.. 
testcode:: - - # default used by the Trainer - trainer = Trainer(weights_save_path=os.getcwd()) - - # save to your custom path - trainer = Trainer(weights_save_path="my/path") - -Example:: - - # if checkpoint callback used, then overrides the weights path - # **NOTE: this saves weights to some/path NOT my/path - checkpoint = ModelCheckpoint(dirpath='some/path') - trainer = Trainer( - callbacks=[checkpoint], - weights_save_path='my/path' - ) - - enable_model_summary ^^^^^^^^^^^^^^^^^^^^ diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 5ffbc1214ade5..398b42fe71247 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -133,6 +133,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Removed deprecated support for old torchtext versions ([#14375](https://github.com/Lightning-AI/lightning/pull/14375)) +- Removed the deprecated `weights_save_path` Trainer argumnent and `Trainer.weights_save_path` property ([#14424](https://github.com/Lightning-AI/lightning/pull/14424)) + + - Remove the deprecated ([#14471](https://github.com/Lightning-AI/lightning/pull/14471)) * `pytorch_lightning.utilities.distributed.rank_zero_only` in favor of `pytorch_lightning.utilities.rank_zero.rank_zero_only` * `pytorch_lightning.utilities.distributed.rank_zero_debug` in favor of `pytorch_lightning.utilities.rank_zero.rank_zero_debug` diff --git a/src/pytorch_lightning/callbacks/model_checkpoint.py b/src/pytorch_lightning/callbacks/model_checkpoint.py index e484cfde5cb8c..a789b95a4407d 100644 --- a/src/pytorch_lightning/callbacks/model_checkpoint.py +++ b/src/pytorch_lightning/callbacks/model_checkpoint.py @@ -67,8 +67,7 @@ class ModelCheckpoint(Checkpoint): By default, dirpath is ``None`` and will be set at runtime to the location specified by :class:`~pytorch_lightning.trainer.trainer.Trainer`'s - :paramref:`~pytorch_lightning.trainer.trainer.Trainer.default_root_dir` or - :paramref:`~pytorch_lightning.trainer.trainer.Trainer.weights_save_path` arguments, + :paramref:`~pytorch_lightning.trainer.trainer.Trainer.default_root_dir` argument, and if the Trainer uses a logger, the path will also contain logger name and version. filename: checkpoint filename. Can contain named formatting options to be auto-filled. @@ -577,9 +576,8 @@ def __resolve_ckpt_dir(self, trainer: "pl.Trainer") -> None: determine where to save checkpoints. The path for saving weights is set in this priority: 1. The ``ModelCheckpoint``'s ``dirpath`` if passed in - 2. The ``Trainer``'s ``weights_saved_path`` if passed in (deprecated) - 3. The ``Logger``'s ``log_dir`` if the trainer has loggers - 4. The ``Trainer``'s ``default_root_dir`` if the trainer has no loggers + 2. The ``Logger``'s ``log_dir`` if the trainer has loggers + 3. The ``Trainer``'s ``default_root_dir`` if the trainer has no loggers The path gets extended with subdirectory "checkpoints". 
""" @@ -587,11 +585,7 @@ def __resolve_ckpt_dir(self, trainer: "pl.Trainer") -> None: # short circuit if dirpath was passed to ModelCheckpoint return - # TODO: Remove weights_save_path logic here in v1.8 - if trainer._weights_save_path_internal != trainer.default_root_dir: - # the user has changed weights_save_path - ckpt_path = os.path.join(trainer._weights_save_path_internal, "checkpoints") - elif len(trainer.loggers) > 0: + if len(trainer.loggers) > 0: if trainer.loggers[0].save_dir is not None: save_dir = trainer.loggers[0].save_dir else: diff --git a/src/pytorch_lightning/trainer/connectors/callback_connector.py b/src/pytorch_lightning/trainer/connectors/callback_connector.py index 32d67d44ad44c..a1144c2912c95 100644 --- a/src/pytorch_lightning/trainer/connectors/callback_connector.py +++ b/src/pytorch_lightning/trainer/connectors/callback_connector.py @@ -32,7 +32,7 @@ from pytorch_lightning.callbacks.timer import Timer from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _PYTHON_GREATER_EQUAL_3_8_0, _PYTHON_GREATER_EQUAL_3_10_0 -from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_info +from pytorch_lightning.utilities.rank_zero import rank_zero_info _log = logging.getLogger(__name__) @@ -47,20 +47,12 @@ def on_trainer_init( enable_checkpointing: bool, enable_progress_bar: bool, default_root_dir: Optional[str], - weights_save_path: Optional[str], enable_model_summary: bool, max_time: Optional[Union[str, timedelta, Dict[str, int]]] = None, accumulate_grad_batches: Optional[Union[int, Dict[int, int]]] = None, ) -> None: # init folder paths for checkpoint + weights save callbacks self.trainer._default_root_dir = default_root_dir or os.getcwd() - if weights_save_path: - rank_zero_deprecation( - "Setting `Trainer(weights_save_path=)` has been deprecated in v1.6 and will be" - " removed in v1.8. 
Please pass ``dirpath`` directly to the `ModelCheckpoint` callback" - ) - - self.trainer._weights_save_path = weights_save_path or self.trainer._default_root_dir # init callbacks if isinstance(callbacks, Callback): diff --git a/src/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/src/pytorch_lightning/trainer/connectors/checkpoint_connector.py index 300f3c129243b..3ddb3572cc489 100644 --- a/src/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/src/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -56,8 +56,7 @@ def __init__(self, trainer: "pl.Trainer", resume_from_checkpoint: Optional[_PATH @property def _hpc_resume_path(self) -> Optional[str]: - # TODO: in v1.8 set this equal to self.trainer.default_root_dir - dir_path_hpc = self.trainer._weights_save_path_internal + dir_path_hpc = self.trainer.default_root_dir fs = get_filesystem(dir_path_hpc) if not fs.isdir(dir_path_hpc): return None diff --git a/src/pytorch_lightning/trainer/connectors/signal_connector.py b/src/pytorch_lightning/trainer/connectors/signal_connector.py index 17e11bfdf649d..3948736a6f5cf 100644 --- a/src/pytorch_lightning/trainer/connectors/signal_connector.py +++ b/src/pytorch_lightning/trainer/connectors/signal_connector.py @@ -68,8 +68,8 @@ def slurm_sigusr1_handler_fn(self, signum: _SIGNUM, frame: FrameType) -> None: # save logger to make sure we get all the metrics for logger in self.trainer.loggers: logger.finalize("finished") - # TODO: in v1.8 change this to use self.trainer.default_root_dir - hpc_save_path = self.trainer._checkpoint_connector.hpc_save_path(self.trainer._weights_save_path_internal) + + hpc_save_path = self.trainer._checkpoint_connector.hpc_save_path(self.trainer.default_root_dir) self.trainer.save_checkpoint(hpc_save_path) if self.trainer.is_global_zero: diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index fc0fc36238ed9..5af8d692ea03b 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -169,7 +169,6 @@ def __init__( sync_batchnorm: bool = False, precision: Union[int, str] = 32, enable_model_summary: bool = True, - weights_save_path: Optional[str] = None, # TODO: Remove in 1.8 num_sanity_val_steps: int = 2, resume_from_checkpoint: Optional[Union[Path, str]] = None, profiler: Optional[Union[Profiler, str]] = None, @@ -402,17 +401,6 @@ def __init__( enable_model_summary: Whether to enable model summarization by default. Default: ``True``. - weights_save_path: Where to save weights if specified. Will override default_root_dir - for checkpoints only. Use this if for whatever reason you need the checkpoints - stored in a different place than the logs written in `default_root_dir`. - Can be remote file paths such as `s3://mybucket/path` or 'hdfs://path/' - Defaults to `default_root_dir`. - - .. deprecated:: v1.6 - ``weights_save_path`` has been deprecated in v1.6 and will be removed in v1.8. Please pass - ``dirpath`` directly to the :class:`~pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint` - callback. - move_metrics_to_cpu: Whether to force internal logged metrics to be moved to cpu. This can save some gpu memory, but can make training slower. Use with attention. Default: ``False``. 
@@ -489,7 +477,6 @@ def __init__( enable_checkpointing, enable_progress_bar, default_root_dir, - weights_save_path, enable_model_summary, max_time, accumulate_grad_batches, @@ -2234,30 +2221,6 @@ def default_root_dir(self) -> str: return os.path.normpath(self._default_root_dir) return self._default_root_dir - @property - def weights_save_path(self) -> str: - """ - The default root location to save weights (checkpoints), e.g., when the - :class:`~pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint` does not define a file path. - - .. deprecated:: v1.6 - `Trainer.weights_save_path` has been deprecated in v1.6 and will be removed in v1.8. - """ - rank_zero_deprecation("`Trainer.weights_save_path` has been deprecated in v1.6 and will be removed in v1.8.") - return self._weights_save_path_internal - - # TODO: Remove _weights_save_path_internal in v1.8 - @property - def _weights_save_path_internal(self) -> str: - """This is an internal implementation of weights_save_path which allows weights_save_path to be used - internally by the framework without emitting a deprecation warning. - - To be removed in v1.8. - """ - if get_filesystem(self._weights_save_path).protocol == "file": - return os.path.normpath(self._weights_save_path) - return self._weights_save_path - @property def early_stopping_callback(self) -> Optional[EarlyStopping]: """The first :class:`~pytorch_lightning.callbacks.early_stopping.EarlyStopping` callback in the diff --git a/tests/tests_pytorch/checkpointing/test_model_checkpoint.py b/tests/tests_pytorch/checkpointing/test_model_checkpoint.py index ccda0d7bcc408..6d44bfe83be3d 100644 --- a/tests/tests_pytorch/checkpointing/test_model_checkpoint.py +++ b/tests/tests_pytorch/checkpointing/test_model_checkpoint.py @@ -1339,37 +1339,6 @@ def test_last_global_step_saved(): assert model_checkpoint._last_global_step_saved == 0 -# TODO: remove test_dirpath_weights_save_path in v1.8 -@pytest.mark.parametrize( - "logger_setting", - [ - False, - TensorBoardLogger(save_dir="logger1"), - [TensorBoardLogger(save_dir="logger1"), TensorBoardLogger(save_dir="logger2")], - ], -) -def test_dirpath_weights_save_path(tmpdir, logger_setting): - """Tests that the ModelCheckpoint.dirpath is set correctly when user specifies weights_save_path with no - loggers, one logger, and multiple loggers.""" - model = BoringModel() - mc = ModelCheckpoint(monitor="epoch", save_top_k=-1) - with pytest.deprecated_call(match=r"Setting `Trainer\(weights_save_path=\)` has been deprecated in v1.6"): - trainer = Trainer( - default_root_dir=tmpdir, - weights_save_path=tmpdir / "weights_save_path", - limit_train_batches=1, - limit_val_batches=1, - num_sanity_val_steps=0, - max_epochs=5, - check_val_every_n_epoch=2, - callbacks=mc, - enable_model_summary=False, - logger=logger_setting, - ) - trainer.fit(model) - assert mc.dirpath == tmpdir / "weights_save_path" / "checkpoints" - - @pytest.mark.parametrize("every_n_epochs", (0, 5)) def test_save_last_every_n_epochs_interaction(tmpdir, every_n_epochs): """Test that `save_last` ignores `every_n_epochs`.""" diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py index b9e36df94d669..d3e8c92822ae2 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py @@ -399,13 +399,6 @@ def on_pretrain_routine_end(self, trainer, pl_module): trainer.fit(model) -def test_v1_8_0_weights_save_path(tmpdir): - with pytest.deprecated_call(match=r"Setting 
`Trainer\(weights_save_path=\)` has been deprecated in v1.6"): - trainer = Trainer(weights_save_path=tmpdir) - with pytest.deprecated_call(match=r"`Trainer.weights_save_path` has been deprecated in v1.6"): - _ = trainer.weights_save_path - - @pytest.mark.flaky(reruns=3) @pytest.mark.parametrize(["action", "expected"], [("a", [3, 1]), ("b", [2]), ("c", [1])]) def test_simple_profiler_iterable_durations(tmpdir, action: str, expected: list): diff --git a/tests/tests_pytorch/loggers/test_all.py b/tests/tests_pytorch/loggers/test_all.py index 612d7bf035c2f..279a1aeab7e69 100644 --- a/tests/tests_pytorch/loggers/test_all.py +++ b/tests/tests_pytorch/loggers/test_all.py @@ -13,7 +13,6 @@ # limitations under the License. import contextlib import inspect -import os import pickle from unittest import mock from unittest.mock import ANY @@ -157,64 +156,6 @@ def log_metrics(self, metrics, step): assert log_metric_names == expected -@pytest.mark.parametrize("logger_class", ALL_LOGGER_CLASSES_WO_NEPTUNE) -def test_loggers_save_dir_and_weights_save_path_all(tmpdir, monkeypatch, logger_class): - """Test the combinations of save_dir, weights_save_path and default_root_dir.""" - - with contextlib.ExitStack() as stack: - for mgr in LOGGER_CTX_MANAGERS: - stack.enter_context(mgr) - _patch_comet_atexit(monkeypatch) - _test_loggers_save_dir_and_weights_save_path(tmpdir, CometLogger) - - -def _test_loggers_save_dir_and_weights_save_path(tmpdir, logger_class): - class TestLogger(logger_class): - # for this test it does not matter what these attributes are - # so we standardize them to make testing easier - @property - def version(self): - return "version" - - @property - def name(self): - return "name" - - model = BoringModel() - trainer_args = dict(default_root_dir=tmpdir, max_steps=3) - - # no weights_save_path given - save_dir = tmpdir / "logs" - weights_save_path = None - logger = TestLogger(**_get_logger_args(TestLogger, save_dir)) - trainer = Trainer(**trainer_args, logger=logger, weights_save_path=weights_save_path) - trainer.fit(model) - assert trainer._weights_save_path_internal == trainer.default_root_dir - assert trainer.checkpoint_callback.dirpath == os.path.join(str(logger.save_dir), "name", "version", "checkpoints") - assert trainer.default_root_dir == tmpdir - - # with weights_save_path given, the logger path and checkpoint path should be different - save_dir = tmpdir / "logs" - weights_save_path = tmpdir / "weights" - logger = TestLogger(**_get_logger_args(TestLogger, save_dir)) - with pytest.deprecated_call(match=r"Setting `Trainer\(weights_save_path=\)` has been deprecated in v1.6"): - trainer = Trainer(**trainer_args, logger=logger, weights_save_path=weights_save_path) - trainer.fit(model) - assert trainer._weights_save_path_internal == weights_save_path - assert trainer.logger.save_dir == save_dir - assert trainer.checkpoint_callback.dirpath == weights_save_path / "checkpoints" - assert trainer.default_root_dir == tmpdir - - # no logger given - weights_save_path = tmpdir / "weights" - with pytest.deprecated_call(match=r"Setting `Trainer\(weights_save_path=\)` has been deprecated in v1.6"): - trainer = Trainer(**trainer_args, logger=False, weights_save_path=weights_save_path) - trainer.fit(model) - assert trainer._weights_save_path_internal == weights_save_path - assert trainer.checkpoint_callback.dirpath == weights_save_path / "checkpoints" - assert trainer.default_root_dir == tmpdir - - @pytest.mark.parametrize( "logger_class", ALL_LOGGER_CLASSES_WO_NEPTUNE ) # WandbLogger and NeptuneLogger 
get tested separately From e5998e6bf2169e484df07e2bbacf98ed21f19624 Mon Sep 17 00:00:00 2001 From: Max Ehrlich Date: Mon, 12 Sep 2022 15:24:35 -0400 Subject: [PATCH 122/193] Make the SLURM Preemption/Timeout Signal Configurable (#14626) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add parameter to change the preemption signal * Make the signal connector use the custom signal from SLURMEnvironment Signed-off-by: Max Ehrlich Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Adrian Wälchli --- .../source-pytorch/clouds/cluster_advanced.rst | 12 ++++++++++++ .../plugins/environments/slurm_environment.py | 9 ++++++++- src/pytorch_lightning/CHANGELOG.md | 4 ++++ .../trainer/connectors/signal_connector.py | 18 +++++++++++------- .../connectors/test_signal_connector.py | 14 ++++++++------ 5 files changed, 43 insertions(+), 14 deletions(-) diff --git a/docs/source-pytorch/clouds/cluster_advanced.rst b/docs/source-pytorch/clouds/cluster_advanced.rst index f1c5520466d66..b01aaeef9be7c 100644 --- a/docs/source-pytorch/clouds/cluster_advanced.rst +++ b/docs/source-pytorch/clouds/cluster_advanced.rst @@ -113,6 +113,18 @@ To get this behavior make sure to add the correct signal to your SLURM script # 90 seconds before training ends SBATCH --signal=SIGUSR1@90 +You can change this signal if your environment requires the use of a different one, for example + +.. code-block:: bash + + #SBATCH --signal=SIGHUP@90 + +Then, when you make your trainer, pass the `requeue_signal` option to the :class:`~pytorch_lightning.plugins.environments.slurm_environment.SLURMEnvironment` plugin: + +.. code-block:: python + + trainer = Trainer(plugins=[SLURMEnvironment(requeue_signal=signal.SIGHUP)]) + If auto-resubmit is not desired, it can be turned off in the :class:`~pytorch_lightning.plugins.environments.slurm_environment.SLURMEnvironment` plugin: .. code-block:: python diff --git a/src/lightning_lite/plugins/environments/slurm_environment.py b/src/lightning_lite/plugins/environments/slurm_environment.py index 5973453194a28..a69eea6a471f3 100644 --- a/src/lightning_lite/plugins/environments/slurm_environment.py +++ b/src/lightning_lite/plugins/environments/slurm_environment.py @@ -15,9 +15,11 @@ import logging import os import re +import signal from typing import Optional from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment +from lightning_lite.utilities.imports import _IS_WINDOWS log = logging.getLogger(__name__) @@ -28,11 +30,16 @@ class SLURMEnvironment(ClusterEnvironment): Args: auto_requeue: Whether automatic job resubmission is enabled or not. How and under which conditions a job gets rescheduled gets determined by the owner of this plugin. + requeue_signal: The signal that SLURM will send to indicate that the job should be requeued. Defaults to + SIGUSR1 on Unix. 
""" - def __init__(self, auto_requeue: bool = True) -> None: + def __init__(self, auto_requeue: bool = True, requeue_signal: Optional[signal.Signals] = None) -> None: super().__init__() self.auto_requeue = auto_requeue + if requeue_signal is None and not _IS_WINDOWS: + requeue_signal = signal.SIGUSR1 + self.requeue_signal = requeue_signal @property def creates_processes_externally(self) -> bool: diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 398b42fe71247..61fa4323a2422 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -6,6 +6,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ## [unReleased] - 2022-MM-DD + +- Added an option to configure the signal SLURM sends when a job is preempted or requeued ([#14610](https://github.com/Lightning-AI/lightning/issues/14610)) + + ### Added diff --git a/src/pytorch_lightning/trainer/connectors/signal_connector.py b/src/pytorch_lightning/trainer/connectors/signal_connector.py index 3948736a6f5cf..83a9e38ce0299 100644 --- a/src/pytorch_lightning/trainer/connectors/signal_connector.py +++ b/src/pytorch_lightning/trainer/connectors/signal_connector.py @@ -42,7 +42,7 @@ def __init__(self, trainer: "pl.Trainer") -> None: def register_signal_handlers(self) -> None: self._original_handlers = self._get_current_signal_handlers() - sigusr1_handlers: List[_HANDLER] = [] + sigusr_handlers: List[_HANDLER] = [] sigterm_handlers: List[_HANDLER] = [] if _fault_tolerant_training(): @@ -51,19 +51,23 @@ def register_signal_handlers(self) -> None: environment = self.trainer._accelerator_connector.cluster_environment if isinstance(environment, SLURMEnvironment) and environment.auto_requeue: log.info("SLURM auto-requeueing enabled. 
Setting signal handlers.") - sigusr1_handlers.append(self.slurm_sigusr1_handler_fn) + sigusr_handlers.append(self.slurm_sigusr_handler_fn) sigterm_handlers.append(self.sigterm_handler_fn) - # signal.SIGUSR1 doesn't seem available on windows + # Windows seems to have signal incompatibilities if not self._is_on_windows(): - if sigusr1_handlers and not self._has_already_handler(signal.SIGUSR1): - self._register_signal(signal.SIGUSR1, HandlersCompose(sigusr1_handlers)) + sigusr = environment.requeue_signal if isinstance(environment, SLURMEnvironment) else signal.SIGUSR1 + + assert sigusr is not None + + if sigusr_handlers and not self._has_already_handler(sigusr): + self._register_signal(sigusr, HandlersCompose(sigusr_handlers)) if sigterm_handlers and not self._has_already_handler(signal.SIGTERM): self._register_signal(signal.SIGTERM, HandlersCompose(sigterm_handlers)) - def slurm_sigusr1_handler_fn(self, signum: _SIGNUM, frame: FrameType) -> None: - rank_zero_info("handling SIGUSR1") + def slurm_sigusr_handler_fn(self, signum: _SIGNUM, frame: FrameType) -> None: + rank_zero_info("handling auto-requeue signal") # save logger to make sure we get all the metrics for logger in self.trainer.loggers: diff --git a/tests/tests_pytorch/trainer/connectors/test_signal_connector.py b/tests/tests_pytorch/trainer/connectors/test_signal_connector.py index 4f8bee8398edc..21b9364d2ebf7 100644 --- a/tests/tests_pytorch/trainer/connectors/test_signal_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_signal_connector.py @@ -20,6 +20,7 @@ import pytest from lightning_lite.plugins.environments import SLURMEnvironment +from lightning_lite.utilities.imports import _IS_WINDOWS from pytorch_lightning import Trainer from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.trainer.connectors.signal_connector import SignalConnector @@ -78,8 +79,9 @@ def training_step(self, batch, batch_idx): @RunIf(skip_windows=True) @pytest.mark.parametrize("auto_requeue", (True, False)) -def test_auto_requeue_flag(auto_requeue): - trainer = Trainer(plugins=[SLURMEnvironment(auto_requeue=auto_requeue)]) +@pytest.mark.parametrize("requeue_signal", [signal.SIGUSR1, signal.SIGUSR2, signal.SIGHUP] if not _IS_WINDOWS else []) +def test_auto_requeue_custom_signal_flag(auto_requeue, requeue_signal): + trainer = Trainer(plugins=[SLURMEnvironment(auto_requeue=auto_requeue, requeue_signal=requeue_signal)]) connector = SignalConnector(trainer) connector.register_signal_handlers() @@ -88,12 +90,12 @@ def test_auto_requeue_flag(auto_requeue): assert len(sigterm_handlers) == 1 assert sigterm_handlers[0].__qualname__ == "SignalConnector.sigterm_handler_fn" - sigusr1_handlers = signal.getsignal(signal.SIGUSR1).signal_handlers - assert len(sigusr1_handlers) == 1 - assert sigusr1_handlers[0].__qualname__ == "SignalConnector.slurm_sigusr1_handler_fn" + sigusr_handlers = signal.getsignal(requeue_signal).signal_handlers + assert len(sigusr_handlers) == 1 + assert sigusr_handlers[0].__qualname__ == "SignalConnector.slurm_sigusr_handler_fn" else: assert signal.getsignal(signal.SIGTERM) is signal.SIG_DFL - assert signal.getsignal(signal.SIGUSR1) is signal.SIG_DFL + assert signal.getsignal(requeue_signal) is signal.SIG_DFL connector.teardown() From f73b31bf3ccd5cec28acbdbc8e29f5c8bbfa7aea Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 12 Sep 2022 20:18:26 +0000 Subject: [PATCH 123/193] Update traitlets requirement from <5.2.0 as strict in /requirements 
(#14666) * Update traitlets requirement from <5.2.0 as strict in /requirements Co-authored-by: Jirka Borovec --- requirements/app/base.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/app/base.txt b/requirements/app/base.txt index 003a174ef937e..460a2775f64bf 100644 --- a/requirements/app/base.txt +++ b/requirements/app/base.txt @@ -5,6 +5,6 @@ starsessions>=1.2.1, <2.0 # strict fsspec>=2022.5.0, <=2022.7.1 s3fs>=2022.5.0, <2022.8.3 croniter>=1.3.0, <1.4.0 # strict; TODO: for now until we find something more robust. -traitlets<5.2.0 # Traitlets 5.2.X fails: https://github.com/ipython/traitlets/issues/741 +traitlets<5.2.0 # strict; ToDo: Traitlets 5.2.X fails: https://github.com/ipython/traitlets/issues/741 arrow>=1.2.0, <=1.2.2 lightning-utilities==0.3.* From 4bd135a6f62c7089a0b9928e5ea85a6edc568063 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 12 Sep 2022 23:46:46 +0200 Subject: [PATCH 124/193] Remove deprecated `LoggerCollection` (#14283) Co-authored-by: Rohit Gupta Co-authored-by: Jirka Borovec --- src/pytorch_lightning/CHANGELOG.md | 3 + src/pytorch_lightning/core/module.py | 24 +---- src/pytorch_lightning/loggers/__init__.py | 4 +- src/pytorch_lightning/loggers/base.py | 5 - src/pytorch_lightning/loggers/logger.py | 97 +------------------ src/pytorch_lightning/trainer/trainer.py | 20 +--- .../core/test_lightning_module.py | 8 +- .../deprecated_api/test_remove_1-8.py | 26 +---- tests/tests_pytorch/loggers/test_logger.py | 87 +---------------- .../trainer/properties/test_loggers.py | 18 +--- 10 files changed, 21 insertions(+), 271 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 61fa4323a2422..7f840d247ed84 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -128,6 +128,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Removed the experimental `pytorch_lightning.utiltiies.meta` functions in favor of built-in https://github.com/pytorch/torchdistx support ([#13868](https://github.com/Lightning-AI/lightning/pull/13868)) +- Removed the deprecated `LoggerCollection`; `Trainer.logger` and `LightningModule.logger` now returns the first logger when more than one gets passed to the Trainer ([#14283](https://github.com/Lightning-AI/lightning/pull/14283)) + + - Removed the deprecated the `trainer.lr_schedulers` ([#14408](https://github.com/Lightning-AI/lightning/pull/14408)) diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py index 800d4be1d6e6a..46880add37737 100644 --- a/src/pytorch_lightning/core/module.py +++ b/src/pytorch_lightning/core/module.py @@ -18,7 +18,6 @@ import numbers import os import tempfile -import warnings import weakref from contextlib import contextmanager from pathlib import Path @@ -43,7 +42,7 @@ from pytorch_lightning.core.mixins import HyperparametersMixin from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.core.saving import ModelIO -from pytorch_lightning.loggers import Logger, LoggerCollection +from pytorch_lightning.loggers import Logger from pytorch_lightning.trainer.connectors.logger_connector.fx_validator import _FxValidator from pytorch_lightning.utilities import _IS_WINDOWS, _TORCH_GREATER_EQUAL_1_10, GradClipAlgorithmType from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -267,26 +266,7 @@ def truncated_bptt_steps(self, truncated_bptt_steps: int) -> None: @property def logger(self) -> Optional[Logger]: """Reference to the logger object in the Trainer.""" - # this should match the implementation of `trainer.logger` - # we don't reuse it so we can properly set the deprecation stacklevel - if self._trainer is None: - return None - loggers = self.trainer.loggers - if len(loggers) == 0: - return None - if len(loggers) == 1: - return loggers[0] - else: - if not self._running_torchscript: - rank_zero_deprecation( - "Using `lightning_module.logger` when multiple loggers are configured." 
- " This behavior will change in v1.8 when `LoggerCollection` is removed, and" - " `lightning_module.logger` will return the first logger available.", - stacklevel=5, - ) - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - return LoggerCollection(loggers) + return self._trainer.logger if self._trainer is not None else None @property def loggers(self) -> List[Logger]: diff --git a/src/pytorch_lightning/loggers/__init__.py b/src/pytorch_lightning/loggers/__init__.py index c97a7a09d9e7f..b9f5d2919d61e 100644 --- a/src/pytorch_lightning/loggers/__init__.py +++ b/src/pytorch_lightning/loggers/__init__.py @@ -17,13 +17,13 @@ from pytorch_lightning.loggers.base import LightningLoggerBase from pytorch_lightning.loggers.comet import _COMET_AVAILABLE, CometLogger # noqa: F401 from pytorch_lightning.loggers.csv_logs import CSVLogger -from pytorch_lightning.loggers.logger import Logger, LoggerCollection +from pytorch_lightning.loggers.logger import Logger from pytorch_lightning.loggers.mlflow import _MLFLOW_AVAILABLE, MLFlowLogger # noqa: F401 from pytorch_lightning.loggers.neptune import NeptuneLogger # noqa: F401 from pytorch_lightning.loggers.tensorboard import TensorBoardLogger from pytorch_lightning.loggers.wandb import WandbLogger # noqa: F401 -__all__ = ["CSVLogger", "LightningLoggerBase", "Logger", "LoggerCollection", "TensorBoardLogger"] +__all__ = ["CSVLogger", "LightningLoggerBase", "Logger", "TensorBoardLogger"] if _COMET_AVAILABLE: __all__.append("CometLogger") diff --git a/src/pytorch_lightning/loggers/base.py b/src/pytorch_lightning/loggers/base.py index 628a56609b34d..5cfe1545bba55 100644 --- a/src/pytorch_lightning/loggers/base.py +++ b/src/pytorch_lightning/loggers/base.py @@ -57,11 +57,6 @@ def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] super().__init__(*args, **kwargs) -class LoggerCollection(logger.LoggerCollection): - def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] - super().__init__(*args, **kwargs) - - class DummyExperiment(logger.DummyExperiment): def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] rank_zero_deprecation( diff --git a/src/pytorch_lightning/loggers/logger.py b/src/pytorch_lightning/loggers/logger.py index 56bf4660c29dd..6894b9c980f4b 100644 --- a/src/pytorch_lightning/loggers/logger.py +++ b/src/pytorch_lightning/loggers/logger.py @@ -20,7 +20,7 @@ from argparse import Namespace from collections import defaultdict from functools import wraps -from typing import Any, Callable, Dict, Generator, Iterable, List, Mapping, Optional, Sequence, Union +from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Union from weakref import ReferenceType import numpy as np @@ -212,97 +212,6 @@ def version(self) -> Optional[Union[int, str]]: """Return the experiment version.""" -class LoggerCollection(Logger): - """The :class:`LoggerCollection` class is used to iterate all logging actions over the given `logger_iterable`. - - .. deprecated:: v1.6 - `LoggerCollection` is deprecated in v1.6 and will be removed in v1.8. - Directly pass a list of loggers to the Trainer and access the list via the `trainer.loggers` attribute. - - Args: - logger_iterable: An iterable collection of loggers - """ - - def __init__(self, logger_iterable: Iterable[Logger]): - super().__init__() - self._logger_iterable = logger_iterable - rank_zero_deprecation( - "`LoggerCollection` is deprecated in v1.6 and will be removed in v1.8. 
Directly pass a list of loggers" - " to the Trainer and access the list via the `trainer.loggers` attribute." - ) - - def __getitem__(self, index: int) -> Logger: - return list(self._logger_iterable)[index] - - def after_save_checkpoint(self, checkpoint_callback: "ReferenceType[Checkpoint]") -> None: - for logger in self._logger_iterable: - logger.after_save_checkpoint(checkpoint_callback) - - def update_agg_funcs( - self, - agg_key_funcs: Optional[Mapping[str, Callable[[Sequence[float]], float]]] = None, - agg_default_func: Callable[[Sequence[float]], float] = np.mean, - ) -> None: - for logger in self._logger_iterable: - logger.update_agg_funcs(agg_key_funcs, agg_default_func) - - @property - def experiment(self) -> List[Any]: - """Returns a list of experiment objects for all the loggers in the logger collection.""" - return [logger.experiment for logger in self._logger_iterable] - - def agg_and_log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: - for logger in self._logger_iterable: - logger.agg_and_log_metrics(metrics=metrics, step=step) - - def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: - for logger in self._logger_iterable: - logger.log_metrics(metrics=metrics, step=step) - - def log_hyperparams(self, params: Union[Dict[str, Any], Namespace], *args: Any, **kwargs: Any) -> None: - for logger in self._logger_iterable: - logger.log_hyperparams(params, *args, **kwargs) - - def log_graph(self, model: "pl.LightningModule", input_array: Optional[Tensor] = None) -> None: - for logger in self._logger_iterable: - logger.log_graph(model, input_array) - - def log_text(self, *args: Any, **kwargs: Any) -> None: - for logger in self._logger_iterable: - logger.log_text(*args, **kwargs) - - def log_image(self, *args: Any, **kwargs: Any) -> None: - for logger in self._logger_iterable: - logger.log_image(*args, **kwargs) - - def save(self) -> None: - for logger in self._logger_iterable: - logger.save() - - def finalize(self, status: str) -> None: - for logger in self._logger_iterable: - logger.finalize(status) - - @property - def save_dir(self) -> Optional[str]: - """Returns ``None`` as checkpoints should be saved to default / chosen location when using multiple - loggers.""" - # Checkpoints should be saved to default / chosen location when using multiple loggers - return None - - @property - def name(self) -> str: - """Returns the unique experiment names for all the loggers in the logger collection joined by an - underscore.""" - return "_".join(dict.fromkeys(str(logger.name) for logger in self._logger_iterable)) - - @property - def version(self) -> str: - """Returns the unique experiment versions for all the loggers in the logger collection joined by an - underscore.""" - return "_".join(dict.fromkeys(str(logger.version) for logger in self._logger_iterable)) - - class DummyExperiment: """Dummy experiment.""" @@ -355,10 +264,6 @@ def __getitem__(self, idx: int) -> "DummyLogger": # enables self.logger[0].experiment.add_image(...) 
return self - def __iter__(self) -> Generator[None, None, None]: - # if DummyLogger is substituting a logger collection, pretend it is empty - yield from () - def __getattr__(self, name: str) -> Callable: """Allows the DummyLogger to be called with arbitrary methods, to avoid AttributeErrors.""" diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index 5af8d692ea03b..aeb36f52639ac 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -56,7 +56,7 @@ from pytorch_lightning.core.datamodule import LightningDataModule from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.loggers import Logger -from pytorch_lightning.loggers.logger import DummyLogger, LoggerCollection +from pytorch_lightning.loggers.logger import DummyLogger from pytorch_lightning.loggers.tensorboard import TensorBoardLogger from pytorch_lightning.loops import PredictionLoop, TrainingEpochLoop from pytorch_lightning.loops.dataloader.evaluation_loop import EvaluationLoop @@ -2604,28 +2604,12 @@ def _active_loop(self) -> Optional[Union[FitLoop, EvaluationLoop, PredictionLoop @property def logger(self) -> Optional[Logger]: - loggers = self.loggers - if len(loggers) == 0: - return None - if len(loggers) == 1: - return loggers[0] - else: - rank_zero_deprecation( - "Using `trainer.logger` when multiple loggers are configured." - " This behavior will change in v1.8 when `LoggerCollection` is removed, and" - " `trainer.logger` will return the first logger available.", - stacklevel=5, - ) - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - return LoggerCollection(loggers) + return self.loggers[0] if len(self.loggers) > 0 else None @logger.setter def logger(self, logger: Optional[Logger]) -> None: if not logger: self.loggers = [] - elif isinstance(logger, LoggerCollection): - self.loggers = list(logger) else: self.loggers = [logger] diff --git a/tests/tests_pytorch/core/test_lightning_module.py b/tests/tests_pytorch/core/test_lightning_module.py index bdd3454c41e85..71f2e07d23709 100644 --- a/tests/tests_pytorch/core/test_lightning_module.py +++ b/tests/tests_pytorch/core/test_lightning_module.py @@ -79,7 +79,7 @@ def test_property_logger(tmpdir): assert model.logger is None logger = TensorBoardLogger(tmpdir) - trainer = Mock(loggers=[logger]) + trainer = Trainer(logger=logger) model.trainer = trainer assert model.logger == logger @@ -94,6 +94,12 @@ def test_property_loggers(tmpdir): model.trainer = trainer assert model.loggers == [logger] + logger0 = TensorBoardLogger(tmpdir) + logger1 = TensorBoardLogger(tmpdir) + trainer = Trainer(logger=[logger0, logger1]) + model.trainer = trainer + assert model.loggers == [logger0, logger1] + def test_1_optimizer_toggle_model(): """Test toggle_model runs when only one optimizer is used.""" diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py index d3e8c92822ae2..77e007951edb0 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py @@ -26,7 +26,7 @@ from pytorch_lightning import Callback, Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.demos.boring_classes import BoringDataModule, BoringModel -from pytorch_lightning.loggers import CSVLogger, Logger, LoggerCollection +from pytorch_lightning.loggers import CSVLogger, Logger from pytorch_lightning.plugins.precision.precision_plugin 
import PrecisionPlugin from pytorch_lightning.profiler import AbstractProfiler, BaseProfiler from pytorch_lightning.profilers import AdvancedProfiler, Profiler, SimpleProfiler @@ -441,30 +441,6 @@ def _get_python_cprofile_total_duration(profile): np.testing.assert_allclose(recorded_total_duration, expected_total_duration, rtol=0.2) -def test_v1_8_0_logger_collection(tmpdir): - logger1 = CSVLogger(tmpdir) - logger2 = CSVLogger(tmpdir) - - trainer1 = Trainer(logger=logger1) - trainer2 = Trainer(logger=[logger1, logger2]) - - # Should have no deprecation warning - trainer1.logger - trainer1.loggers - trainer2.loggers - - with pytest.deprecated_call(match="logger` will return the first logger"): - _ = trainer2.logger - with pytest.deprecated_call(match="`LoggerCollection` is deprecated in v1.6"): - _ = LoggerCollection([logger1, logger2]) - - model = BoringModel() - trainer = Trainer(logger=[logger1, logger2]) - model.trainer = trainer - with pytest.deprecated_call(match="logger` will return the first logger"): - _ = model.logger - - def test_v1_8_0_precision_plugin_checkpoint_hooks(tmpdir): class PrecisionPluginSaveHook(PrecisionPlugin): def on_save_checkpoint(self, checkpoint): diff --git a/tests/tests_pytorch/loggers/test_logger.py b/tests/tests_pytorch/loggers/test_logger.py index d9a59b98125e8..f274ed1570464 100644 --- a/tests/tests_pytorch/loggers/test_logger.py +++ b/tests/tests_pytorch/loggers/test_logger.py @@ -15,7 +15,7 @@ from argparse import Namespace from copy import deepcopy from typing import Any, Dict, Optional -from unittest.mock import MagicMock, patch +from unittest.mock import patch import numpy as np import pytest @@ -23,77 +23,13 @@ from pytorch_lightning import Trainer from pytorch_lightning.demos.boring_classes import BoringDataModule, BoringModel -from pytorch_lightning.loggers import Logger, LoggerCollection, TensorBoardLogger +from pytorch_lightning.loggers import Logger, TensorBoardLogger from pytorch_lightning.loggers.logger import DummyExperiment, DummyLogger from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.logger import _convert_params, _sanitize_params from pytorch_lightning.utilities.rank_zero import rank_zero_only -def test_logger_collection(): - mock1 = MagicMock() - mock2 = MagicMock() - - with pytest.deprecated_call(match="`LoggerCollection` is deprecated in v1.6"): - logger = LoggerCollection([mock1, mock2]) - - assert logger[0] == mock1 - assert logger[1] == mock2 - - assert logger.experiment[0] == mock1.experiment - assert logger.experiment[1] == mock2.experiment - - assert logger.save_dir is None - - logger.update_agg_funcs({"test": np.mean}, np.sum) - mock1.update_agg_funcs.assert_called_once_with({"test": np.mean}, np.sum) - mock2.update_agg_funcs.assert_called_once_with({"test": np.mean}, np.sum) - - logger.log_metrics(metrics={"test": 2.0}, step=4) - mock1.log_metrics.assert_called_once_with(metrics={"test": 2.0}, step=4) - mock2.log_metrics.assert_called_once_with(metrics={"test": 2.0}, step=4) - - logger.finalize("success") - mock1.finalize.assert_called_once() - mock2.finalize.assert_called_once() - - -def test_logger_collection_unique_names(): - unique_name = "name1" - logger1 = CustomLogger(name=unique_name) - logger2 = CustomLogger(name=unique_name) - - with pytest.deprecated_call(match="`LoggerCollection` is deprecated in v1.6"): - logger = LoggerCollection([logger1, logger2]) - - assert logger.name == unique_name - - -def test_logger_collection_names_order(): - loggers = 
[CustomLogger(name=n) for n in ("name1", "name2", "name1", "name3")] - with pytest.deprecated_call(match="`LoggerCollection` is deprecated in v1.6"): - logger = LoggerCollection(loggers) - assert logger.name == f"{loggers[0].name}_{loggers[1].name}_{loggers[3].name}" - - -def test_logger_collection_unique_versions(): - unique_version = "1" - logger1 = CustomLogger(version=unique_version) - logger2 = CustomLogger(version=unique_version) - - with pytest.deprecated_call(match="`LoggerCollection` is deprecated in v1.6"): - logger = LoggerCollection([logger1, logger2]) - - assert logger.version == unique_version - - -def test_logger_collection_versions_order(): - loggers = [CustomLogger(version=v) for v in ("1", "2", "1", "3")] - with pytest.deprecated_call(match="`LoggerCollection` is deprecated in v1.6"): - logger = LoggerCollection(loggers) - assert logger.version == f"{loggers[0].version}_{loggers[1].version}_{loggers[3].version}" - - class CustomLogger(Logger): def __init__(self, experiment: str = "test", name: str = "name", version: str = "1"): super().__init__() @@ -231,25 +167,6 @@ def validation_epoch_end(self, outputs): trainer.fit(model) -def test_dummyexperiment_support_indexing(): - """Test that the DummyExperiment can imitate indexing the experiment in a LoggerCollection.""" - experiment = DummyExperiment() - assert experiment[0] == experiment - - -def test_dummylogger_support_indexing(): - """Test that the DummyLogger can imitate indexing of a LoggerCollection.""" - logger = DummyLogger() - assert logger[0] == logger - - -def test_dummylogger_empty_iterable(): - """Test that DummyLogger represents an empty iterable.""" - logger = DummyLogger() - for _ in logger: - assert False - - def test_dummylogger_noop_method_calls(): """Test that the DummyLogger methods can be called with arbitrary arguments.""" logger = DummyLogger() diff --git a/tests/tests_pytorch/trainer/properties/test_loggers.py b/tests/tests_pytorch/trainer/properties/test_loggers.py index 986596fc8b0a3..e0e1057b77eb7 100644 --- a/tests/tests_pytorch/trainer/properties/test_loggers.py +++ b/tests/tests_pytorch/trainer/properties/test_loggers.py @@ -15,7 +15,7 @@ import pytest from pytorch_lightning import Trainer -from pytorch_lightning.loggers import LoggerCollection, TensorBoardLogger +from pytorch_lightning.loggers import TensorBoardLogger from tests_pytorch.loggers.test_logger import CustomLogger @@ -46,10 +46,6 @@ def test_trainer_loggers_setters(): """Test the behavior of setters for trainer.logger and trainer.loggers.""" logger1 = CustomLogger() logger2 = CustomLogger() - with pytest.deprecated_call(match="`LoggerCollection` is deprecated in v1.6"): - logger_collection = LoggerCollection([logger1, logger2]) - with pytest.deprecated_call(match="`LoggerCollection` is deprecated in v1.6"): - logger_collection_2 = LoggerCollection([logger2]) trainer = Trainer() assert type(trainer.logger) == TensorBoardLogger @@ -60,16 +56,6 @@ def test_trainer_loggers_setters(): assert trainer.logger == logger1 assert trainer.loggers == [logger1] - trainer.logger = logger_collection - with pytest.deprecated_call(match="logger` when multiple loggers are configured"): - assert trainer.logger._logger_iterable == logger_collection._logger_iterable - assert trainer.loggers == [logger1, logger2] - - # LoggerCollection of size 1 should result in trainer.logger becoming the contained logger. 
- trainer.logger = logger_collection_2 - assert trainer.logger == logger2 - assert trainer.loggers == [logger2] - trainer.logger = None assert trainer.logger is None assert trainer.loggers == [] @@ -77,8 +63,6 @@ def test_trainer_loggers_setters(): # Test setters for trainer.loggers trainer.loggers = [logger1, logger2] assert trainer.loggers == [logger1, logger2] - with pytest.deprecated_call(match="logger` when multiple loggers are configured"): - assert trainer.logger._logger_iterable == logger_collection._logger_iterable trainer.loggers = [logger1] assert trainer.loggers == [logger1] From 6e21f46fe80c46fbd9fd153a5ad3ce4075c62869 Mon Sep 17 00:00:00 2001 From: Dmitry Frolov Date: Mon, 12 Sep 2022 20:11:00 -0400 Subject: [PATCH 125/193] [CLI] Move storage from app prefix to project/app prefix (#14583) * Move storage from app prefix to project/app prefix: checking and legacy support * Changelog message Co-authored-by: Jirka Borovec --- src/lightning_app/CHANGELOG.md | 3 +-- src/lightning_app/storage/path.py | 6 ++++++ tests/tests_app/storage/test_path.py | 20 ++++++++++++++++++-- 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index b5cdb5b37b136..a600c454cb179 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -14,7 +14,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Changed -- +- Application storage prefix moved from `app_id` to `project_id/app_id` ([#14583](https://github.com/Lightning-AI/lightning/pull/14583)) ### Deprecated @@ -29,7 +29,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - - ### Fixed - Resolved `LightningApp(..., debug=True)` ([#14464](https://github.com/Lightning-AI/lightning/pull/14464)) diff --git a/src/lightning_app/storage/path.py b/src/lightning_app/storage/path.py index 35abc25191cb4..523565f9ed16e 100644 --- a/src/lightning_app/storage/path.py +++ b/src/lightning_app/storage/path.py @@ -385,6 +385,12 @@ def shared_storage_path() -> pathlib.Path: ``SHARED_MOUNT_DIRECTORY`` environment variable. In the cloud, the shared path will point to a S3 bucket. All Works have access to this shared dropbox. """ + storage_path = os.getenv("LIGHTNING_STORAGE_PATH", "") + if storage_path != "": + return pathlib.Path(storage_path) + + # TODO[dmitsf]: this logic is still needed for compatibility reasons. + # We should remove it after some time. 
bucket_name = os.getenv("LIGHTNING_BUCKET_NAME", "") app_id = os.getenv("LIGHTNING_CLOUD_APP_ID", "") diff --git a/tests/tests_app/storage/test_path.py b/tests/tests_app/storage/test_path.py index 39ac073cc82d3..3b87fef3a2dab 100644 --- a/tests/tests_app/storage/test_path.py +++ b/tests/tests_app/storage/test_path.py @@ -4,7 +4,7 @@ import pickle from re import escape from time import sleep -from unittest import mock +from unittest import mock, TestCase from unittest.mock import MagicMock, Mock import pytest @@ -490,7 +490,6 @@ def test_path_as_argument_to_run_method(): def test_path_get_errors(tmpdir): with _context("work"): - with pytest.raises( RuntimeError, match="Trying to get the file .* but the path is not attached to a LightningApp" ): @@ -704,3 +703,20 @@ def test_filesystem(monkeypatch): assert fs._mock_new_parent._mock_mock_calls[0].kwargs["secret"] == "d" assert not fs._mock_new_parent._mock_mock_calls[0].kwargs["use_ssl"] assert fs._mock_new_parent._mock_mock_calls[0].kwargs["client_kwargs"] == {"endpoint_url": "a"} + + +class TestSharedStoragePath(TestCase): + @mock.patch.dict(os.environ, {"LIGHTNING_STORAGE_PATH": "test-bucket/lightningapps/test-project/test-app"}) + def test_shared_storage_path_storage_path_set(self): + self.assertEqual(pathlib.Path("test-bucket/lightningapps/test-project/test-app"), shared_storage_path()) + + @mock.patch.dict(os.environ, {"LIGHTNING_CLOUD_APP_ID": "test-app", "LIGHTNING_BUCKET_NAME": "test-bucket"}) + def test_shared_storage_path_bucket_and_app_id_set(self): + self.assertEqual(pathlib.Path("test-bucket/lightningapps/test-app"), shared_storage_path()) + + @mock.patch.dict(os.environ, {"SHARED_MOUNT_DIRECTORY": "test-app/.shared"}) + def test_shared_storage_path_mount_directory_set(self): + self.assertTrue(shared_storage_path().match("*/test-app/.shared")) + + def test_shared_storage_path_no_envvars_set(self): + self.assertTrue(shared_storage_path().match("*/.shared")) From 4b77adba64728cc9d7826be8c4728deb818dd5ed Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 13 Sep 2022 04:45:32 +0200 Subject: [PATCH 126/193] CI: run CLI after install (#14659) --- .github/actions/pkg-install/action.yml | 2 -- .github/workflows/ci-pkg-install.yml | 6 ++++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/actions/pkg-install/action.yml b/.github/actions/pkg-install/action.yml index a6cf0d659b28c..c39d11eca460b 100644 --- a/.github/actions/pkg-install/action.yml +++ b/.github/actions/pkg-install/action.yml @@ -31,7 +31,6 @@ runs: pip install *.tar.gz ${{ inputs.pip-flags }} pip list | grep lightning python -c "import ${PKG_NAME} ; print(${PKG_NAME}.__version__)" - pip uninstall -y ${PKG_NAME} shell: bash - name: Install | Uninstall package - wheel @@ -40,5 +39,4 @@ runs: pip install *.whl ${{ inputs.pip-flags }} pip list | grep lightning python -c "import ${PKG_NAME} ; print(${PKG_NAME}.__version__)" - pip uninstall -y ${PKG_NAME} shell: bash diff --git a/.github/workflows/ci-pkg-install.yml b/.github/workflows/ci-pkg-install.yml index 69c602d2e2913..36015eea599f7 100644 --- a/.github/workflows/ci-pkg-install.yml +++ b/.github/workflows/ci-pkg-install.yml @@ -96,6 +96,9 @@ jobs: pkg-name: "lightning" pip-flags: "-U --pre --find-links ../pypi/" + - name: Run CLI + run: python -m lightning --version + install-meta-pypi: runs-on: ${{ matrix.os }} strategy: @@ -143,3 +146,6 @@ jobs: with: pkg-name: "lightning" pip-flags: "-U --pre --find-links ../pypi/" + + - name: Run CLI + run: python -m lightning --version From 
bdec502f50b2f6b22f078b22baf6781882237bd1 Mon Sep 17 00:00:00 2001 From: Kushashwa Ravi Shrimali Date: Tue, 13 Sep 2022 14:59:03 +0530 Subject: [PATCH 127/193] Requirements: try Traitlets >= 5.3.0 (#14679) * Traitlets >= 5.3.0 * Apply suggestions from code review Co-authored-by: Jirka Borovec --- requirements/app/base.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/app/base.txt b/requirements/app/base.txt index 460a2775f64bf..d19c4ca207117 100644 --- a/requirements/app/base.txt +++ b/requirements/app/base.txt @@ -5,6 +5,6 @@ starsessions>=1.2.1, <2.0 # strict fsspec>=2022.5.0, <=2022.7.1 s3fs>=2022.5.0, <2022.8.3 croniter>=1.3.0, <1.4.0 # strict; TODO: for now until we find something more robust. -traitlets<5.2.0 # strict; ToDo: Traitlets 5.2.X fails: https://github.com/ipython/traitlets/issues/741 +traitlets>=5.3.0, <=5.4.0 arrow>=1.2.0, <=1.2.2 lightning-utilities==0.3.* From 1ee3d1eb72f61b05353e289727cf6e20791b4a64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 13 Sep 2022 12:53:46 +0200 Subject: [PATCH 128/193] Avoid warning when cloning tensor in self.log (#14599) Co-authored-by: Jirka Borovec --- src/pytorch_lightning/CHANGELOG.md | 4 ++++ src/pytorch_lightning/core/module.py | 6 +++++- .../trainer/logging_/test_train_loop_logging.py | 16 ++++++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 7f840d247ed84..073052341652e 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -169,12 +169,16 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed an issue to keep downscaling the batch size in case there hasn't been even a single successful optimal batch size with `mode="power"` ([#14372](https://github.com/Lightning-AI/lightning/pull/14372)) +- Fixed an issue where `self.log`-ing a tensor would create a user warning from PyTorch about cloning tensors ([#14599](https://github.com/Lightning-AI/lightning/pull/14599)) + + - Fixed compatibility when `torch.distributed` is not available ([#14454](https://github.com/Lightning-AI/lightning/pull/14454)) - Fixed torchscript error with ensembles of LightningModules ([#14657](https://github.com/Lightning-AI/lightning/pull/14657)) + ## [1.7.5] - 2022-09-06 ### Fixed diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py index 46880add37737..95aad6a2f8c94 100644 --- a/src/pytorch_lightning/core/module.py +++ b/src/pytorch_lightning/core/module.py @@ -549,7 +549,11 @@ def __check_allowed(v: Any, name: str, value: Any) -> None: raise ValueError(f"`self.log({name}, {value})` was called, but `{type(v).__name__}` values cannot be logged") def __to_tensor(self, value: Union[torch.Tensor, numbers.Number], name: str) -> Tensor: - value = torch.tensor(value, device=self.device) + value = ( + value.clone().detach().to(self.device) + if isinstance(value, torch.Tensor) + else torch.tensor(value, device=self.device) + ) if not torch.numel(value) == 1: raise ValueError( f"`self.log({name}, {value})` was called, but the tensor must have a single element." 
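For context on the ``__to_tensor`` change above, a small illustration of the PyTorch behavior it avoids (a sketch only; the exact warning wording depends on the installed PyTorch version):

```python
import torch

t = torch.tensor(1.0)

# Re-wrapping an existing tensor, as `self.log` previously did internally, emits a
# UserWarning along the lines of "it is recommended to use sourceTensor.clone().detach()":
copied = torch.tensor(t)

# The warning-free pattern now used for tensor inputs:
copied = t.clone().detach()
```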
diff --git a/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py b/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py index cd7f83ddc7bfe..6a2feae352c3b 100644 --- a/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py +++ b/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py @@ -32,6 +32,7 @@ from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests_pytorch.helpers.runif import RunIf +from tests_pytorch.helpers.utils import no_warning_call def test__training_step__log(tmpdir): @@ -626,6 +627,21 @@ def training_step(self, *args): trainer.fit(model) +def test_log_tensor_and_clone_no_torch_warning(tmpdir): + """Regression test for issue https://github.com/Lightning-AI/lightning/issues/14594.""" + + class TestModel(BoringModel): + def training_step(self, *args): + self.log("foo", torch.tensor(1)) + return super().training_step(*args) + + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=1) + model = TestModel() + match = r"recommended.*.clone\(\).detach\(\)" + with no_warning_call(UserWarning, match=match): + trainer.fit(model) + + def test_logging_raises(tmpdir): class TestModel(BoringModel): def training_step(self, batch, batch_idx): From 19a1274093f0a406d2a190c1947eab892ccdc106 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 13 Sep 2022 14:26:03 +0200 Subject: [PATCH 129/193] Better error message when dataloader and datamodule is None (V2) (#14637) --- src/pytorch_lightning/CHANGELOG.md | 5 ++- .../trainer/configuration_validator.py | 18 ---------- .../trainer/connectors/data_connector.py | 26 ++++++++++++++ .../trainer/connectors/test_data_connector.py | 35 +++++++++++++++++++ .../trainer/test_config_validator.py | 28 +-------------- 5 files changed, 66 insertions(+), 46 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 073052341652e..5b7bea5ce66ee 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -63,7 +63,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - The `pyDeprecate` dependency is no longer installed ([#14472](https://github.com/Lightning-AI/lightning/pull/14472)) -- When using multiple loggers, by default checkpoints and profiler output now get saved to the log dir of the first logger in the list ([14325](https://github.com/Lightning-AI/lightning/pull/14325)) +- When using multiple loggers, by default checkpoints and profiler output now get saved to the log dir of the first logger in the list ([#14325](https://github.com/Lightning-AI/lightning/pull/14325)) + + +- Improved the error messaging when passing `Trainer.method(model, x_dataloader=None)` with no module-method implementations available ([#14614](https://github.com/Lightning-AI/lightning/pull/14614)) diff --git a/src/pytorch_lightning/trainer/configuration_validator.py b/src/pytorch_lightning/trainer/configuration_validator.py index f1d86995d10c2..023ccb09bd974 100644 --- a/src/pytorch_lightning/trainer/configuration_validator.py +++ b/src/pytorch_lightning/trainer/configuration_validator.py @@ -69,16 +69,6 @@ def __verify_train_val_loop_configuration(trainer: "pl.Trainer", model: "pl.Ligh " `training_step()`, `train_dataloader()` and `configure_optimizers()` to be defined." 
) - # ----------------------------------- - # verify model has a train dataloader - # ----------------------------------- - has_train_dataloader = trainer._data_connector._train_dataloader_source.is_defined() - if not has_train_dataloader: - raise MisconfigurationException( - "No `train_dataloader()` method defined. Lightning `Trainer` expects as minimum a" - " `training_step()`, `train_dataloader()` and `configure_optimizers()` to be defined." - ) - # ----------------------------------- # verify model has optimizer # ----------------------------------- @@ -119,19 +109,11 @@ def __verify_train_val_loop_configuration(trainer: "pl.Trainer", model: "pl.Ligh def __verify_eval_loop_configuration(trainer: "pl.Trainer", model: "pl.LightningModule", stage: str) -> None: - loader_name = f"{stage}_dataloader" step_name = "validation_step" if stage == "val" else f"{stage}_step" trainer_method = "validate" if stage == "val" else stage - has_loader = getattr(trainer._data_connector, f"_{stage}_dataloader_source").is_defined() has_step = is_overridden(step_name, model) - # ----------------------------------- - # verify model has an eval_dataloader - # ----------------------------------- - if not has_loader: - raise MisconfigurationException(f"No `{loader_name}()` method defined to run `Trainer.{trainer_method}`.") - # predict_step is not required to be overridden if stage == "predict": if model.predict_step is None: diff --git a/src/pytorch_lightning/trainer/connectors/data_connector.py b/src/pytorch_lightning/trainer/connectors/data_connector.py index 56ba809e105b2..b2a6dbe0c8a5a 100644 --- a/src/pytorch_lightning/trainer/connectors/data_connector.py +++ b/src/pytorch_lightning/trainer/connectors/data_connector.py @@ -139,6 +139,17 @@ def attach_data( predict_dataloaders=predict_dataloaders, ) self.attach_datamodule(model, datamodule=datamodule) + + # Validate that the required data sources are available + if self.trainer.state.fn == TrainerFn.FITTING: + _check_dataloader_none(train_dataloaders, self._train_dataloader_source, self.trainer.state.fn) + elif self.trainer.state.fn == TrainerFn.VALIDATING: + _check_dataloader_none(val_dataloaders, self._val_dataloader_source, self.trainer.state.fn) + elif self.trainer.state.fn == TrainerFn.TESTING: + _check_dataloader_none(test_dataloaders, self._test_dataloader_source, self.trainer.state.fn) + elif self.trainer.state.fn == TrainerFn.PREDICTING: + _check_dataloader_none(predict_dataloaders, self._predict_dataloader_source, self.trainer.state.fn) + # set local properties on the model self._copy_trainer_model_properties(model) @@ -580,3 +591,18 @@ def get_instance(self, hook_name: str) -> Union["pl.LightningModule", "pl.Lightn " `LightningDataModule`. It will use the implementation from `LightningModule` instance." ) return self.model + + +def _check_dataloader_none( + dataloader: Optional[Union[TRAIN_DATALOADERS, EVAL_DATALOADERS]], + dataloader_source: _DataLoaderSource, + trainer_fn: TrainerFn, +) -> None: + # A prefix in the message to disambiguate between the train- and (optional) val dataloader that .fit() accepts + prefix = "train_" if trainer_fn == TrainerFn.FITTING else "" + if dataloader is None and not dataloader_source.is_defined(): + raise ValueError( + f"An invalid dataloader was passed to `Trainer.{trainer_fn}({prefix}dataloaders=...)`." + f" Either pass the dataloader to the `.{trainer_fn}()` method OR implement" + f" `def {dataloader_source.name}(self):` in your LightningModule/LightningDataModule." 
+ ) diff --git a/tests/tests_pytorch/trainer/connectors/test_data_connector.py b/tests/tests_pytorch/trainer/connectors/test_data_connector.py index 703ce8f053590..847922c05294a 100644 --- a/tests/tests_pytorch/trainer/connectors/test_data_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_data_connector.py @@ -570,3 +570,38 @@ def test_error_raised_with_insufficient_float_limit_train_dataloader(): match="Please increase the `limit_train_batches` argument. Try at least", ): trainer.reset_train_dataloader(model) + + +@pytest.mark.parametrize( + "trainer_fn_name, dataloader_name", + [ + ("fit", "train_dataloaders"), + ("validate", "dataloaders"), + ("test", "dataloaders"), + ("predict", "dataloaders"), + ], +) +def test_attach_data_input_validation_with_none_dataloader(trainer_fn_name, dataloader_name, tmpdir): + """Test that passing `Trainer.method(x_dataloader=None)` with no module-method implementations available raises + an error.""" + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True) + model = BoringModel() + datamodule = BoringDataModule() + trainer_fn = getattr(trainer, trainer_fn_name) + + # Pretend that these methods are not implemented + model.train_dataloader = None + model.val_dataloader = None + model.test_dataloader = None + model.predict_dataloader = None + + datamodule.train_dataloader = None + datamodule.val_dataloader = None + datamodule.test_dataloader = None + datamodule.predict_dataloader = None + + with pytest.raises(ValueError, match=f"An invalid .*dataloader was passed to `Trainer.{trainer_fn_name}"): + trainer_fn(model, **{dataloader_name: None}, datamodule=datamodule) + + with pytest.raises(ValueError, match=f"An invalid .*dataloader was passed to `Trainer.{trainer_fn_name}"): + trainer_fn(model, **{dataloader_name: None}, datamodule=None) diff --git a/tests/tests_pytorch/trainer/test_config_validator.py b/tests/tests_pytorch/trainer/test_config_validator.py index 7cc742eea845f..c4d34315364c5 100644 --- a/tests/tests_pytorch/trainer/test_config_validator.py +++ b/tests/tests_pytorch/trainer/test_config_validator.py @@ -23,17 +23,9 @@ def test_wrong_train_setting(tmpdir): - """ - * Test that an error is thrown when no `train_dataloader()` is defined - * Test that an error is thrown when no `training_step()` is defined - """ + """Test that an error is raised when no `training_step()` is defined.""" trainer = Trainer(default_root_dir=tmpdir, max_epochs=1) - with pytest.raises(MisconfigurationException, match=r"No `train_dataloader\(\)` method defined."): - model = BoringModel() - model.train_dataloader = None - trainer.fit(model) - with pytest.raises(MisconfigurationException, match=r"No `training_step\(\)` method defined."): model = BoringModel() model.training_step = None @@ -71,36 +63,18 @@ def test_eval_loop_config(tmpdir): """When either eval step or eval data is missing.""" trainer = Trainer(default_root_dir=tmpdir, max_epochs=1) - # has val step but no val data - model = BoringModel() - model.val_dataloader = None - with pytest.raises(MisconfigurationException, match=r"No `val_dataloader\(\)` method defined"): - trainer.validate(model) - # has test data but no val step model = BoringModel() model.validation_step = None with pytest.raises(MisconfigurationException, match=r"No `validation_step\(\)` method defined"): trainer.validate(model) - # has test loop but no test data - model = BoringModel() - model.test_dataloader = None - with pytest.raises(MisconfigurationException, match=r"No `test_dataloader\(\)` method defined"): - 
trainer.test(model) - # has test data but no test step model = BoringModel() model.test_step = None with pytest.raises(MisconfigurationException, match=r"No `test_step\(\)` method defined"): trainer.test(model) - # has predict step but no predict data - model = BoringModel() - model.predict_dataloader = None - with pytest.raises(MisconfigurationException, match=r"No `predict_dataloader\(\)` method defined"): - trainer.predict(model) - # has predict data but no predict_step model = BoringModel() model.predict_step = None From fd99cb27f27e2744153bc467232ab8a7027e3e7f Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 13 Sep 2022 22:38:14 +0900 Subject: [PATCH 130/193] Update CODEOWNER of CI/CD (#14676) Update CODEOWNER --- .github/CODEOWNERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index dc3b2187d0fd5..2460e2a71d761 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -9,6 +9,8 @@ # CI/CD and configs /.github/ @borda @carmocca @akihironitta @tchaton +/.circleci/ @borda @carmocca @akihironitta @tchaton +/.azure/ @borda @carmocca @akihironitta @tchaton /dockers/ @borda @carmocca @akihironitta @tchaton *.yml @borda @carmocca @akihironitta @tchaton From 16b4644e06c0be98df49cab23439f4618adc3428 Mon Sep 17 00:00:00 2001 From: Pritam Soni <23050213+pritamsoni-hsr@users.noreply.github.com> Date: Tue, 13 Sep 2022 19:19:09 +0530 Subject: [PATCH 131/193] feat: LAI2-10296 check if user has sufficient credits to run an app from the cli (#14285) --- src/lightning_app/cli/lightning_cli.py | 50 ++++++++++++++++++++------ src/lightning_app/runners/cloud.py | 32 +++++++++++++---- tests/tests_app/runners/test_cloud.py | 15 ++++++++ 3 files changed, 80 insertions(+), 17 deletions(-) diff --git a/src/lightning_app/cli/lightning_cli.py b/src/lightning_app/cli/lightning_cli.py index 9e11abafd62b6..c1e4be8cc85f0 100644 --- a/src/lightning_app/cli/lightning_cli.py +++ b/src/lightning_app/cli/lightning_cli.py @@ -6,6 +6,7 @@ import arrow import click import rich +from lightning_cloud.openapi import Externalv1LightningappInstance from requests.exceptions import ConnectionError from rich.color import ANSI_COLOR_NAMES @@ -36,10 +37,11 @@ logger = Logger(__name__) -def get_app_url(runtime_type: RuntimeType, *args) -> str: +def get_app_url(runtime_type: RuntimeType, *args, need_credits: bool = False) -> str: if runtime_type == RuntimeType.CLOUD: - lightning_app = args[0] - return f"{get_lightning_cloud_url()}/me/apps/{lightning_app.id}" + lightning_app: Externalv1LightningappInstance = args[0] + action = "?action=add_credits" if need_credits else "" + return f"{get_lightning_cloud_url()}/me/apps/{lightning_app.id}{action}" else: return "http://127.0.0.1:7501/view" @@ -302,9 +304,9 @@ def _run_app( env_vars = _format_input_env_variables(env) os.environ.update(env_vars) - def on_before_run(*args): + def on_before_run(*args, **kwargs): if open_ui and not without_server: - click.launch(get_app_url(runtime_type, *args)) + click.launch(get_app_url(runtime_type, *args, **kwargs)) click.echo("Your Lightning App is starting. 
This won't take long.") @@ -335,15 +337,26 @@ def run(): @click.argument("file", type=click.Path(exists=True)) @click.option("--cloud", type=bool, default=False, is_flag=True) @click.option( - "--cluster-id", type=str, default=None, help="Run Lightning App on a specific Lightning AI BYOC compute cluster" + "--cluster-id", + type=str, + default=None, + help="Run Lightning App on a specific Lightning AI BYOC compute cluster", ) @click.option("--name", help="The current application name", default="", type=str) @click.option("--without-server", is_flag=True, default=False) @click.option( - "--no-cache", is_flag=True, default=False, help="Disable caching of packages " "installed from requirements.txt" + "--no-cache", + is_flag=True, + default=False, + help="Disable caching of packages " "installed from requirements.txt", ) @click.option("--blocking", "blocking", type=bool, default=False) -@click.option("--open-ui", type=bool, default=True, help="Decide whether to launch the app UI in a web browser") +@click.option( + "--open-ui", + type=bool, + default=True, + help="Decide whether to launch the app UI in a web browser", +) @click.option("--env", type=str, default=[], multiple=True, help="Env variables to be set for the app.") @click.option("--app_args", type=str, default=[], multiple=True, help="Collection of arguments for the app.") def run_app( @@ -386,7 +399,12 @@ def install(): @install.command("app") @click.argument("name", type=str) -@click.option("--yes", "-y", is_flag=True, help="disables prompt to ask permission to create env and run install cmds") +@click.option( + "--yes", + "-y", + is_flag=True, + help="disables prompt to ask permission to create env and run install cmds", +) @click.option( "--version", "-v", @@ -416,7 +434,12 @@ def install_app(name, yes, version, overwrite: bool = False): @install.command("component") @click.argument("name", type=str) -@click.option("--yes", "-y", is_flag=True, help="disables prompt to ask permission to create env and run install cmds") +@click.option( + "--yes", + "-y", + is_flag=True, + help="disables prompt to ask permission to create env and run install cmds", +) @click.option( "--version", "-v", @@ -491,7 +514,12 @@ def init_component(name): @init.command("react-ui") -@click.option("--dest_dir", "-dest_dir", type=str, help="optional destination directory to create the react ui") +@click.option( + "--dest_dir", + "-dest_dir", + type=str, + help="optional destination directory to create the react ui", +) def init_react_ui(dest_dir): """Create a react UI to give a Lightning component a React.js web user interface (UI)""" cmd_react_ui_init.react_ui(dest_dir) diff --git a/src/lightning_app/runners/cloud.py b/src/lightning_app/runners/cloud.py index 59b342b166eb3..14f0f5794c27d 100644 --- a/src/lightning_app/runners/cloud.py +++ b/src/lightning_app/runners/cloud.py @@ -27,6 +27,7 @@ V1LightningappInstanceState, V1LightningworkDrives, V1LightningworkSpec, + V1Membership, V1Metadata, V1NetworkConfig, V1PackageManager, @@ -38,6 +39,7 @@ ) from lightning_cloud.openapi.rest import ApiException +from lightning_app.core.app import LightningApp from lightning_app.core.constants import CLOUD_UPLOAD_WARNING, DISABLE_DEPENDENCY_CACHE from lightning_app.runners.backends.cloud import CloudBackend from lightning_app.runners.runtime import Runtime @@ -224,6 +226,16 @@ def dispatch( repo.package() repo.upload(url=lightning_app_release.source_upload_url) + # check if user has sufficient credits to run an app + # if so set the desired state to running otherwise, 
create the app in stopped state, + # and open the admin ui to add credits and running the app. + has_sufficient_credits = self._project_has_sufficient_credits(project, app=self.app) + app_release_desired_state = ( + V1LightningappInstanceState.RUNNING if has_sufficient_credits else V1LightningappInstanceState.STOPPED + ) + if not has_sufficient_credits: + logger.warn("You may need Lightning credits to run your apps on the cloud.") + # right now we only allow a single instance of the app find_instances_resp = self.backend.client.lightningapp_instance_service_list_lightningapp_instances( project.project_id, app_id=lightning_app.id @@ -261,9 +273,7 @@ def dispatch( project_id=project.project_id, id=existing_instance.id, body=Body3( - spec=V1LightningappInstanceSpec( - desired_state=V1LightningappInstanceState.RUNNING, env=v1_env_vars - ) + spec=V1LightningappInstanceSpec(desired_state=app_release_desired_state, env=v1_env_vars) ), ) else: @@ -274,7 +284,7 @@ def dispatch( lightning_app_release.id, Body9( cluster_id=cluster_id, - desired_state=V1LightningappInstanceState.RUNNING, + desired_state=app_release_desired_state, name=lightning_app.name, env=v1_env_vars, ), @@ -285,7 +295,7 @@ def dispatch( sys.exit(1) if on_before_run: - on_before_run(lightning_app_instance) + on_before_run(lightning_app_instance, need_credits=not has_sufficient_credits) if lightning_app_instance.status.phase == V1LightningappInstanceState.FAILED: raise RuntimeError("Failed to create the application. Cannot upload the source code.") @@ -331,4 +341,14 @@ def _check_uploaded_folder(root: Path, repo: LocalSourceCodeDir) -> None: ) else: warning_msg += "\nYou can ignore some files or folders by adding them to `.lightningignore`." - logger.warn(warning_msg) + + logger.warning(warning_msg) + + def _project_has_sufficient_credits(self, project: V1Membership, app: Optional[LightningApp] = None): + """check if user has enough credits to run the app with its hardware if app is not passed return True if + user has 1 or more credits.""" + balance = project.balance + if balance is None: + balance = 0 # value is missing in some tests + + return balance >= 1 diff --git a/tests/tests_app/runners/test_cloud.py b/tests/tests_app/runners/test_cloud.py index 640eb9c114c2d..633a91359bbb3 100644 --- a/tests/tests_app/runners/test_cloud.py +++ b/tests/tests_app/runners/test_cloud.py @@ -686,3 +686,18 @@ def test_check_uploaded_folder(monkeypatch, tmpdir, caplog): with caplog.at_level(logging.WARN): backend._check_uploaded_folder(Path("."), repo) assert caplog.messages[0].startswith("Your application folder . is more than 2 MB. 
Found 5.0 MB") + + +@mock.patch("lightning_app.core.queues.QueuingSystem", MagicMock()) +@mock.patch("lightning_app.runners.backends.cloud.LightningClient", MagicMock()) +def test_project_has_sufficient_credits(): + app = mock.MagicMock(spec=LightningApp) + cloud_runtime = cloud.CloudRuntime(app=app, entrypoint_file="entrypoint.py") + credits_and_test_value = [ + [0.3, False], + [1, True], + [1.1, True], + ] + for balance, result in credits_and_test_value: + project = V1Membership(name="test-project1", project_id="test-project-id1", balance=balance) + assert cloud_runtime._project_has_sufficient_credits(project) is result From 96cc288f53b847f0020d53c3079821a03921d1fb Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Tue, 13 Sep 2022 16:50:36 +0200 Subject: [PATCH 132/193] (hot fix) Resolve Boring App (#14684) * resolve_boring_app * update * update * update * update * update * update * update * update * update * update * update * update * update --- src/lightning_app/CHANGELOG.md | 2 + src/lightning_app/cli/lightning_cli.py | 3 +- src/lightning_app/testing/testing.py | 64 +++++---------------- src/lightning_app/utilities/app_logs.py | 21 ++++--- tests/tests_app_examples/conftest.py | 44 -------------- tests/tests_app_examples/test_boring_app.py | 11 ++-- 6 files changed, 36 insertions(+), 109 deletions(-) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index a600c454cb179..7927372278e36 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -25,6 +25,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Resolved a bug where the state change detection using DeepDiff won't worked with Path, Drive objects ([#14465](https://github.com/Lightning-AI/lightning/pull/14465)) +- Resolved a bug where the wrong client was passed to collect cloud logs ([#14684](https://github.com/Lightning-AI/lightning/pull/14684)) + ### Removed - diff --git a/src/lightning_app/cli/lightning_cli.py b/src/lightning_app/cli/lightning_cli.py index c1e4be8cc85f0..89f342d0e24e3 100644 --- a/src/lightning_app/cli/lightning_cli.py +++ b/src/lightning_app/cli/lightning_cli.py @@ -32,6 +32,7 @@ from lightning_app.utilities.cloud import _get_project from lightning_app.utilities.cluster_logs import _cluster_logs_reader from lightning_app.utilities.login import Auth +from lightning_app.utilities.logs_socket_api import _LightningLogsSocketAPI from lightning_app.utilities.network import LightningClient logger = Logger(__name__) @@ -151,7 +152,7 @@ def logs(app_name: str, components: List[str], follow: bool) -> None: raise click.ClickException(f"Component '{component}' does not exist in app {app_name}.") log_reader = _app_logs_reader( - client=client, + logs_api_client=_LightningLogsSocketAPI(client.api_client), project_id=project.project_id, app_id=apps[app_name].id, component_names=components, diff --git a/src/lightning_app/testing/testing.py b/src/lightning_app/testing/testing.py index 34ea7aa6b660c..14bca57fe1a4a 100644 --- a/src/lightning_app/testing/testing.py +++ b/src/lightning_app/testing/testing.py @@ -6,7 +6,6 @@ import sys import tempfile import time -import traceback from contextlib import contextmanager from multiprocessing import Process from subprocess import Popen @@ -38,7 +37,6 @@ def _on_error_callback(ws_app, *_): - print(traceback.format_exc()) ws_app.close() @@ -210,7 +208,9 @@ def run_cli(args) -> Generator: @requires("playwright") @contextmanager -def run_app_in_cloud(app_folder: str, app_name: str = "app.py", 
extra_args: [str] = []) -> Generator: +def run_app_in_cloud( + app_folder: str, app_name: str = "app.py", extra_args: List[str] = [], debug: bool = True +) -> Generator: """This utility is used to automate testing e2e application with lightning_app.ai.""" # 1. Validate the provide app_folder is correct. if not os.path.exists(os.path.join(app_folder, "app.py")): @@ -239,7 +239,8 @@ def run_app_in_cloud(app_folder: str, app_name: str = "app.py", extra_args: [str with tempfile.TemporaryDirectory() as tmpdir: env_copy = os.environ.copy() env_copy["PACKAGE_LIGHTNING"] = "1" - env_copy["LIGHTNING_DEBUG"] = "1" + if debug: + env_copy["LIGHTNING_DEBUG"] = "1" shutil.copytree(app_folder, tmpdir, dirs_exist_ok=True) # TODO - add -no-cache to the command line. process = Popen( @@ -308,7 +309,7 @@ def run_app_in_cloud(app_folder: str, app_name: str = "app.py", extra_args: [str """, [LIGHTNING_CLOUD_PROJECT_ID], ) - admin_page.goto(f"{Config.url}/{Config.username}/apps") + admin_page.goto(f"{Config.url}/{Config.username}/apps", timeout=60 * 1000) # Closing the Create Project dialog. try: @@ -324,15 +325,6 @@ def run_app_in_cloud(app_folder: str, app_name: str = "app.py", extra_args: [str print("'Create Project' dialog not visible, skipping.") admin_page.locator(f"text={name}").click() - admin_page.evaluate( - """data => { - window.localStorage.setItem('gridUserId', data[0]); - window.localStorage.setItem('gridUserKey', data[1]); - window.localStorage.setItem('gridUserToken', data[2]); - } - """, - [Config.id, Config.key, token], - ) sleep(5) # Scroll to the bottom of the page. Used to capture all logs. admin_page.evaluate( @@ -361,8 +353,9 @@ def run_app_in_cloud(app_folder: str, app_name: str = "app.py", extra_args: [str assert len(lightning_apps) == 1 app_id = lightning_apps[0].id - process = Process(target=print_logs, kwargs={"app_id": app_id}) - process.start() + if debug: + process = Process(target=print_logs, kwargs={"app_id": app_id}) + process.start() while True: try: @@ -403,40 +396,11 @@ def fetch_logs(component_names: Optional[List[str]] = None) -> Generator: except KeyboardInterrupt: pass finally: - has_finished = False - while not has_finished: - try: - button = admin_page.locator('[data-cy="stop"]') - try: - button.wait_for(timeout=3 * 1000) - button.click() - except (playwright._impl._api_types.Error, playwright._impl._api_types.TimeoutError): - pass - context.close() - browser.close() - - list_lightningapps = client.lightningapp_instance_service_list_lightningapp_instances( - project.project_id - ) - - for lightningapp in list_lightningapps.lightningapps: - if lightningapp.name != name: - continue - try: - res = client.lightningapp_instance_service_delete_lightningapp_instance( - project_id=project.project_id, - id=lightningapp.id, - ) - assert res == {} - except ApiException as e: - print(f"Failed to delete {lightningapp.name}. 
Exception {e}") - - process.kill() - has_finished = True - except Exception: - pass - - Popen("lightning disconnect", shell=True).wait() + if debug: + process.kill() + + context.close() + browser.close() def wait_for(page, callback: Callable, *args, **kwargs) -> Any: diff --git a/src/lightning_app/utilities/app_logs.py b/src/lightning_app/utilities/app_logs.py index 0fbe359972852..0a63949fbd5ad 100644 --- a/src/lightning_app/utilities/app_logs.py +++ b/src/lightning_app/utilities/app_logs.py @@ -1,7 +1,6 @@ import json import queue from dataclasses import dataclass -from datetime import timedelta from threading import Thread from typing import Callable, Iterator, List, Optional @@ -87,18 +86,22 @@ def _app_logs_reader( th.start() # Print logs from queue when log event is available - user_log_start = "<<< BEGIN USER_RUN_FLOW SECTION >>>" - start_timestamp = None + flow = "Your app has started. View it in your browser" + work = "USER_RUN_WORK" + start_timestamps = {} # Print logs from queue when log event is available try: while True: - log_event = read_queue.get(timeout=None if follow else 1.0) - if user_log_start in log_event.message: - start_timestamp = log_event.timestamp + timedelta(seconds=0.5) - - if start_timestamp and log_event.timestamp > start_timestamp: - yield log_event + log_event: _LogEvent = read_queue.get(timeout=None if follow else 1.0) + token = flow if log_event.component_name == "flow" else work + if token in log_event.message: + start_timestamps[log_event.component_name] = log_event.timestamp + + timestamp = start_timestamps.get(log_event.component_name, None) + if timestamp and log_event.timestamp >= timestamp: + if "launcher" not in log_event.message: + yield log_event except queue.Empty: # Empty is raised by queue.get if timeout is reached. Follow = False case. diff --git a/tests/tests_app_examples/conftest.py b/tests/tests_app_examples/conftest.py index 40d2db2e020d8..e69de29bb2d1d 100644 --- a/tests/tests_app_examples/conftest.py +++ b/tests/tests_app_examples/conftest.py @@ -1,44 +0,0 @@ -import os - -from lightning_cloud.openapi.rest import ApiException - -from lightning_app.utilities.cloud import _get_project -from lightning_app.utilities.network import LightningClient - - -def pytest_timeout_cancel_timer(item): - """This hook deletes the Lightning App when timeout triggers.""" - - if item.name.endswith("_example_cloud"): - name = os.getenv("LIGHTNING_APP_NAME") - print(f"Timeout was triggered. Deleting the App {name}.") - - client = LightningClient() - project = _get_project(client) - - lightning_apps = [ - app - for app in client.lightningapp_instance_service_list_lightningapp_instances( - project.project_id - ).lightningapps - if app.name == name - ] - - if not lightning_apps: - return True - - assert len(lightning_apps) == 1 - - lightning_app = lightning_apps[0] - - try: - res = client.lightningapp_instance_service_delete_lightningapp_instance( - project_id=project.project_id, - id=lightning_app.id, - ) - assert res == {} - - except ApiException as e: - print(f"Failed to delete {name}. 
Exception {e}") - - return True diff --git a/tests/tests_app_examples/test_boring_app.py b/tests/tests_app_examples/test_boring_app.py index afb958571d16b..aa4c568b4f2a4 100644 --- a/tests/tests_app_examples/test_boring_app.py +++ b/tests/tests_app_examples/test_boring_app.py @@ -10,7 +10,11 @@ @pytest.mark.cloud def test_boring_app_example_cloud() -> None: - with run_app_in_cloud(os.path.join(_PROJECT_ROOT, "examples/app_boring/"), app_name="app_dynamic.py") as ( + with run_app_in_cloud( + os.path.join(_PROJECT_ROOT, "examples/app_boring/"), + app_name="app_dynamic.py", + debug=False, + ) as ( _, view_page, fetch_logs, @@ -19,15 +23,11 @@ def test_boring_app_example_cloud() -> None: def check_hello_there(*_, **__): locator = view_page.frame_locator("iframe").locator('ul:has-text("Hello there!")') - locator.wait_for(timeout=3 * 1000) if len(locator.all_text_contents()): return True wait_for(view_page, check_hello_there) - for _ in fetch_logs(): - pass - runner = CliRunner() result = runner.invoke(logs, [name]) lines = result.output.splitlines() @@ -35,3 +35,4 @@ def check_hello_there(*_, **__): assert result.exit_code == 0 assert result.exception is None assert any("http://0.0.0.0:8080" in line for line in lines) + print("Succeeded App!") From 616304831ad3960b133e8ddcfeab6d8c4bafe18c Mon Sep 17 00:00:00 2001 From: otaj <6065855+otaj@users.noreply.github.com> Date: Tue, 13 Sep 2022 16:52:09 +0200 Subject: [PATCH 133/193] Remove deprecated `BaseProfiler` and `AbstractProfiler` (#14404) Co-authored-by: Jirka Co-authored-by: Akihiro Nitta --- pyproject.toml | 1 - src/pytorch_lightning/CHANGELOG.md | 3 ++ src/pytorch_lightning/profilers/__init__.py | 4 +- src/pytorch_lightning/profilers/base.py | 48 ------------------- .../deprecated_api/test_remove_1-8.py | 29 +---------- 5 files changed, 5 insertions(+), 80 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5a8f632481127..dd48b8126a351 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,7 +52,6 @@ warn_no_return = "False" # mypy --no-error-summary 2>&1 | tr ':' ' ' | awk '{print $1}' | sort | uniq | sed 's/\.py//g; s|src/||g; s|\/|\.|g' | xargs -I {} echo '"{}",' module = [ "pytorch_lightning.callbacks.progress.rich_progress", - "pytorch_lightning.profilers.base", "pytorch_lightning.profilers.pytorch", "pytorch_lightning.trainer.trainer", "pytorch_lightning.tuner.batch_size_scaling", diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 5b7bea5ce66ee..58baf7d2a63cd 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -164,6 +164,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Removed the deprecated class `TrainerCallbackHookMixin` ([#14401](https://github.com/Lightning-AI/lightning/14401)) +- Removed the deprecated `BaseProfiler` and `AbstractProfiler` classes ([#14404](https://github.com/Lightning-AI/lightning/pull/14404)) + + ### Fixed - Reset the dataloaders on OOM failure in batch size finder to use the last successful batch size ([#14372](https://github.com/Lightning-AI/lightning/pull/14372)) diff --git a/src/pytorch_lightning/profilers/__init__.py b/src/pytorch_lightning/profilers/__init__.py index dad105135fa01..0e97d02feb202 100644 --- a/src/pytorch_lightning/profilers/__init__.py +++ b/src/pytorch_lightning/profilers/__init__.py @@ -12,15 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from pytorch_lightning.profilers.advanced import AdvancedProfiler -from pytorch_lightning.profilers.base import AbstractProfiler, BaseProfiler, PassThroughProfiler +from pytorch_lightning.profilers.base import PassThroughProfiler from pytorch_lightning.profilers.profiler import Profiler from pytorch_lightning.profilers.pytorch import PyTorchProfiler from pytorch_lightning.profilers.simple import SimpleProfiler from pytorch_lightning.profilers.xla import XLAProfiler __all__ = [ - "AbstractProfiler", - "BaseProfiler", "Profiler", "AdvancedProfiler", "PassThroughProfiler", diff --git a/src/pytorch_lightning/profilers/base.py b/src/pytorch_lightning/profilers/base.py index b91f628013a33..030205066d08b 100644 --- a/src/pytorch_lightning/profilers/base.py +++ b/src/pytorch_lightning/profilers/base.py @@ -12,56 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Profiler to check if there are any bottlenecks in your code.""" -from abc import ABC, abstractmethod -from typing import Any from pytorch_lightning.profilers.profiler import Profiler -from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation - - -class AbstractProfiler(ABC): - """Specification of a profiler. - - See deprecation warning below - - .. deprecated:: v1.6 - `AbstractProfiler` was deprecated in v1.6 and will be removed in v1.8. - Please use `Profiler` instead. - """ - - @abstractmethod - def start(self, action_name: str) -> None: - """Defines how to start recording an action.""" - - @abstractmethod - def stop(self, action_name: str) -> None: - """Defines how to record the duration once an action is complete.""" - - @abstractmethod - def summary(self) -> str: - """Create profiler summary in text format.""" - - @abstractmethod - def setup(self, **kwargs: Any) -> None: - """Execute arbitrary pre-profiling set-up steps as defined by subclass.""" - - @abstractmethod - def teardown(self, **kwargs: Any) -> None: - """Execute arbitrary post-profiling tear-down steps as defined by subclass.""" - - -class BaseProfiler(Profiler): - """ - .. deprecated:: v1.6 - `BaseProfiler` was deprecated in v1.6 and will be removed in v1.8. - Please use `Profiler` instead. - """ - - def __init__(self, *args, **kwargs): - rank_zero_deprecation( - "`BaseProfiler` was deprecated in v1.6 and will be removed in v1.8. Please use `Profiler` instead." 
- ) - super().__init__(*args, **kwargs) class PassThroughProfiler(Profiler): diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py index 77e007951edb0..2d51ed6c5c0d4 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py @@ -28,8 +28,7 @@ from pytorch_lightning.demos.boring_classes import BoringDataModule, BoringModel from pytorch_lightning.loggers import CSVLogger, Logger from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin -from pytorch_lightning.profiler import AbstractProfiler, BaseProfiler -from pytorch_lightning.profilers import AdvancedProfiler, Profiler, SimpleProfiler +from pytorch_lightning.profilers import AdvancedProfiler, SimpleProfiler from pytorch_lightning.strategies import ParallelStrategy from pytorch_lightning.strategies.ipu import LightningIPUModule from pytorch_lightning.trainer.configuration_validator import _check_datamodule_checkpoint_hooks @@ -469,10 +468,6 @@ def on_load_checkpoint(self, checkpoint): trainer.fit(model) -def test_v1_8_0_abstract_profiler(): - assert "`AbstractProfiler` was deprecated in v1.6" in AbstractProfiler.__doc__ - - def test_v1_8_0_datamodule_checkpointhooks(): class CustomBoringDataModuleSave(BoringDataModule): def on_save_checkpoint(self, checkpoint): @@ -641,28 +636,6 @@ def test_trainer_num_gpu_0(monkeypatch, gpus, expected_num_gpus, strategy): assert Trainer(gpus=gpus, strategy=strategy).num_gpus == expected_num_gpus -def test_v1_8_0_base_profiler(tmpdir): - class CustomProfiler1(BaseProfiler): - def start(self, action_name: str) -> None: - pass - - def stop(self, action_name: str) -> None: - pass - - class CustomProfiler2(Profiler): - def start(self, action_name: str) -> None: - pass - - def stop(self, action_name: str) -> None: - pass - - with pytest.deprecated_call(match="`BaseProfiler` was deprecated in v1.6"): - CustomProfiler1() - - # No deprecation message - CustomProfiler2() - - @pytest.mark.parametrize( ["trainer_kwargs", "expected_ipus"], [ From b24f5f164a6e1a060181c2d13bdd68025a73cef6 Mon Sep 17 00:00:00 2001 From: Laverne Henderson Date: Tue, 13 Sep 2022 08:04:50 -0700 Subject: [PATCH 134/193] Content for Lightning with iOS and Android (#14038) * Content for Lightning with iOS and Android * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * make link clickable * Update docs/source-app/glossary/ios_and_android.rst Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: awaelchli Co-authored-by: Jirka Borovec --- docs/source-app/glossary/ios_and_android.rst | 26 ++++++++++++++++++++ docs/source-app/index.rst | 1 + 2 files changed, 27 insertions(+) create mode 100644 docs/source-app/glossary/ios_and_android.rst diff --git a/docs/source-app/glossary/ios_and_android.rst b/docs/source-app/glossary/ios_and_android.rst new file mode 100644 index 0000000000000..90aeecbc0b141 --- /dev/null +++ b/docs/source-app/glossary/ios_and_android.rst @@ -0,0 +1,26 @@ + +############################################### +Apple and Android mobile devices with Lightning +############################################### + +Audience: Users who want to develop Lightning Apps for Apple or Android mobile devices. 
+ +---- + +*********************************************************** +Develop a Lightning App for Apple or Android mobile devices +*********************************************************** + +There are a couple of ways you can go about building Lightning Apps that work on Apple or Android mobile devices. + +Option 1 +^^^^^^^^ + +You can develop a Lightning App that interacts with an iOS or Android app. +The ML and backend services live on the Lightning App, but the iOS or Android code (obj-c/swift or android) lives on the mobile devices. + +Option 2 +^^^^^^^^ + +You can build a mobile-first React Lightning App that works on both Apple and Android mobile devices. +The `InVideo app `_ is a good example of a Lightning App that does just that. diff --git a/docs/source-app/index.rst b/docs/source-app/index.rst index af0e7eb350827..e9381860eae9a 100644 --- a/docs/source-app/index.rst +++ b/docs/source-app/index.rst @@ -271,6 +271,7 @@ Keep Learning Event Loop Environment Variables Frontend + Apple and Android mobile devices with Lighting Apps REST API Sharing Components Scheduling From c81a71c90807cbad40e0bb317241440200bbbf43 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Tue, 13 Sep 2022 17:43:15 +0200 Subject: [PATCH 135/193] Lightning App Fixes from Training Studio App dev (#14532) * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update --- .lightningignore | 17 +++++++++++++ src/lightning_app/CHANGELOG.md | 11 +++++---- src/lightning_app/cli/commands/connection.py | 3 ++- src/lightning_app/cli/lightning_cli.py | 19 +++++++++++---- src/lightning_app/core/api.py | 5 +++- src/lightning_app/core/app.py | 13 +++++++--- src/lightning_app/core/constants.py | 1 + src/lightning_app/runners/runtime.py | 5 ---- src/lightning_app/testing/testing.py | 16 ++++++++----- src/lightning_app/utilities/cli_helpers.py | 4 +++- src/lightning_app/utilities/commands/base.py | 6 +++-- src/lightning_app/utilities/proxies.py | 24 ++++++++++++++++++- tests/tests_app/cli/test_cmd_show_logs.py | 4 ++-- .../components/python/test_python.py | 4 ++-- 14 files changed, 99 insertions(+), 33 deletions(-) create mode 100644 .lightningignore diff --git a/.lightningignore b/.lightningignore new file mode 100644 index 0000000000000..f3bdf641c1425 --- /dev/null +++ b/.lightningignore @@ -0,0 +1,17 @@ +_notebooks +.azure +.circleci +.github +.ipynb_checkpoints +.pytest_cache +.shared +.storage +.venv +.vscode +.git +artifacts +Datasets +dist +docs +examples +tests diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 7927372278e36..2b8c69b276a4d 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -17,9 +17,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Application storage prefix moved from `app_id` to `project_id/app_id` ([#14583](https://github.com/Lightning-AI/lightning/pull/14583)) -### Deprecated +- Improve Lightning App connect logic by disconnecting automatically ([#14532](https://github.com/Lightning-AI/lightning/pull/14532)) -- ### Fixed @@ -27,11 +26,13 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Resolved a bug where the wrong client was passed to collect cloud logs ([#14684](https://github.com/Lightning-AI/lightning/pull/14684)) -### Removed -- +- Unification of app template: moved `app.py` to root dir for `lightning init app ` template ([#13853](https://github.com/Lightning-AI/lightning/pull/13853)) + + + +- Fixed a bug where the uploaded command file wasn't properly parsed ([#14532](https://github.com/Lightning-AI/lightning/pull/14532)) -### Fixed - Resolved `LightningApp(..., debug=True)` ([#14464](https://github.com/Lightning-AI/lightning/pull/14464)) diff --git a/src/lightning_app/cli/commands/connection.py b/src/lightning_app/cli/commands/connection.py index e4288219d3095..2c0cac195afa6 100644 --- a/src/lightning_app/cli/commands/connection.py +++ b/src/lightning_app/cli/commands/connection.py @@ -33,7 +33,8 @@ def connect(app_name_or_id: str, yes: bool = False): else: click.echo(f"You are already connected to the cloud Lightning App: {app_name_or_id}.") else: - click.echo("You are already connected to a Lightning App. Please, use `lightning disconnect`.") + disconnect() + connect(app_name_or_id, yes) elif app_name_or_id.startswith("localhost"): diff --git a/src/lightning_app/cli/lightning_cli.py b/src/lightning_app/cli/lightning_cli.py index 89f342d0e24e3..173b46251cdb0 100644 --- a/src/lightning_app/cli/lightning_cli.py +++ b/src/lightning_app/cli/lightning_cli.py @@ -57,7 +57,7 @@ def main(): if app_name: # 3: Handle development use case. is_local_app = app_name == "localhost" - if is_local_app and sys.argv[1:3] == ["run", "app"]: + if sys.argv[1:3] == ["run", "app"] or sys.argv[1:3] == ["show", "logs"]: _main() else: if is_local_app: @@ -147,9 +147,20 @@ def logs(app_name: str, components: List[str], follow: bool) -> None: if not components: components = app_component_names - for component in components: - if component not in app_component_names: - raise click.ClickException(f"Component '{component}' does not exist in app {app_name}.") + else: + + def add_prefix(c: str): + if c == "flow": + return c + if not c.startswith("root."): + return "root." 
+ c + return c + + components = [add_prefix(c) for c in components] + + for component in components: + if component not in app_component_names: + raise click.ClickException(f"Component '{component}' does not exist in app {app_name}.") log_reader = _app_logs_reader( logs_api_client=_LightningLogsSocketAPI(client.api_client), diff --git a/src/lightning_app/core/api.py b/src/lightning_app/core/api.py index faf0aad061978..a1b75dcea1383 100644 --- a/src/lightning_app/core/api.py +++ b/src/lightning_app/core/api.py @@ -21,7 +21,7 @@ from lightning_app.api.http_methods import HttpMethod from lightning_app.api.request_types import DeltaRequest -from lightning_app.core.constants import FRONTEND_DIR +from lightning_app.core.constants import ENABLE_STATE_WEBSOCKET, FRONTEND_DIR from lightning_app.core.queues import RedisQueue from lightning_app.utilities.app_helpers import InMemoryStateStore, Logger, StateStore from lightning_app.utilities.enum import OpenAPITags @@ -261,6 +261,9 @@ async def healthz(response: Response): @fastapi_service.websocket("/api/v1/ws") async def websocket_endpoint(websocket: WebSocket): await websocket.accept() + if not ENABLE_STATE_WEBSOCKET: + await websocket.close() + return try: counter = global_app_state_store.counter while True: diff --git a/src/lightning_app/core/app.py b/src/lightning_app/core/app.py index fad3d33b53c81..5fb693e0fe9b5 100644 --- a/src/lightning_app/core/app.py +++ b/src/lightning_app/core/app.py @@ -293,9 +293,16 @@ def _collect_deltas_from_ui_and_work_queues(self) -> t.List[t.Union[Delta, APIRe component_output: t.Optional[ComponentDelta] = self.get_state_changed_from_queue(self.delta_queue) if component_output: logger.debug(f"Received from {component_output.id} : {component_output.delta.to_dict()}") - work = self.get_component_by_name(component_output.id) - new_work_delta = _delta_to_app_state_delta(self.root, work, deepcopy(component_output.delta)) - deltas.append(new_work_delta) + + work = None + try: + work = self.get_component_by_name(component_output.id) + except (KeyError, AttributeError) as e: + logger.error(f"The component {component_output.id} couldn't be accessed. 
Exception: {e}") + + if work: + new_work_delta = _delta_to_app_state_delta(self.root, work, deepcopy(component_output.delta)) + deltas.append(new_work_delta) else: should_get_component_output = False diff --git a/src/lightning_app/core/constants.py b/src/lightning_app/core/constants.py index 5caf497513837..85b15f4daa5f6 100644 --- a/src/lightning_app/core/constants.py +++ b/src/lightning_app/core/constants.py @@ -33,6 +33,7 @@ DEBUG_ENABLED = bool(int(os.getenv("LIGHTNING_DEBUG", "0"))) LIGHTNING_COMPONENT_PUBLIC_REGISTRY = "https://lightning.ai/v1/components" LIGHTNING_APPS_PUBLIC_REGISTRY = "https://lightning.ai/v1/apps" +ENABLE_STATE_WEBSOCKET = bool(int(os.getenv("ENABLE_STATE_WEBSOCKET", "0"))) def get_lightning_cloud_url() -> str: diff --git a/src/lightning_app/runners/runtime.py b/src/lightning_app/runners/runtime.py index 3de5a8556cdfb..59387238843cf 100644 --- a/src/lightning_app/runners/runtime.py +++ b/src/lightning_app/runners/runtime.py @@ -92,11 +92,6 @@ def terminate(self) -> None: has_messaged = False while not self.done: try: - for work in self.app.works: - if not hasattr(work, "_has_called_on_exit"): - work.on_exit() - work._has_called_on_exit = True - if self.app.backend is not None: self.app.backend.stop_all_works(self.app.works) diff --git a/src/lightning_app/testing/testing.py b/src/lightning_app/testing/testing.py index 14bca57fe1a4a..dd209a98a8f86 100644 --- a/src/lightning_app/testing/testing.py +++ b/src/lightning_app/testing/testing.py @@ -232,6 +232,16 @@ def run_app_in_cloud( os.environ["LIGHTNING_APP_NAME"] = name + url = Config.url + if url.endswith("/"): + url = url[:-1] + payload = {"apiKey": Config.api_key, "username": Config.username} + res = requests.post(url + "/v1/auth/login", data=json.dumps(payload)) + if "token" not in res.json(): + raise Exception("You haven't properly setup your environment variables.") + + token = res.json()["token"] + # 3. Disconnect from the App if any. Popen("lightning disconnect", shell=True).wait() @@ -273,7 +283,6 @@ def run_app_in_cloud( # 6. Create chromium browser, auth to lightning_app.ai and yield the admin and view pages. with sync_playwright() as p: browser = p.chromium.launch(headless=bool(int(os.getenv("HEADLESS", "0")))) - payload = {"apiKey": Config.api_key, "username": Config.username, "duration": "120000"} context = browser.new_context( # Eventually this will need to be deleted http_credentials=HttpCredentials( @@ -283,11 +292,6 @@ def run_app_in_cloud( record_har_path=Config.har_location, ) admin_page = context.new_page() - url = Config.url - if url.endswith("/"): - url = url[:-1] - res = requests.post(url + "/v1/auth/login", data=json.dumps(payload)) - token = res.json()["token"] print(f"The Lightning App Token is: {token}") print(f"The Lightning App user key is: {Config.key}") print(f"The Lightning App user id is: {Config.id}") diff --git a/src/lightning_app/utilities/cli_helpers.py b/src/lightning_app/utilities/cli_helpers.py index 5c885360ce11f..9cdca64a71d9b 100644 --- a/src/lightning_app/utilities/cli_helpers.py +++ b/src/lightning_app/utilities/cli_helpers.py @@ -116,7 +116,9 @@ def _retrieve_application_url_and_available_commands(app_id_or_name_or_url: Opti raise Exception("The application is starting. Try in a few moments.") resp = requests.get(lightningapp.status.url + "/openapi.json") if resp.status_code != 200: - raise Exception(f"The server didn't process the request properly. Found {resp.json()}") + raise Exception( + "The server didn't process the request properly. 
" "Try once your application is ready." + ) return lightningapp.status.url, _extract_command_from_openapi(resp.json()), lightningapp.id return None, None, None diff --git a/src/lightning_app/utilities/commands/base.py b/src/lightning_app/utilities/commands/base.py index 512858f8d0ab9..b036929d1a687 100644 --- a/src/lightning_app/utilities/commands/base.py +++ b/src/lightning_app/utilities/commands/base.py @@ -60,9 +60,9 @@ def state(self): def run(self, **cli_kwargs) -> None: """Overrides with the logic to execute on the client side.""" - def invoke_handler(self, config: BaseModel) -> Dict[str, Any]: + def invoke_handler(self, config: Optional[BaseModel] = None) -> Dict[str, Any]: command = self.command_name.replace(" ", "_") - resp = requests.post(self.app_url + f"/command/{command}", data=config.json()) + resp = requests.post(self.app_url + f"/command/{command}", data=config.json() if config else None) assert resp.status_code == 200, resp.json() return resp.json() @@ -155,6 +155,7 @@ def _validate_client_command(command: ClientCommand): def _upload_command(command_name: str, command: ClientCommand) -> Optional[str]: from lightning_app.storage.path import _is_s3fs_available, filesystem, shared_storage_path + command_name = command_name.replace(" ", "_") filepath = f"commands/{command_name}.py" remote_url = str(shared_storage_path() / "artifacts" / filepath) fs = filesystem() @@ -164,6 +165,7 @@ def _upload_command(command_name: str, command: ClientCommand) -> Optional[str]: if not isinstance(fs, S3FileSystem): return + source_file = str(inspect.getfile(command.__class__)) remote_url = str(shared_storage_path() / "artifacts" / filepath) fs.put(source_file, remote_url) diff --git a/src/lightning_app/utilities/proxies.py b/src/lightning_app/utilities/proxies.py index 9691454bb2697..00a8f00f7a90d 100644 --- a/src/lightning_app/utilities/proxies.py +++ b/src/lightning_app/utilities/proxies.py @@ -405,8 +405,29 @@ def run_once(self): except BaseException as e: # 10.2 Send failed delta to the flow. reference_state = deepcopy(self.work.state) + exp, val, tb = sys.exc_info() + listing = traceback.format_exception(exp, val, tb) + user_exception = False + used_runpy = False + trace = [] + for p in listing: + if "runpy.py" in p: + trace = [] + used_runpy = True + if user_exception: + trace.append(p) + if "ret = work_run(*args, **kwargs)" in p: + user_exception = True + + if used_runpy: + trace = trace[1:] + self.work._calls[call_hash]["statuses"].append( - make_status(WorkStageStatus.FAILED, message=str(e), reason=WorkFailureReasons.USER_EXCEPTION) + make_status( + WorkStageStatus.FAILED, + message=str("\n".join(trace)), + reason=WorkFailureReasons.USER_EXCEPTION, + ) ) self.delta_queue.put( ComponentDelta( @@ -452,6 +473,7 @@ def _sigterm_signal_handler(self, signum, frame, call_hash: str) -> None: logger.info(f"Received SIGTERM signal. 
Gracefully terminating {self.work.name.replace('root.', '')}...") persist_artifacts(work=self.work) with _state_observer_lock: + self.work.on_exit() self.work._calls[call_hash]["statuses"] = [] state = deepcopy(self.work.state) self.work._calls[call_hash]["statuses"].append( diff --git a/tests/tests_app/cli/test_cmd_show_logs.py b/tests/tests_app/cli/test_cmd_show_logs.py index 0dc06025151fa..2093fa77d7021 100644 --- a/tests/tests_app/cli/test_cmd_show_logs.py +++ b/tests/tests_app/cli/test_cmd_show_logs.py @@ -7,7 +7,7 @@ @mock.patch("lightning_app.cli.lightning_cli.LightningClient") @mock.patch("lightning_app.cli.lightning_cli._get_project") -def test_show_logs_errors(project, client): +def test_show_logs_errors(_, client): """Test that the CLI prints the errors for the show logs command.""" runner = CliRunner() @@ -58,4 +58,4 @@ def test_show_logs_errors(project, client): result = runner.invoke(logs, ["MyFakeApp", "NonExistentComponent"]) assert result.exit_code == 1 - assert "Component 'NonExistentComponent' does not exist in app MyFakeApp." in result.output + assert "Component 'root.NonExistentComponent' does not exist in app MyFakeApp." in result.output diff --git a/tests/tests_app/components/python/test_python.py b/tests/tests_app/components/python/test_python.py index a8554e133e1a9..ba86bd487d94d 100644 --- a/tests/tests_app/components/python/test_python.py +++ b/tests/tests_app/components/python/test_python.py @@ -55,7 +55,7 @@ def test_popen_python_script_failure(): ) run_work_isolated(python_script) assert python_script.has_failed - assert python_script.status.message == "1" + assert "Exception(self.exit_code)" in python_script.status.message def test_tracer_python_script_with_kwargs(): @@ -96,7 +96,7 @@ def test_tracer_component_with_code(): python_script = TracerPythonScript("file.py", script_args=["--b=1"], raise_exception=False, code=code) run_work_isolated(python_script, params={"a": "1"}, restart_count=0) - assert python_script.status.message == "An error" + assert "An error" in python_script.status.message with open("file.py", "w") as f: f.write("import sys\n") From f68c0909fd70e4b7bfab08d76eea646a7d41acb9 Mon Sep 17 00:00:00 2001 From: Krishna Kalyan Date: Tue, 13 Sep 2022 18:11:45 +0200 Subject: [PATCH 136/193] Fix mypy errors attributed to `pytorch_lightning.profilers.pytorch` (#14405) * remove toml ref * fix conflicts * small fix * move assertion Co-authored-by: rohitgr7 --- pyproject.toml | 1 - src/pytorch_lightning/profilers/pytorch.py | 69 ++++++++++++++-------- 2 files changed, 44 insertions(+), 26 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index dd48b8126a351..777f86841adc3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,7 +52,6 @@ warn_no_return = "False" # mypy --no-error-summary 2>&1 | tr ':' ' ' | awk '{print $1}' | sort | uniq | sed 's/\.py//g; s|src/||g; s|\/|\.|g' | xargs -I {} echo '"{}",' module = [ "pytorch_lightning.callbacks.progress.rich_progress", - "pytorch_lightning.profilers.pytorch", "pytorch_lightning.trainer.trainer", "pytorch_lightning.tuner.batch_size_scaling", "pytorch_lightning.utilities.data", diff --git a/src/pytorch_lightning/profilers/pytorch.py b/src/pytorch_lightning/profilers/pytorch.py index c7f34fdc79d9c..475db682d953f 100644 --- a/src/pytorch_lightning/profilers/pytorch.py +++ b/src/pytorch_lightning/profilers/pytorch.py @@ -17,7 +17,7 @@ import os from functools import lru_cache, partial from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Type, TYPE_CHECKING, Union +from 
typing import Any, Callable, ContextManager, Dict, List, Optional, Type, TYPE_CHECKING, Union import torch from lightning_utilities.core.rank_zero import WarningCache @@ -42,7 +42,7 @@ log = logging.getLogger(__name__) warning_cache = WarningCache() -_PROFILER = Union[torch.autograd.profiler.profile, torch.cuda.profiler.profile, torch.autograd.profiler.emit_nvtx] +_PROFILER = Union[torch.profiler.profile, torch.autograd.profiler.profile, torch.autograd.profiler.emit_nvtx] class RegisterRecordFunction: @@ -111,13 +111,7 @@ def __init__(self, schedule: Callable) -> None: self._schedule = schedule self.reset() - def setup(self, start_action_name: str) -> None: - self._start_action_name = start_action_name - - def pre_step(self, current_action: str) -> None: - self._current_action = current_action - - def reset(self): + def reset(self) -> None: # handle properly `fast_dev_run`. PyTorch Profiler will fail otherwise. self._num_training_step = 0 self._num_validation_step = 0 @@ -132,20 +126,30 @@ def reset(self): self._prev_schedule_action: Optional[ProfilerAction] = None self._start_action_name: Optional[str] = None + def setup(self, start_action_name: str) -> None: + self._start_action_name = start_action_name + + def pre_step(self, current_action: str) -> None: + self._current_action = current_action + @property - def is_training(self): + def is_training(self) -> bool: + assert self._current_action is not None return self._current_action.endswith("training_step") @property - def is_validating(self): + def is_validating(self) -> bool: + assert self._current_action is not None return self._current_action.endswith("validation_step") @property - def is_testing(self): + def is_testing(self) -> bool: + assert self._current_action is not None return self._current_action.endswith("test_step") @property - def is_predicting(self): + def is_predicting(self) -> bool: + assert self._current_action is not None return self._current_action.endswith("predict_step") @property @@ -164,6 +168,7 @@ def _step(self) -> None: if self.is_training: self._num_training_step += 1 elif self.is_validating: + assert self._start_action_name is not None if self._start_action_name.endswith("on_fit_start"): if self._num_training_step > 0: self._num_validation_step += 1 @@ -238,7 +243,7 @@ def __init__( record_module_names: bool = True, **profiler_kwargs: Any, ) -> None: - """This profiler uses PyTorch's Autograd Profiler and lets you inspect the cost of. + r"""This profiler uses PyTorch's Autograd Profiler and lets you inspect the cost of. different operators inside your model - both on the CPU and GPU @@ -276,7 +281,7 @@ def __init__( record_module_names: Whether to add module names while recording autograd operation. - profiler_kwargs: Keyword arguments for the PyTorch profiler. This depends on your PyTorch version + \**profiler_kwargs: Keyword arguments for the PyTorch profiler. 
This depends on your PyTorch version Raises: MisconfigurationException: @@ -298,7 +303,7 @@ def __init__( self.function_events: Optional["EventList"] = None self._lightning_module: Optional["LightningModule"] = None # set by ProfilerConnector self._register: Optional[RegisterRecordFunction] = None - self._parent_profiler: Optional[_PROFILER] = None + self._parent_profiler: Optional[ContextManager] = None self._recording_map: Dict[str, record_function] = {} self._start_action_name: Optional[str] = None self._schedule: Optional[ScheduleWrapper] = None @@ -317,7 +322,7 @@ def _init_kineto(self, profiler_kwargs: Any) -> None: schedule = profiler_kwargs.get("schedule", None) if schedule is not None: - if not isinstance(schedule, Callable): + if not callable(schedule): raise MisconfigurationException(f"Schedule should be a callable. Found: {schedule}") action = schedule(0) if not isinstance(action, ProfilerAction): @@ -337,7 +342,9 @@ def _init_kineto(self, profiler_kwargs: Any) -> None: self._profiler_kwargs["with_stack"] = with_stack @property - def _total_steps(self) -> int: + def _total_steps(self) -> Union[int, float]: + assert self._schedule is not None + assert self._lightning_module is not None trainer = self._lightning_module.trainer if self._schedule.is_training: return trainer.num_training_batches @@ -358,13 +365,13 @@ def _should_override_schedule(self) -> bool: @staticmethod @lru_cache(1) - def _default_schedule() -> Optional[callable]: + def _default_schedule() -> Optional[Callable]: if _KINETO_AVAILABLE: # Those schedule defaults allow the profiling overhead to be negligible over training time. return torch.profiler.schedule(wait=1, warmup=1, active=3) def _default_activities(self) -> List["ProfilerActivity"]: - activities = [] + activities: List["ProfilerActivity"] = [] if not _KINETO_AVAILABLE: return activities if self._profiler_kwargs.get("use_cpu", True): @@ -411,6 +418,7 @@ def stop(self, action_name: str) -> None: return if self.profiler is not None and any(action_name.endswith(func) for func in self.STEP_FUNCTIONS): + assert isinstance(self.profiler, torch.profiler.profile) if self._schedule is not None: self._schedule.pre_step(action_name) @@ -424,11 +432,11 @@ def stop(self, action_name: str) -> None: self._schedule = None self.profiler.schedule = torch.profiler.profiler._default_schedule_fn - def on_trace_ready(profiler): + def on_trace_ready(profiler: _PROFILER) -> None: if self.dirpath is not None: if self._export_to_chrome: handler = tensorboard_trace_handler( - self.dirpath, self._prepare_filename(action_name=action_name, extension="") + str(self.dirpath), self._prepare_filename(action_name=action_name, extension="") ) handler(profiler) @@ -436,6 +444,7 @@ def on_trace_ready(profiler): path = os.path.join( self.dirpath, self._prepare_filename(action_name=action_name, extension=".stack") ) + assert isinstance(profiler, torch.autograd.profiler.profile) profiler.export_stacks(path, metric=self._metric) else: rank_zero_warn("The PyTorchProfiler failed to export trace as `dirpath` is None") @@ -469,8 +478,12 @@ def summary(self) -> str: return self._stats_to_str(recorded_stats) def _create_profilers(self) -> None: + if self.profiler is not None: + return + if self._emit_nvtx: - self._parent_profiler = self._create_profiler(torch.cuda.profiler.profile) + if self._parent_profiler is None: + self._parent_profiler = torch.cuda.profiler.profile() self.profiler = self._create_profiler(torch.autograd.profiler.emit_nvtx) else: self._parent_profiler = None @@ -486,7 +499,13 @@ 
def _create_profiler(self, profiler: Type[_PROFILER]) -> _PROFILER: def _cache_functions_events(self) -> None: if self._emit_nvtx: return - self.function_events = self.profiler.events() if _KINETO_AVAILABLE else self.profiler.function_events + + if _KINETO_AVAILABLE: + assert isinstance(self.profiler, torch.profiler.profile) + self.function_events = self.profiler.events() + else: + assert isinstance(self.profiler, torch.autograd.profiler.profile) + self.function_events = self.profiler.function_events def _delete_profilers(self) -> None: if self.profiler is not None: @@ -505,7 +524,7 @@ def _delete_profilers(self) -> None: self._register.__exit__(None, None, None) self._register = None - def teardown(self, stage: str) -> None: + def teardown(self, stage: Optional[str]) -> None: self._delete_profilers() for k in list(self._recording_map): From 313c338da1721c6bf87bbf00508fe2527badac65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Tue, 13 Sep 2022 19:47:51 +0200 Subject: [PATCH 137/193] Remove legacy examples from logging docs (#14686) --- .../visualize/logging_basic.rst | 35 +++---------------- 1 file changed, 4 insertions(+), 31 deletions(-) diff --git a/docs/source-pytorch/visualize/logging_basic.rst b/docs/source-pytorch/visualize/logging_basic.rst index 4628fd756b488..198639fd9cd4e 100644 --- a/docs/source-pytorch/visualize/logging_basic.rst +++ b/docs/source-pytorch/visualize/logging_basic.rst @@ -29,8 +29,8 @@ To track a metric, simply use the *self.log* method available inside the *Lightn class LitModel(pl.LightningModule): def training_step(self, batch, batch_idx): - value = self.global_step - self.log("some_value", self.global_step) + value = ... + self.log("some_value", value) To log multiple metrics at once, use *self.log_dict* @@ -50,7 +50,7 @@ To view metrics in the commandline progress bar, set the *prog_bar* argument to .. code-block:: python - self.log(prog_bar=True) + self.log(..., prog_bar=True) TODO: need progress bar here @@ -105,39 +105,12 @@ If you don't want to average you can also choose from ``{min,max,sum}`` by passi .. code-block:: python # default function - self.log(..., reduce_fx=torch.mean) + self.log(..., reduce_fx="mean") For other reductions, we recommend logging a :class:`torchmetrics.Metric` instance instead. ---- -************ -Track images -************ -If your *experiment manager* supports image visualization, simply *log* the image with *self.log* - -.. code-block:: python - - # (32 batch samples, 3 channels, 32 width, 32 height) - image = torch.Tensor(32, 3, 28, 28) - self.log("an_image", image) - ----- - -********** -Track text -********** -If your *experiment manager* supports text visualization, simply *log* the text with *self.log* - -.. 
code-block:: python - - text = "hello world" - self.log("some_text", text) - -# TODO: show screenshot - ----- - ****************************** Configure the saving directory ****************************** From fcd1a8d8c111779a1347878cf3d7f636b0ccc8f7 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 13 Sep 2022 20:03:49 +0200 Subject: [PATCH 138/193] CI: install more OS (#14660) --- .github/workflows/ci-pkg-install.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci-pkg-install.yml b/.github/workflows/ci-pkg-install.yml index 36015eea599f7..50e462e2c0f7d 100644 --- a/.github/workflows/ci-pkg-install.yml +++ b/.github/workflows/ci-pkg-install.yml @@ -33,7 +33,7 @@ jobs: fail-fast: true max-parallel: 1 matrix: - os: [ubuntu-20.04, macOS-11, windows-2022] + os: [ubuntu-20.04, ubuntu-22.04, macOS-11, macOS-12, windows-2022] pkg: ["app", "lite", "pytorch"] python-version: [3.8] # , 3.9 @@ -69,7 +69,7 @@ jobs: fail-fast: false # max-parallel: 1 matrix: - os: [ubuntu-20.04, macOS-11, windows-2022] + os: [ubuntu-20.04, ubuntu-22.04, macOS-11, macOS-12, windows-2022] pkg: ["", "lightning"] python-version: [3.8] # , 3.9 @@ -105,7 +105,7 @@ jobs: fail-fast: false # max-parallel: 1 matrix: - os: [ubuntu-20.04, macOS-11, windows-2022] + os: [ubuntu-20.04, ubuntu-22.04, macOS-11, macOS-12, windows-2022] python-version: [3.8] # , 3.9 steps: From bc9dd1fb2a87085bb31f465f2a749515e1858459 Mon Sep 17 00:00:00 2001 From: Jerome Anand <88475913+jerome-habana@users.noreply.github.com> Date: Wed, 14 Sep 2022 03:51:47 +0530 Subject: [PATCH 139/193] Break hpu graphs into two for better performance (#14656) Signed-off-by: Jerome Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Jirka Borovec Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> --- src/pytorch_lightning/CHANGELOG.md | 3 +++ .../strategies/hpu_parallel.py | 22 ++++++++++++++---- .../strategies/single_hpu.py | 23 +++++++++++++++---- 3 files changed, 40 insertions(+), 8 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 58baf7d2a63cd..2c58a555a8d6f 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -178,6 +178,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed an issue where `self.log`-ing a tensor would create a user warning from PyTorch about cloning tensors ([#14599](https://github.com/Lightning-AI/lightning/pull/14599)) +- Break HPU Graphs into two parts (forward + backward as one and optimizer as another) for better performance ([#14656](https://github.com/Lightning-AI/lightning/pull/14656)) + + - Fixed compatibility when `torch.distributed` is not available ([#14454](https://github.com/Lightning-AI/lightning/pull/14454)) diff --git a/src/pytorch_lightning/strategies/hpu_parallel.py b/src/pytorch_lightning/strategies/hpu_parallel.py index 96c66224ed72b..fdca6813c44f3 100644 --- a/src/pytorch_lightning/strategies/hpu_parallel.py +++ b/src/pytorch_lightning/strategies/hpu_parallel.py @@ -13,9 +13,11 @@ # limitations under the License. 
import logging import os -from typing import Any, Callable, Dict, List, Optional +from typing import Any, Callable, Dict, List, Optional, Union import torch.distributed +from torch.nn import Module +from torch.optim.optimizer import Optimizer import pytorch_lightning as pl from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment @@ -137,10 +139,22 @@ def broadcast(self, obj: object, src: int = 0) -> object: # type: ignore broadcast_object_list(obj, src, group=_group.WORLD) return obj[0] - def training_step_end(self, step_output: STEP_OUTPUT) -> STEP_OUTPUT: - # Break lazy accumulation of graph after every step + def on_after_backward(self) -> None: + # Break lazy accumulation of graph after fwd+bwd htcore.mark_step() - return step_output + + def optimizer_step( + self, + optimizer: Optimizer, + opt_idx: int, + closure: Callable[[], Any], + model: Optional[Union["pl.LightningModule", Module]] = None, + **kwargs: Any, + ) -> Any: + optimizer_output = super().optimizer_step(optimizer, opt_idx, closure, model, **kwargs) + # Break lazy accumulation of graph after optimizer + htcore.mark_step() + return optimizer_output def validation_step_end(self, step_output: STEP_OUTPUT) -> STEP_OUTPUT: # Break lazy accumulation of graph after every step diff --git a/src/pytorch_lightning/strategies/single_hpu.py b/src/pytorch_lightning/strategies/single_hpu.py index 1e91150cded22..5d6ead0358744 100644 --- a/src/pytorch_lightning/strategies/single_hpu.py +++ b/src/pytorch_lightning/strategies/single_hpu.py @@ -12,7 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, Optional +from typing import Any, Callable, Dict, Optional, Union + +from torch.nn import Module +from torch.optim.optimizer import Optimizer import pytorch_lightning as pl from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO @@ -79,10 +82,22 @@ def setup_optimizers(self, trainer: "pl.Trainer") -> None: def model_to_device(self) -> None: self.model.to(self.root_device) # type: ignore - def training_step_end(self, step_output: STEP_OUTPUT) -> STEP_OUTPUT: - # Break lazy accumulation of graph after every step + def on_after_backward(self) -> None: + # Break lazy accumulation of graph after fwd+bwd htcore.mark_step() - return step_output + + def optimizer_step( + self, + optimizer: Optimizer, + opt_idx: int, + closure: Callable[[], Any], + model: Optional[Union["pl.LightningModule", Module]] = None, + **kwargs: Any, + ) -> Any: + optimizer_output = super().optimizer_step(optimizer, opt_idx, closure, model, **kwargs) + # Break lazy accumulation of graph after optimizer + htcore.mark_step() + return optimizer_output def validation_step_end(self, step_output: STEP_OUTPUT) -> STEP_OUTPUT: # Break lazy accumulation of graph after every step From f3736f642a2871b4ee39716b618a6d96fbcdfeb2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 13 Sep 2022 22:22:58 +0000 Subject: [PATCH 140/193] Update ipython[all] requirement from <8.4.1 to <8.5.1 in /requirements (#14671) Updates the requirements on [ipython[all]](https://github.com/ipython/ipython) to permit the latest version. - [Release notes](https://github.com/ipython/ipython/releases) - [Commits](https://github.com/ipython/ipython/compare/rel-0.8.4...8.5.0) --- updated-dependencies: - dependency-name: ipython[all] dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements/pytorch/examples.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/pytorch/examples.txt b/requirements/pytorch/examples.txt index 7f6682c974a47..f4793d0123b15 100644 --- a/requirements/pytorch/examples.txt +++ b/requirements/pytorch/examples.txt @@ -3,4 +3,4 @@ torchvision>=0.10.*, <=0.13.0 gym[classic_control]>=0.17.0, <0.24.2 -ipython[all] <8.4.1 +ipython[all] <8.5.1 From 14b36f8109db63f4c9ed2480734ce5a0dbbc71b9 Mon Sep 17 00:00:00 2001 From: Krishna Kalyan Date: Wed, 14 Sep 2022 00:23:31 +0200 Subject: [PATCH 141/193] Fix incorrect imports in lightning docs (#14678) --- docs/source-app/workflows/add_web_ui/html/basic.rst | 6 +++--- .../add_web_ui/react/connect_react_and_lightning.rst | 4 ++-- docs/source-app/workflows/add_web_ui/streamlit/basic.rst | 6 +++--- .../workflows/add_web_ui/streamlit/intermediate.rst | 4 ++-- .../workflows/build_lightning_component/intermediate.rst | 5 ++--- 5 files changed, 12 insertions(+), 13 deletions(-) diff --git a/docs/source-app/workflows/add_web_ui/html/basic.rst b/docs/source-app/workflows/add_web_ui/html/basic.rst index 83b6fb7d13e21..114323ced660e 100644 --- a/docs/source-app/workflows/add_web_ui/html/basic.rst +++ b/docs/source-app/workflows/add_web_ui/html/basic.rst @@ -56,7 +56,7 @@ First **create a file named app.py** with the app content (in the same folder as class HelloComponent(L.LightningFlow): def configure_layout(self): - return L.frontend.web.StaticWebFrontend(serve_dir='.') + return L.app.frontend.web.StaticWebFrontend(serve_dir='.') class LitApp(L.LightningFlow): def __init__(self): @@ -107,7 +107,7 @@ Give the component an HTML UI, by returning a ``StaticWebFrontend`` object from class HelloComponent(L.LightningFlow): def configure_layout(self): - return L.frontend.web.StaticWebFrontend(serve_dir='.') + return L.app.frontend.web.StaticWebFrontend(serve_dir='.') class LitApp(L.LightningFlow): def __init__(self): @@ -137,7 +137,7 @@ In this case, we render the ``HelloComponent`` UI in the ``home`` tab of the app class HelloComponent(L.LightningFlow): def configure_layout(self): - return L.frontend.web.StaticWebFrontend(serve_dir='.') + return L.app.frontend.web.StaticWebFrontend(serve_dir='.') class LitApp(L.LightningFlow): def __init__(self): diff --git a/docs/source-app/workflows/add_web_ui/react/connect_react_and_lightning.rst b/docs/source-app/workflows/add_web_ui/react/connect_react_and_lightning.rst index c1c0c5e2017c8..74b252c1215c5 100644 --- a/docs/source-app/workflows/add_web_ui/react/connect_react_and_lightning.rst +++ b/docs/source-app/workflows/add_web_ui/react/connect_react_and_lightning.rst @@ -82,12 +82,12 @@ You can use this single react app for the FULL Lightning app, or you can specify class ComponentA(L.LightningFlow): def configure_layout(self): - return L.frontend.StaticWebFrontend(Path(__file__).parent / "react_app_1/dist") + return L.app.frontend.StaticWebFrontend(Path(__file__).parent / "react_app_1/dist") class ComponentB(L.LightningFlow): def configure_layout(self): - return L.frontend.StaticWebFrontend(Path(__file__).parent / "react_app_2/dist") + return L.app.frontend.StaticWebFrontend(Path(__file__).parent / "react_app_2/dist") class HelloLitReact(L.LightningFlow): diff --git a/docs/source-app/workflows/add_web_ui/streamlit/basic.rst b/docs/source-app/workflows/add_web_ui/streamlit/basic.rst index 
464b558032a69..6bad07f552b61 100644 --- a/docs/source-app/workflows/add_web_ui/streamlit/basic.rst +++ b/docs/source-app/workflows/add_web_ui/streamlit/basic.rst @@ -46,7 +46,7 @@ First **create a file named app.py** with the app content: class LitStreamlit(L.LightningFlow): def configure_layout(self): - return L.frontend.StreamlitFrontend(render_fn=your_streamlit_app) + return L.app.frontend.StreamlitFrontend(render_fn=your_streamlit_app) class LitApp(L.LightningFlow): def __init__(self): @@ -125,7 +125,7 @@ the ``configure_layout`` method of the Lightning component you want to connect t class LitStreamlit(L.LightningFlow): def configure_layout(self): - return L.frontend.StreamlitFrontend(render_fn=your_streamlit_app) + return L.app.frontend.StreamlitFrontend(render_fn=your_streamlit_app) class LitApp(L.LightningFlow): def __init__(self): @@ -160,7 +160,7 @@ In this case, we render the ``LitStreamlit`` UI in the ``home`` tab of the appli class LitStreamlit(L.LightningFlow): def configure_layout(self): - return L.frontend.StreamlitFrontend(render_fn=your_streamlit_app) + return L.app.frontend.StreamlitFrontend(render_fn=your_streamlit_app) class LitApp(L.LightningFlow): def __init__(self): diff --git a/docs/source-app/workflows/add_web_ui/streamlit/intermediate.rst b/docs/source-app/workflows/add_web_ui/streamlit/intermediate.rst index ac289c2eb27e1..2b828819d4d09 100644 --- a/docs/source-app/workflows/add_web_ui/streamlit/intermediate.rst +++ b/docs/source-app/workflows/add_web_ui/streamlit/intermediate.rst @@ -35,7 +35,7 @@ For example, here we increase the count variable of the Lightning Component ever self.count = 0 def configure_layout(self): - return L.frontend.StreamlitFrontend(render_fn=your_streamlit_app) + return L.app.frontend.StreamlitFrontend(render_fn=your_streamlit_app) class LitApp(L.LightningFlow): @@ -81,7 +81,7 @@ In this example we update the value of the counter from the component: self.count += 1 def configure_layout(self): - return L.frontend.StreamlitFrontend(render_fn=your_streamlit_app) + return L.app.frontend.StreamlitFrontend(render_fn=your_streamlit_app) class LitApp(L.LightningFlow): diff --git a/docs/source-app/workflows/build_lightning_component/intermediate.rst b/docs/source-app/workflows/build_lightning_component/intermediate.rst index 2533cbac35c77..01586c048b519 100644 --- a/docs/source-app/workflows/build_lightning_component/intermediate.rst +++ b/docs/source-app/workflows/build_lightning_component/intermediate.rst @@ -32,12 +32,11 @@ To *connect* this user interface to the Component, define the configure_layout m :emphasize-lines: 5, 6 import lightning as L - from lightning_app.frontend.web import StaticWebFrontend class LitHTMLComponent(L.LightningFlow): def configure_layout(self): - return StaticWebFrontend(serve_dir="path/to/folder/with/index.html/inside") + return L.app.frontend.StaticWebFrontend(serve_dir="path/to/folder/with/index.html/inside") Finally, route the Component's UI through the root Component's **configure_layout** method: @@ -50,7 +49,7 @@ Finally, route the Component's UI through the root Component's **configure_layou class LitHTMLComponent(L.LightningFlow): def configure_layout(self): - return L.frontend.web.StaticWebFrontend(serve_dir="path/to/folder/with/index.html/inside") + return L.app.frontend.web.StaticWebFrontend(serve_dir="path/to/folder/with/index.html/inside") class LitApp(L.LightningFlow): From d579733b081e0e082557ee8271ccbd3b1209270f Mon Sep 17 00:00:00 2001 From: Sherin Thomas Date: Wed, 14 Sep 2022 12:14:59 +0530 
Subject: [PATCH 142/193] Lightning cloud client call with key word arguments (#14685) --- src/lightning_app/CHANGELOG.md | 1 + src/lightning_app/cli/cmd_apps.py | 10 +++--- src/lightning_app/cli/commands/connection.py | 4 ++- src/lightning_app/cli/lightning_cli.py | 4 ++- src/lightning_app/runners/cloud.py | 18 +++++------ src/lightning_app/testing/testing.py | 4 +-- src/lightning_app/utilities/cli_helpers.py | 4 ++- src/lightning_app/utilities/cluster_logs.py | 5 ++- src/lightning_app/utilities/commands/base.py | 4 ++- tests/tests_app/cli/test_cloud_cli.py | 2 +- tests/tests_app/runners/test_cloud.py | 33 ++++++++++---------- tests/tests_app_examples/conftest.py | 0 12 files changed, 49 insertions(+), 40 deletions(-) delete mode 100644 tests/tests_app_examples/conftest.py diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 2b8c69b276a4d..3106b7a7a2683 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -15,6 +15,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Changed - Application storage prefix moved from `app_id` to `project_id/app_id` ([#14583](https://github.com/Lightning-AI/lightning/pull/14583)) +- LightningCloud client calls to use key word arguments instead of positional arguments ([#14685](https://github.com/Lightning-AI/lightning/pull/14685) - Improve Lightning App connect logic by disconnecting automatically ([#14532](https://github.com/Lightning-AI/lightning/pull/14532)) diff --git a/src/lightning_app/cli/cmd_apps.py b/src/lightning_app/cli/cmd_apps.py index b413a9effbc96..346fa16d1b34a 100644 --- a/src/lightning_app/cli/cmd_apps.py +++ b/src/lightning_app/cli/cmd_apps.py @@ -24,18 +24,18 @@ def __init__(self): def list(self, cluster_id: str = None, limit: int = 100): project = _get_project(self.api_client) - args = { + kwargs = { "project_id": project.project_id, "limit": limit, } if cluster_id is not None: - args["cluster_id"] = cluster_id + kwargs["cluster_id"] = cluster_id - resp = self.api_client.lightningapp_instance_service_list_lightningapp_instances(**args) + resp = self.api_client.lightningapp_instance_service_list_lightningapp_instances(**kwargs) apps = resp.lightningapps while resp.next_page_token is not None and resp.next_page_token != "": - args["page_token"] = resp.next_page_token - resp = self.api_client.lightningapp_instance_service_list_lightningapp_instances(**args) + kwargs["page_token"] = resp.next_page_token + resp = self.api_client.lightningapp_instance_service_list_lightningapp_instances(**kwargs) apps = apps + resp.lightningapps console = Console() console.print(_AppList(resp.lightningapps).as_table()) diff --git a/src/lightning_app/cli/commands/connection.py b/src/lightning_app/cli/commands/connection.py index 2c0cac195afa6..625b5b9fb3684 100644 --- a/src/lightning_app/cli/commands/connection.py +++ b/src/lightning_app/cli/commands/connection.py @@ -76,7 +76,9 @@ def connect(app_name_or_id: str, yes: bool = False): if not api_commands: client = LightningClient() project = _get_project(client) - lightningapps = client.lightningapp_instance_service_list_lightningapp_instances(project.project_id) + lightningapps = client.lightningapp_instance_service_list_lightningapp_instances( + project_id=project.project_id + ) click.echo( "We didn't find a matching App. Here are the available Apps that could be " f"connected to {[app.name for app in lightningapps.lightningapps]}." 
diff --git a/src/lightning_app/cli/lightning_cli.py b/src/lightning_app/cli/lightning_cli.py index 173b46251cdb0..faa0146cc6d8d 100644 --- a/src/lightning_app/cli/lightning_cli.py +++ b/src/lightning_app/cli/lightning_cli.py @@ -118,7 +118,9 @@ def logs(app_name: str, components: List[str], follow: bool) -> None: apps = { app.name: app - for app in client.lightningapp_instance_service_list_lightningapp_instances(project.project_id).lightningapps + for app in client.lightningapp_instance_service_list_lightningapp_instances( + project_id=project.project_id + ).lightningapps } if not apps: diff --git a/src/lightning_app/runners/cloud.py b/src/lightning_app/runners/cloud.py index 14f0f5794c27d..949665de8d28b 100644 --- a/src/lightning_app/runners/cloud.py +++ b/src/lightning_app/runners/cloud.py @@ -189,7 +189,7 @@ def dispatch( try: list_apps_resp = self.backend.client.lightningapp_v2_service_list_lightningapps_v2( - project.project_id, name=app_config.name + project_id=project.project_id, name=app_config.name ) if list_apps_resp.lightningapps: # There can be only one app with unique project_id<>name pair @@ -197,7 +197,7 @@ def dispatch( else: app_body = Body7(name=app_config.name, can_download_source_code=True) lightning_app = self.backend.client.lightningapp_v2_service_create_lightningapp_v2( - project.project_id, app_body + project_id=project.project_id, body=app_body ) release_body = Body8( @@ -214,7 +214,7 @@ def dispatch( self._ensure_cluster_project_binding(project.project_id, cluster_id) lightning_app_release = self.backend.client.lightningapp_v2_service_create_lightningapp_release( - project.project_id, lightning_app.id, release_body + project_id=project.project_id, app_id=lightning_app.id, body=release_body ) if cluster_id is not None: @@ -238,7 +238,7 @@ def dispatch( # right now we only allow a single instance of the app find_instances_resp = self.backend.client.lightningapp_instance_service_list_lightningapp_instances( - project.project_id, app_id=lightning_app.id + project_id=project.project_id, app_id=lightning_app.id ) if find_instances_resp.lightningapps: existing_instance = find_instances_resp.lightningapps[0] @@ -279,10 +279,10 @@ def dispatch( else: lightning_app_instance = ( self.backend.client.lightningapp_v2_service_create_lightningapp_release_instance( - project.project_id, - lightning_app.id, - lightning_app_release.id, - Body9( + project_id=project.project_id, + app_id=lightning_app.id, + id=lightning_app_release.id, + body=Body9( cluster_id=cluster_id, desired_state=app_release_desired_state, name=lightning_app.name, @@ -313,7 +313,7 @@ def _ensure_cluster_project_binding(self, project_id: str, cluster_id: str): return self.backend.client.projects_service_create_project_cluster_binding( - project_id, + project_id=project_id, body=V1ProjectClusterBinding(cluster_id=cluster_id, project_id=project_id), ) diff --git a/src/lightning_app/testing/testing.py b/src/lightning_app/testing/testing.py index dd209a98a8f86..3b9299b7df359 100644 --- a/src/lightning_app/testing/testing.py +++ b/src/lightning_app/testing/testing.py @@ -346,7 +346,7 @@ def run_app_in_cloud( lightning_apps = [ app for app in client.lightningapp_instance_service_list_lightningapp_instances( - project.project_id + project_id=project.project_id ).lightningapps if app.name == name ] @@ -444,7 +444,7 @@ def delete_cloud_lightning_apps(): print(f"deleting apps for pr_number: {pr_number}, app_name: {app_name}") project = _get_project(client) - list_lightningapps = 
client.lightningapp_instance_service_list_lightningapp_instances(project.project_id) + list_lightningapps = client.lightningapp_instance_service_list_lightningapp_instances(project_id=project.project_id) print([lightningapp.name for lightningapp in list_lightningapps.lightningapps]) diff --git a/src/lightning_app/utilities/cli_helpers.py b/src/lightning_app/utilities/cli_helpers.py index 9cdca64a71d9b..41bffaa743614 100644 --- a/src/lightning_app/utilities/cli_helpers.py +++ b/src/lightning_app/utilities/cli_helpers.py @@ -103,7 +103,9 @@ def _retrieve_application_url_and_available_commands(app_id_or_name_or_url: Opti else: client = LightningClient() project = _get_project(client) - list_lightningapps = client.lightningapp_instance_service_list_lightningapp_instances(project.project_id) + list_lightningapps = client.lightningapp_instance_service_list_lightningapp_instances( + project_id=project.project_id + ) lightningapp_names = [lightningapp.name for lightningapp in list_lightningapps.lightningapps] diff --git a/src/lightning_app/utilities/cluster_logs.py b/src/lightning_app/utilities/cluster_logs.py index 76eb45df8ab43..a2241f9f63a25 100644 --- a/src/lightning_app/utilities/cluster_logs.py +++ b/src/lightning_app/utilities/cluster_logs.py @@ -4,7 +4,6 @@ from threading import Thread from typing import Callable, Iterator, Optional -import arrow import dateutil.parser from websocket import WebSocketApp @@ -64,8 +63,8 @@ def callback(ws_app: WebSocketApp, msg: str): def _cluster_logs_reader( client: LightningClient, cluster_id: str, - start: arrow.Arrow, - end: arrow.Arrow, + start: int, + end: int, limit: int, follow: bool, on_error_callback: Optional[Callable] = None, diff --git a/src/lightning_app/utilities/commands/base.py b/src/lightning_app/utilities/commands/base.py index b036929d1a687..5138fc467dca1 100644 --- a/src/lightning_app/utilities/commands/base.py +++ b/src/lightning_app/utilities/commands/base.py @@ -95,7 +95,9 @@ def _download_command( if not os.path.exists(target_file): client = LightningClient() project_id = _get_project(client).project_id - response = client.lightningapp_instance_service_list_lightningapp_instance_artifacts(project_id, app_id) + response = client.lightningapp_instance_service_list_lightningapp_instance_artifacts( + project_id=project_id, id=app_id + ) for artifact in response.artifacts: if f"commands/{command_name}.py" == artifact.filename: resp = requests.get(artifact.url, allow_redirects=True) diff --git a/tests/tests_app/cli/test_cloud_cli.py b/tests/tests_app/cli/test_cloud_cli.py index fc50a49365dd4..c6a562c8143ba 100644 --- a/tests/tests_app/cli/test_cloud_cli.py +++ b/tests/tests_app/cli/test_cloud_cli.py @@ -112,7 +112,7 @@ def lightningapp_v2_service_create_lightningapp_release(self, project_id, app_id assert project_id == "test-project-id" return self.create_response - def lightningapp_v2_service_create_lightningapp_release_instance(self, project_id, app_id, release_id, body): + def lightningapp_v2_service_create_lightningapp_release_instance(self, project_id, app_id, id, body): assert project_id == "test-project-id" return self.create_response diff --git a/tests/tests_app/runners/test_cloud.py b/tests/tests_app/runners/test_cloud.py index 633a91359bbb3..fcf53d92b7f00 100644 --- a/tests/tests_app/runners/test_cloud.py +++ b/tests/tests_app/runners/test_cloud.py @@ -100,10 +100,11 @@ def test_run_on_byoc_cluster(self, monkeypatch): dependency_cache_key=mock.ANY, ) 
cloud_runtime.backend.client.lightningapp_v2_service_create_lightningapp_release.assert_called_once_with( - "default-project-id", mock.ANY, body + project_id="default-project-id", app_id=mock.ANY, body=body ) cloud_runtime.backend.client.projects_service_create_project_cluster_binding.assert_called_once_with( - "default-project-id", body=V1ProjectClusterBinding(cluster_id="test1234", project_id="default-project-id") + project_id="default-project-id", + body=V1ProjectClusterBinding(cluster_id="test1234", project_id="default-project-id"), ) @mock.patch("lightning_app.runners.backends.cloud.LightningClient", mock.MagicMock()) @@ -141,7 +142,7 @@ def test_requirements_file(self, monkeypatch): dependency_cache_key=mock.ANY, ) cloud_runtime.backend.client.lightningapp_v2_service_create_lightningapp_release.assert_called_once_with( - "test-project-id", mock.ANY, body + project_id="test-project-id", app_id=mock.ANY, body=body ) # with requirements file @@ -156,7 +157,7 @@ def test_requirements_file(self, monkeypatch): ), ) cloud_runtime.backend.client.lightningapp_v2_service_create_lightningapp_release.assert_called_with( - "test-project-id", mock.ANY, body + project_id="test-project-id", app_id=mock.ANY, body=body ) @mock.patch("lightning_app.runners.backends.cloud.LightningClient", mock.MagicMock()) @@ -190,7 +191,7 @@ def test_no_cache(self, monkeypatch): args, kwargs, ) = cloud_runtime.backend.client.lightningapp_v2_service_create_lightningapp_release.mock_calls[0] - body = args[2] + body = kwargs["body"] assert body.dependency_cache_key == "dummy-hash" # testing with no-cache True @@ -202,7 +203,7 @@ def test_no_cache(self, monkeypatch): args, kwargs, ) = cloud_runtime.backend.client.lightningapp_v2_service_create_lightningapp_release.mock_calls[0] - body = args[2] + body = kwargs["body"] assert body.dependency_cache_key is None @mock.patch("lightning_app.runners.backends.cloud.LightningClient", mock.MagicMock()) @@ -288,7 +289,7 @@ def test_call_with_work_app(self, lightningapps, monkeypatch, tmpdir): ], ) mock_client.lightningapp_v2_service_create_lightningapp_release.assert_called_once_with( - "test-project-id", mock.ANY, expected_body + project_id="test-project-id", app_id=mock.ANY, body=expected_body ) # running dispatch with disabled dependency cache @@ -297,11 +298,11 @@ def test_call_with_work_app(self, lightningapps, monkeypatch, tmpdir): expected_body.dependency_cache_key = None cloud_runtime.dispatch() mock_client.lightningapp_v2_service_create_lightningapp_release.assert_called_once_with( - "test-project-id", mock.ANY, expected_body + project_id="test-project-id", app_id=mock.ANY, body=expected_body ) else: mock_client.lightningapp_v2_service_create_lightningapp_release_instance.assert_called_once_with( - "test-project-id", mock.ANY, mock.ANY, mock.ANY + project_id="test-project-id", app_id=mock.ANY, id=mock.ANY, body=mock.ANY ) @mock.patch("lightning_app.runners.backends.cloud.LightningClient", mock.MagicMock()) @@ -415,7 +416,7 @@ def test_call_with_work_app_and_attached_drives(self, lightningapps, monkeypatch ], ) mock_client.lightningapp_v2_service_create_lightningapp_release.assert_called_once_with( - "test-project-id", mock.ANY, expected_body + project_id="test-project-id", app_id=mock.ANY, body=expected_body ) # running dispatch with disabled dependency cache @@ -424,11 +425,11 @@ def test_call_with_work_app_and_attached_drives(self, lightningapps, monkeypatch expected_body.dependency_cache_key = None cloud_runtime.dispatch() 
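# Standalone illustration (plain unittest.mock, not Lightning code) of why the
# assertions in these tests switch from args[2] to kwargs["body"]: once the
# client is called with keyword arguments only, the payload lives in the call's
# kwargs rather than its positional args. The method name below is arbitrary,
# since MagicMock accepts any attribute.
from unittest import mock

client = mock.MagicMock()
client.create_release(project_id="p", app_id="a", body={"name": "demo"})

args, kwargs = client.create_release.call_args
assert args == ()  # nothing is passed positionally any more
assert kwargs["body"] == {"name": "demo"}
client.create_release.assert_called_once_with(project_id="p", app_id="a", body={"name": "demo"})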
mock_client.lightningapp_v2_service_create_lightningapp_release.assert_called_once_with( - "test-project-id", mock.ANY, expected_body + project_id="test-project-id", app_id=mock.ANY, body=expected_body ) else: mock_client.lightningapp_v2_service_create_lightningapp_release_instance.assert_called_once_with( - "test-project-id", mock.ANY, mock.ANY, mock.ANY + project_id="test-project-id", app_id=mock.ANY, id=mock.ANY, body=mock.ANY ) @mock.patch("lightning_app.runners.backends.cloud.LightningClient", mock.MagicMock()) @@ -610,12 +611,12 @@ def test_call_with_work_app_and_multiple_attached_drives(self, lightningapps, mo expected_body = expected_body_option_1 try: mock_client.lightningapp_v2_service_create_lightningapp_release.assert_called_once_with( - "test-project-id", mock.ANY, expected_body + project_id="test-project-id", app_id=mock.ANY, body=expected_body ) except Exception: expected_body = expected_body_option_2 mock_client.lightningapp_v2_service_create_lightningapp_release.assert_called_once_with( - "test-project-id", mock.ANY, expected_body + project_id="test-project-id", app_id=mock.ANY, body=expected_body ) # running dispatch with disabled dependency cache @@ -624,11 +625,11 @@ def test_call_with_work_app_and_multiple_attached_drives(self, lightningapps, mo expected_body.dependency_cache_key = None cloud_runtime.dispatch() mock_client.lightningapp_v2_service_create_lightningapp_release.assert_called_once_with( - "test-project-id", mock.ANY, expected_body + project_id="test-project-id", app_id=mock.ANY, body=expected_body ) else: mock_client.lightningapp_v2_service_create_lightningapp_release_instance.assert_called_once_with( - "test-project-id", mock.ANY, mock.ANY, mock.ANY + project_id="test-project-id", app_id=mock.ANY, id=mock.ANY, body=mock.ANY ) diff --git a/tests/tests_app_examples/conftest.py b/tests/tests_app_examples/conftest.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 From 71dadb5307e82e06b0357e90f98939e8b15e54b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 14 Sep 2022 09:32:07 +0200 Subject: [PATCH 143/193] Fix TPU CI for non-forks (#14688) --- .github/workflows/ci-circleci.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci-circleci.yml b/.github/workflows/ci-circleci.yml index d1ff85e45e0dd..ec886d0d4a579 100644 --- a/.github/workflows/ci-circleci.yml +++ b/.github/workflows/ci-circleci.yml @@ -8,7 +8,10 @@ on: - "src/pytorch_lightning/**" - "tests/tests_pytorch/**" - "setup.cfg" # includes pytest config - pull_request_target: + # should use `pull_request_target` but it's blocked by + # https://github.com/CircleCI-Public/trigger-circleci-pipeline-action/issues/27 + # so this job will not run on forks until the above is fixed or we replace CircleCI for another provider + pull_request: branches: [master, "release/*"] paths: - ".github/workflows/ci-circleci.yml" @@ -23,9 +26,6 @@ jobs: trigger-circleci: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - with: - ref: ${{ github.event.pull_request.head.sha }} - uses: CircleCI-Public/trigger-circleci-pipeline-action@v1.0.5 env: CCI_TOKEN: ${{ secrets.CCI_TOKEN }} From 9b01a0fd32df99fc2ed7bf815e6314d139e69466 Mon Sep 17 00:00:00 2001 From: Pritam Soni <23050213+pritamsoni-hsr@users.noreply.github.com> Date: Wed, 14 Sep 2022 16:22:55 +0530 Subject: [PATCH 144/193] fix: e2e with short form after signup (#14689) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- 
src/lightning_app/testing/testing.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/lightning_app/testing/testing.py b/src/lightning_app/testing/testing.py index 3b9299b7df359..430c78662581b 100644 --- a/src/lightning_app/testing/testing.py +++ b/src/lightning_app/testing/testing.py @@ -315,6 +315,21 @@ def run_app_in_cloud( ) admin_page.goto(f"{Config.url}/{Config.username}/apps", timeout=60 * 1000) + # Closing the Complete your profile dialog + try: + dialog = admin_page.locator("text=Complete your profile") + dialog.wait_for(timeout=10 * 1000, state="visible") + print("'Complete your profile' dialog visible, closing it.") + admin_page.locator('input[name="firstName"]').fill("first") + admin_page.locator('input[name="lastName"]').fill("last") + admin_page.locator('input[name="email"]').fill("e2e.test.admin@lightning.ai") + admin_page.locator('input[name="organization"]').fill("Lightning AI") + button = admin_page.locator('button:has-text("Confirm")') + button.wait_for(timeout=3 * 1000) + button.click() + except playwright._impl._api_types.TimeoutError: + print("'Complete your profile' dialog not visible, skipping.") + # Closing the Create Project dialog. try: project_dialog = admin_page.locator("text=Create a project") From 8e9780bd5bd1275cc417dafe3b093f4968b2aaed Mon Sep 17 00:00:00 2001 From: Ritik Nandwal <48522685+nandwalritik@users.noreply.github.com> Date: Wed, 14 Sep 2022 16:51:57 +0530 Subject: [PATCH 145/193] fix mypy typing errors in pytorch_lightning.utilities.data.py (#13901) Co-authored-by: otaj --- pyproject.toml | 4 +- src/lightning_lite/utilities/data.py | 33 +++++++------- src/pytorch_lightning/strategies/ipu.py | 2 +- .../utilities/auto_restart.py | 2 +- src/pytorch_lightning/utilities/data.py | 44 +++++++------------ 5 files changed, 36 insertions(+), 49 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 777f86841adc3..dbf58177e67e4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,8 +53,6 @@ warn_no_return = "False" module = [ "pytorch_lightning.callbacks.progress.rich_progress", "pytorch_lightning.trainer.trainer", - "pytorch_lightning.tuner.batch_size_scaling", - "pytorch_lightning.utilities.data", - "lightning_lite.utilities.data", + "pytorch_lightning.tuner.batch_size_scaling" ] ignore_errors = "True" diff --git a/src/lightning_lite/utilities/data.py b/src/lightning_lite/utilities/data.py index cdaf806a0c48d..ca50344567b8e 100644 --- a/src/lightning_lite/utilities/data.py +++ b/src/lightning_lite/utilities/data.py @@ -21,7 +21,7 @@ from typing import Any, Callable, Dict, Generator, Iterable, Optional, Tuple, Type, Union from lightning_utilities.core.inheritance import get_all_subclasses -from torch.utils.data import BatchSampler, DataLoader, IterableDataset, Sampler +from torch.utils.data import BatchSampler, DataLoader, Dataset, IterableDataset, Sampler from lightning_lite.utilities.enums import LightningEnum from lightning_lite.utilities.exceptions import MisconfigurationException @@ -33,7 +33,8 @@ class _WrapAttrTag(LightningEnum): SET = "set" DEL = "del" - def __call__(self, *args): + def __call__(self, *args: Any) -> None: + fn: Union[Callable[[object, str], None], Callable[[object, str, Any], None]] if self == self.SET: fn = setattr else: @@ -45,12 +46,12 @@ def has_iterable_dataset(dataloader: DataLoader) -> bool: return hasattr(dataloader, "dataset") and isinstance(dataloader.dataset, IterableDataset) -def has_len(dataloader: Union[DataLoader, Iterable]) -> bool: +def has_len(dataloader: Union[DataLoader, 
Iterable, Dataset]) -> bool: """Checks if a given Dataloader has ``__len__`` method implemented i.e. if it is a finite dataloader or infinite dataloader.""" try: # try getting the length - if len(dataloader) == 0: + if len(dataloader) == 0: # type: ignore [arg-type] rank_zero_warn( f"`{dataloader.__class__.__name__}` returned 0 length. Please make sure this was your intention." ) @@ -58,7 +59,7 @@ def has_len(dataloader: Union[DataLoader, Iterable]) -> bool: except (TypeError, NotImplementedError): has_len = False - if has_len and has_iterable_dataset(dataloader): + if has_len and isinstance(dataloader, DataLoader) and has_iterable_dataset(dataloader): rank_zero_warn( "Your `IterableDataset` has `__len__` defined." " In combination with multi-process data loading (when num_workers > 1)," @@ -76,7 +77,7 @@ def _update_dataloader(dataloader: DataLoader, sampler: Union[Sampler, Iterable] def _get_dataloader_init_args_and_kwargs( dataloader: DataLoader, - sampler: Optional[Sampler], + sampler: Union[Sampler, Iterable], disallow_batch_sampler: bool = False, ) -> Tuple[Tuple[Any], Dict[str, Any]]: if not isinstance(dataloader, DataLoader): @@ -99,7 +100,7 @@ def _get_dataloader_init_args_and_kwargs( arg_names = () # get the dataloader instance `__init__` parameters - params = dict(inspect.signature(dataloader.__init__).parameters) + params = dict(inspect.signature(dataloader.__init__).parameters) # type: ignore[misc] has_variadic_kwargs = any(p.kind is p.VAR_KEYWORD for p in params.values()) if has_variadic_kwargs: # if the signature takes **kwargs, assume they will be passed down with `super().__init__(**kwargs)` @@ -141,14 +142,14 @@ def _get_dataloader_init_args_and_kwargs( } # the dataloader has required args which we could not extract from the existing attributes if required_args: - required_args = sorted(required_args) + sorted_required_args = sorted(required_args) dataloader_cls_name = dataloader.__class__.__name__ - missing_args_message = ", ".join(f"`self.{arg_name}`" for arg_name in required_args) + missing_args_message = ", ".join(f"`self.{arg_name}`" for arg_name in sorted_required_args) raise MisconfigurationException( f"Trying to inject custom `Sampler` into the `{dataloader_cls_name}` instance. " "This would fail as some of the `__init__` arguments are not available as instance attributes. " - f"The missing attributes are {required_args}. If you instantiate your `{dataloader_cls_name}` inside a " - "`*_dataloader` hook of your module, we will do this for you." + f"The missing attributes are {sorted_required_args}. If you instantiate your `{dataloader_cls_name}` " + "inside a `*_dataloader` hook of your module, we will do this for you." f" Otherwise, define {missing_args_message} inside your `__init__`." ) @@ -156,13 +157,13 @@ def _get_dataloader_init_args_and_kwargs( # the dataloader signature does not allow keyword arguments that need to be passed missing_kwargs = (set(dl_kwargs) | set(arg_names)) - params.keys() if missing_kwargs: - missing_kwargs = sorted(missing_kwargs) + sorted_missing_kwargs = sorted(missing_kwargs) dataloader_cls_name = dataloader.__class__.__name__ raise TypeError( f"Trying to inject parameters into the `{dataloader_cls_name}` instance. " "This would fail as it doesn't expose all its attributes in the `__init__` signature. " - f"The missing arguments are {missing_kwargs}. HINT: If you wrote the `{dataloader_cls_name}` class, " - "add the `__init__` arguments or allow passing `**kwargs`" + f"The missing arguments are {sorted_missing_kwargs}. 
HINT: If you wrote the `{dataloader_cls_name}` " + "class, add the `__init__` arguments or allow passing `**kwargs`" ) return dl_args, dl_kwargs @@ -170,7 +171,7 @@ def _get_dataloader_init_args_and_kwargs( def _dataloader_init_kwargs_resolve_sampler( dataloader: DataLoader, - sampler: Optional[Sampler], + sampler: Union[Sampler, Iterable], disallow_batch_sampler: bool = False, ) -> Dict[str, Any]: """This function is used to handle the sampler, batch_sampler arguments associated within a DataLoader for its @@ -334,7 +335,7 @@ def _wrap_attr_method(method: Callable, tag: _WrapAttrTag) -> Callable: :class:`~torch.utils.data.BatchSampler`) in order to enable re-instantiation of custom subclasses.""" @functools.wraps(method) - def wrapper(obj: Any, *args: Any): + def wrapper(obj: Any, *args: Any) -> None: # First, let's find out if we're the first in inheritance chain calling the patched method. name, *_ = args prev_call_name, prev_call_method = getattr(obj, "__pl_current_call", (None, "method")) diff --git a/src/pytorch_lightning/strategies/ipu.py b/src/pytorch_lightning/strategies/ipu.py index 64898e6c76251..966789a07feaa 100644 --- a/src/pytorch_lightning/strategies/ipu.py +++ b/src/pytorch_lightning/strategies/ipu.py @@ -245,7 +245,7 @@ def _convert_to_poptorch_loader( return dataloader dl_args, dl_kwargs = _get_dataloader_init_args_and_kwargs( - dataloader, sampler, mode, self.replication_factor > 1 # type: ignore[arg-type] + dataloader, sampler, mode, self.replication_factor > 1 ) opts = self.training_opts if mode == RunningStage.TRAINING else self.inference_opts dataloader = _reinstantiate_wrapped_cls( diff --git a/src/pytorch_lightning/utilities/auto_restart.py b/src/pytorch_lightning/utilities/auto_restart.py index d9d8c5da38858..34033b898f3be 100644 --- a/src/pytorch_lightning/utilities/auto_restart.py +++ b/src/pytorch_lightning/utilities/auto_restart.py @@ -62,7 +62,7 @@ class FastForwardSampler(Sampler): samples seen in the last iterations (for the current worker). 
""" - def __init__(self, sampler: Iterator, attr_name: Optional[str] = None) -> None: + def __init__(self, sampler: Union[Sampler, Iterable], attr_name: Optional[str] = None) -> None: super().__init__(data_source=None) self._sampler = sampler self.restarting: bool = False diff --git a/src/pytorch_lightning/utilities/data.py b/src/pytorch_lightning/utilities/data.py index cf07949461f05..17f8b9f101cdd 100644 --- a/src/pytorch_lightning/utilities/data.py +++ b/src/pytorch_lightning/utilities/data.py @@ -30,7 +30,6 @@ ) import pytorch_lightning as pl -from lightning_lite.utilities import LightningEnum from lightning_lite.utilities.data import _reinstantiate_wrapped_cls, _replace_value_in_saved_args from lightning_lite.utilities.data import has_iterable_dataset as new_has_iterable_dataset from lightning_lite.utilities.data import has_len as new_has_len @@ -41,24 +40,13 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_warn -BType = Union[Tensor, str, Mapping[Any, "BType"], Iterable["BType"]] +# might be supported in later releases, see https://github.com/python/mypy/pull/13297 +BType = Union[Tensor, str, Mapping[Any, "BType"], Iterable["BType"]] # type: ignore[misc] warning_cache = WarningCache() -class _WrapAttrTag(LightningEnum): - SET = "set" - DEL = "del" - - def __call__(self, *args): - if self == self.SET: - fn = setattr - else: - fn = delattr - return fn(*args) - - -def _extract_batch_size(batch: BType) -> Generator[int, None, None]: +def _extract_batch_size(batch: BType) -> Generator[Optional[int], None, None]: if isinstance(batch, Tensor): if batch.ndim == 0: yield 1 @@ -109,7 +97,7 @@ def extract_batch_size(batch: BType) -> int: def has_len_all_ranks( dataloader: DataLoader, - strategy: "pl.Strategy", + strategy: "pl.strategies.Strategy", model: Union["pl.LightningModule", "pl.LightningDataModule"], ) -> bool: """Checks if a given Dataloader has ``__len__`` method implemented i.e. if it is a finite dataloader or @@ -151,14 +139,14 @@ def has_len_all_ranks( return has_len -def get_len(dataloader: DataLoader) -> Union[int, float]: +def get_len(dataloader: Union[DataLoader, Dataset]) -> Union[int, float]: """Return the length of the given DataLoader. If ``__len__`` method is not implemented, return float('inf'). 
""" if new_has_len(dataloader): - return len(dataloader) + return len(dataloader) # type: ignore [arg-type] return float("inf") @@ -173,7 +161,7 @@ def _update_dataloader( def _get_dataloader_init_args_and_kwargs( dataloader: DataLoader, - sampler: Optional[Sampler], + sampler: Union[Sampler, Iterable], mode: Optional[RunningStage] = None, disallow_batch_sampler: bool = False, ) -> Tuple[Tuple[Any], Dict[str, Any]]: @@ -197,7 +185,7 @@ def _get_dataloader_init_args_and_kwargs( arg_names = () # get the dataloader instance `__init__` parameters - params = dict(inspect.signature(dataloader.__init__).parameters) + params = dict(inspect.signature(dataloader.__init__).parameters) # type: ignore[misc] has_variadic_kwargs = any(p.kind is p.VAR_KEYWORD for p in params.values()) if has_variadic_kwargs: # if the signature takes **kwargs, assume they will be passed down with `super().__init__(**kwargs)` @@ -239,14 +227,14 @@ def _get_dataloader_init_args_and_kwargs( } # the dataloader has required args which we could not extract from the existing attributes if required_args: - required_args = sorted(required_args) + sorted_required_args = sorted(required_args) dataloader_cls_name = dataloader.__class__.__name__ - missing_args_message = ", ".join(f"`self.{arg_name}`" for arg_name in required_args) + missing_args_message = ", ".join(f"`self.{arg_name}`" for arg_name in sorted_required_args) raise MisconfigurationException( f"Trying to inject custom `Sampler` into the `{dataloader_cls_name}` instance. " "This would fail as some of the `__init__` arguments are not available as instance attributes. " - f"The missing attributes are {required_args}. If you instantiate your `{dataloader_cls_name}` inside a " - "`*_dataloader` hook of your module, we will do this for you." + f"The missing attributes are {sorted_required_args}. If you instantiate your `{dataloader_cls_name}` " + "inside a `*_dataloader` hook of your module, we will do this for you." f" Otherwise, define {missing_args_message} inside your `__init__`." ) @@ -254,13 +242,13 @@ def _get_dataloader_init_args_and_kwargs( # the dataloader signature does not allow keyword arguments that need to be passed missing_kwargs = (set(dl_kwargs) | set(arg_names)) - params.keys() if missing_kwargs: - missing_kwargs = sorted(missing_kwargs) + sorted_missing_kwargs = sorted(missing_kwargs) dataloader_cls_name = dataloader.__class__.__name__ raise MisconfigurationException( f"Trying to inject parameters into the `{dataloader_cls_name}` instance. " "This would fail as it doesn't expose all its attributes in the `__init__` signature. " - f"The missing arguments are {missing_kwargs}. HINT: If you wrote the `{dataloader_cls_name}` class, " - "add the `__init__` arguments or allow passing `**kwargs`" + f"The missing arguments are {sorted_missing_kwargs}. 
HINT: If you wrote the `{dataloader_cls_name}` " + "class, add the `__init__` arguments or allow passing `**kwargs`" ) if _FaultTolerantMode.detect_current_mode().is_automatic: @@ -273,7 +261,7 @@ def _get_dataloader_init_args_and_kwargs( def _dataloader_init_kwargs_resolve_sampler( dataloader: DataLoader, - sampler: Optional[Sampler], + sampler: Union[Sampler, Iterable], mode: Optional[RunningStage] = None, disallow_batch_sampler: bool = False, ) -> Dict[str, Any]: From 6333caabb0432be40c7e9278de6be795ddc012de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 14 Sep 2022 15:15:21 +0200 Subject: [PATCH 146/193] Standalone Lite: Strategy base classes and registry (#14662) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add accelerator implementations to lite * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix imports * rename registry argument * fix test * fix tests * remove duplicated test * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix tests * deprecation * deprecations * flake8 * fixes * add mps to runif * fix tests * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Apply suggestions from code review Co-authored-by: Carlos Mocholí * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove more * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * local import * undo device stats :( * fix import * stupid typehints * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * more refactors :( * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix * rename init_device to setup_device * remove unused import * make uppercase to differentiate from class * trick test after moving import locally * add base classes and registry * reg * registry * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * tests * update to other branches * resolve todo(lite) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add very basic unit tests * fix name assignment * Update src/lightning_lite/strategies/parallel.py Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> * remove deprecated property * remove pre- and post backward for now * protecting the registry utility function * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove unused import Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Carlos Mocholí Co-authored-by: Jirka Borovec Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> --- src/lightning_lite/strategies/__init__.py | 21 ++ .../strategies/launchers/multiprocessing.py | 4 +- .../strategies/launchers/subprocess_script.py | 4 +- src/lightning_lite/strategies/parallel.py | 112 +++++++ .../strategies/registry.py} | 14 +- .../strategies/single_device.py | 78 +++++ src/lightning_lite/strategies/strategy.py | 292 ++++++++++++++++++ src/pytorch_lightning/strategies/__init__.py | 9 +- src/pytorch_lightning/strategies/utils.py | 13 + tests/tests_lite/strategies/test_registry.py | 44 +++ 
.../strategies/test_single_device.py | 54 ++++ ..._strategy_registry.py => test_registry.py} | 24 -- 12 files changed, 628 insertions(+), 41 deletions(-) create mode 100644 src/lightning_lite/strategies/parallel.py rename src/{pytorch_lightning/strategies/strategy_registry.py => lightning_lite/strategies/registry.py} (89%) create mode 100644 src/lightning_lite/strategies/single_device.py create mode 100644 src/lightning_lite/strategies/strategy.py create mode 100644 tests/tests_lite/strategies/test_registry.py create mode 100644 tests/tests_lite/strategies/test_single_device.py rename tests/tests_pytorch/strategies/{test_strategy_registry.py => test_registry.py} (85%) diff --git a/src/lightning_lite/strategies/__init__.py b/src/lightning_lite/strategies/__init__.py index e69de29bb2d1d..b76af7a22df17 100644 --- a/src/lightning_lite/strategies/__init__.py +++ b/src/lightning_lite/strategies/__init__.py @@ -0,0 +1,21 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from lightning_lite.strategies.parallel import ParallelStrategy # noqa: F401 +from lightning_lite.strategies.registry import _call_register_strategies, _StrategyRegistry +from lightning_lite.strategies.single_device import SingleDeviceStrategy # noqa: F401 +from lightning_lite.strategies.strategy import Strategy # noqa: F401 + +STRATEGY_REGISTRY = _StrategyRegistry() +_STRATEGIES_BASE_MODULE = "lightning_lite.strategies" +_call_register_strategies(STRATEGY_REGISTRY, _STRATEGIES_BASE_MODULE) diff --git a/src/lightning_lite/strategies/launchers/multiprocessing.py b/src/lightning_lite/strategies/launchers/multiprocessing.py index fc6dd5025fdf5..ca47efe030302 100644 --- a/src/lightning_lite/strategies/launchers/multiprocessing.py +++ b/src/lightning_lite/strategies/launchers/multiprocessing.py @@ -22,6 +22,7 @@ from typing_extensions import Literal from lightning_lite.strategies.launchers.base import _Launcher +from lightning_lite.strategies.strategy import Strategy from lightning_lite.utilities.apply_func import move_data_to_device from lightning_lite.utilities.imports import _TORCH_GREATER_EQUAL_1_11 from lightning_lite.utilities.seed import _collect_rng_states, _set_rng_states @@ -52,8 +53,7 @@ class _MultiProcessingLauncher(_Launcher): def __init__( self, - # TODO(lite): Fix this type annotation once the strategy base class gets added to Lite - strategy: "Strategy", # type: ignore[name-defined] # noqa: F821 + strategy: "Strategy", start_method: Literal["spawn", "fork", "forkserver"] = "spawn", ) -> None: self._strategy = strategy diff --git a/src/lightning_lite/strategies/launchers/subprocess_script.py b/src/lightning_lite/strategies/launchers/subprocess_script.py index 7f814e01e2b71..b2da01835d9ec 100644 --- a/src/lightning_lite/strategies/launchers/subprocess_script.py +++ b/src/lightning_lite/strategies/launchers/subprocess_script.py @@ -21,6 +21,7 @@ import numpy as np from lightning_utilities.core.imports import RequirementCache +from 
lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment from lightning_lite.strategies.launchers.base import _Launcher _HYDRA_AVAILABLE = RequirementCache("hydra") @@ -66,8 +67,7 @@ class _SubprocessScriptLauncher(_Launcher): def __init__( self, - # TODO(lite): Update type annotation once ClusterEnvironment has moved to Lite - cluster_environment: "ClusterEnvironment", # type: ignore[name-defined] # noqa: F821 + cluster_environment: "ClusterEnvironment", num_processes: int, num_nodes: int, ) -> None: diff --git a/src/lightning_lite/strategies/parallel.py b/src/lightning_lite/strategies/parallel.py new file mode 100644 index 0000000000000..455c32d5e8d4a --- /dev/null +++ b/src/lightning_lite/strategies/parallel.py @@ -0,0 +1,112 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from abc import ABC, abstractmethod +from contextlib import contextmanager +from typing import Any, Dict, Generator, List, Optional + +import torch +from torch import Tensor +from torch.nn import Module + +import lightning_lite as lite +from lightning_lite.accelerators.accelerator import Accelerator +from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment +from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO +from lightning_lite.plugins.precision import Precision +from lightning_lite.strategies.strategy import Strategy +from lightning_lite.utilities.distributed import all_gather_ddp_if_available, ReduceOp + + +class ParallelStrategy(Strategy, ABC): + """Strategy for training with multiple processes in parallel.""" + + def __init__( + self, + accelerator: Optional[Accelerator] = None, + parallel_devices: Optional[List[torch.device]] = None, + cluster_environment: Optional[ClusterEnvironment] = None, + checkpoint_io: Optional[CheckpointIO] = None, + precision_plugin: Optional[Precision] = None, + ): + super().__init__(accelerator=accelerator, checkpoint_io=checkpoint_io, precision_plugin=precision_plugin) + self.parallel_devices = parallel_devices + self.cluster_environment = cluster_environment + + @property + @abstractmethod + def root_device(self) -> torch.device: + """Return the root device.""" + + @property + def global_rank(self) -> int: + return self.cluster_environment.global_rank() if self.cluster_environment is not None else 0 + + @property + def local_rank(self) -> int: + return self.cluster_environment.local_rank() if self.cluster_environment is not None else 0 + + @property + def node_rank(self) -> int: + return self.cluster_environment.node_rank() if self.cluster_environment is not None else 0 + + @property + def world_size(self) -> int: + return self.cluster_environment.world_size() if self.cluster_environment is not None else 1 + + @property + def is_global_zero(self) -> bool: + return self.global_rank == 0 + + @property + def parallel_devices(self) -> Optional[List[torch.device]]: + return self._parallel_devices + + @parallel_devices.setter + def parallel_devices(self, parallel_devices: 
Optional[List[torch.device]]) -> None: + self._parallel_devices = parallel_devices + + @property + def distributed_sampler_kwargs(self) -> Dict[str, Any]: + distributed_sampler_kwargs = dict( + num_replicas=len(self.parallel_devices) if self.parallel_devices is not None else 0, rank=self.global_rank + ) + return distributed_sampler_kwargs + + def all_gather(self, tensor: Tensor, group: Optional[Any] = None, sync_grads: bool = False) -> Tensor: + """Perform a all_gather on all processes.""" + return all_gather_ddp_if_available(tensor, group=group, sync_grads=sync_grads) + + def reduce_boolean_decision(self, decision: bool) -> bool: + decision = torch.tensor(int(decision), device=self.root_device) + decision = self.reduce(decision, reduce_op=ReduceOp.SUM) + decision = bool(decision == self.world_size) + return decision + + @contextmanager + def block_backward_sync(self, module: Module) -> Generator: + """Blocks ddp sync gradients behaviour on backwards pass. + + This is useful for skipping sync when accumulating gradients, reducing communication overhead + Returns: context manager with sync behaviour off + """ + if isinstance(module, lite.utilities.types.DistributedDataParallel): + with module.no_sync(): + yield None + else: + yield None + + def teardown(self) -> None: + assert self.cluster_environment is not None + self.cluster_environment.teardown() + return super().teardown() diff --git a/src/pytorch_lightning/strategies/strategy_registry.py b/src/lightning_lite/strategies/registry.py similarity index 89% rename from src/pytorch_lightning/strategies/strategy_registry.py rename to src/lightning_lite/strategies/registry.py index 43089b735aca0..4b35e82e3a814 100644 --- a/src/pytorch_lightning/strategies/strategy_registry.py +++ b/src/lightning_lite/strategies/registry.py @@ -15,9 +15,8 @@ from inspect import getmembers, isclass from typing import Any, Callable, Dict, List, Optional +from lightning_lite.strategies.strategy import Strategy from lightning_lite.utilities.registry import _is_register_method_overridden -from pytorch_lightning.strategies.strategy import Strategy -from pytorch_lightning.utilities.exceptions import MisconfigurationException class _StrategyRegistry(dict): @@ -65,7 +64,7 @@ def register( raise TypeError(f"`name` must be a str, found {name}") if name in self and not override: - raise MisconfigurationException(f"'{name}' is already present in the registry. HINT: Use `override=True`.") + raise ValueError(f"'{name}' is already present in the registry. 
HINT: Use `override=True`.") data: Dict[str, Any] = {} data["description"] = description if description is not None else "" @@ -74,7 +73,7 @@ def register( def do_register(strategy: Callable) -> Callable: data["strategy"] = strategy - data["strategy_name"] = strategy.strategy_name + data["strategy_name"] = name self[name] = data return strategy @@ -112,11 +111,8 @@ def __str__(self) -> str: return "Registered Strategies: {}".format(", ".join(self.keys())) -StrategyRegistry = _StrategyRegistry() - - -def call_register_strategies(base_module: str) -> None: +def _call_register_strategies(registry: _StrategyRegistry, base_module: str) -> None: module = importlib.import_module(base_module) for _, mod in getmembers(module, isclass): if issubclass(mod, Strategy) and _is_register_method_overridden(mod, Strategy, "register_strategies"): - mod.register_strategies(StrategyRegistry) + mod.register_strategies(registry) diff --git a/src/lightning_lite/strategies/single_device.py b/src/lightning_lite/strategies/single_device.py new file mode 100644 index 0000000000000..aa851ca419b22 --- /dev/null +++ b/src/lightning_lite/strategies/single_device.py @@ -0,0 +1,78 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +from typing import Any + +import torch +from torch import Tensor +from torch.nn import Module + +from lightning_lite.accelerators import Accelerator +from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO +from lightning_lite.plugins.precision import Precision +from lightning_lite.strategies.strategy import Strategy, TBroadcast +from lightning_lite.utilities.types import _DEVICE + + +class SingleDeviceStrategy(Strategy): + """Strategy that handles communication on a single device.""" + + def __init__( + self, + device: _DEVICE = "cpu", + accelerator: Accelerator | None = None, + checkpoint_io: CheckpointIO | None = None, + precision_plugin: Precision | None = None, + ): + super().__init__(accelerator=accelerator, checkpoint_io=checkpoint_io, precision_plugin=precision_plugin) + self._root_device = torch.device(device) + self.global_rank = 0 + self.local_rank = 0 + self.world_size = 1 + + @property + def root_device(self) -> torch.device: + return self._root_device + + @property + def is_global_zero(self) -> bool: + return True + + def module_to_device(self, module: Module) -> None: + module.to(self.root_device) + + def reduce(self, tensor: Any | Tensor, *args: Any, **kwargs: Any) -> Any | Tensor: + """Reduces a tensor from several distributed processes to one aggregated tensor. As this plugin only + operates with a single device, the reduction is simply the identity. 
+ + Args: + tensor: the tensor to sync and reduce + *args: ignored + **kwargs: ignored + + Return: + the unmodified input as reduction is not needed for single process operation + """ + return tensor + + def all_gather(self, tensor: Tensor, group: Any | None = None, sync_grads: bool = False) -> Tensor: + """Perform a all_gather on all processes.""" + return tensor + + def barrier(self, *args: Any, **kwargs: Any) -> None: + pass + + def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast: + return obj diff --git a/src/lightning_lite/strategies/strategy.py b/src/lightning_lite/strategies/strategy.py new file mode 100644 index 0000000000000..e761718065b58 --- /dev/null +++ b/src/lightning_lite/strategies/strategy.py @@ -0,0 +1,292 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import contextlib +import logging +from abc import ABC, abstractmethod +from typing import Any, Dict, Generator, Iterable, List, Mapping, Optional, Tuple, TypeVar, Union + +import torch +from torch import Tensor +from torch.nn import Module +from torch.optim import Optimizer +from torch.utils.data import DataLoader + +from lightning_lite.accelerators import Accelerator +from lightning_lite.plugins import TorchCheckpointIO +from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO +from lightning_lite.plugins.precision import Precision +from lightning_lite.strategies.launchers.base import _Launcher +from lightning_lite.utilities.apply_func import move_data_to_device +from lightning_lite.utilities.distributed import ReduceOp +from lightning_lite.utilities.optimizer import optimizer_to_device +from lightning_lite.utilities.types import _PATH + +TBroadcast = TypeVar("TBroadcast") +TReduce = TypeVar("TReduce") + +log = logging.getLogger(__name__) + + +class Strategy(ABC): + """Base class for all strategies that change the behaviour of the training, validation and test- loop.""" + + def __init__( + self, + accelerator: Optional[Accelerator] = None, + checkpoint_io: Optional[CheckpointIO] = None, + precision_plugin: Optional[Precision] = None, + ) -> None: + self._accelerator: Optional[Accelerator] = accelerator + self._checkpoint_io: Optional[CheckpointIO] = checkpoint_io + self._precision_plugin: Optional[Precision] = precision_plugin + self._launcher: Optional[_Launcher] = None + + @property + @abstractmethod + def root_device(self) -> torch.device: + """Returns the root device.""" + + @property + @abstractmethod + def is_global_zero(self) -> bool: + """Whether the current process is the rank zero process not only on the local node, but for all nodes.""" + + @property + def launcher(self) -> Optional[_Launcher]: + return self._launcher + + @property + def accelerator(self) -> Optional[Accelerator]: + return self._accelerator + + @accelerator.setter + def accelerator(self, accelerator: Accelerator) -> None: + self._accelerator = accelerator + + @property + def checkpoint_io(self) -> CheckpointIO: + if self._checkpoint_io is None: + self._checkpoint_io = 
TorchCheckpointIO() + return self._checkpoint_io + + @checkpoint_io.setter + def checkpoint_io(self, io: Optional[CheckpointIO]) -> None: + self._checkpoint_io = io + + @property + def precision_plugin(self) -> Precision: + return self._precision_plugin if self._precision_plugin is not None else Precision() + + @precision_plugin.setter + def precision_plugin(self, precision_plugin: Optional[Precision]) -> None: + self._precision_plugin = precision_plugin + + def _configure_launcher(self) -> None: + """Attach the launcher based on Strategy.""" + + def setup_environment(self) -> None: + """Setup any processes or distributed connections. + + This must be called by the framework at the beginning of every process, before any distributed communication + takes place. + """ + assert self.accelerator is not None + self.accelerator.setup_device(self.root_device) + + def process_dataloader(self, dataloader: DataLoader) -> DataLoader: + """Wraps the dataloader if necessary. + + Args: + dataloader: iterable. Ideally of type: :class:`torch.utils.data.DataLoader` + """ + return dataloader + + def setup_module_and_optimizers( + self, module: Module, optimizers: List[Optimizer] + ) -> Tuple[Module, List[Optimizer]]: + """Set up a model and multiple optimizers together. + + The returned objects are expected to be in the same order they were passed in. The default implementation will + call :meth:`_setup_model` and :meth:`_setup_optimizer` on the inputs. + """ + module = self.setup_module(module) + optimizers = [self.setup_optimizer(optimizer) for optimizer in optimizers] + return module, optimizers + + def setup_module(self, module: Module) -> Module: + """Performs setup for the model, e.g., by wrapping it by another class.""" + return module + + def setup_optimizer(self, optimizer: Optimizer) -> Optimizer: + """Performs setup for the optimizer, e.g., by wrapping it by another class.""" + return optimizer + + @abstractmethod + def module_to_device(self, module: Module) -> None: + """Moves the model to the correct device.""" + + def batch_to_device(self, batch: Any, device: Optional[torch.device] = None) -> Any: + """Moves the batch to the correct device. + + The returned batch is of the same type as the input batch, just + having all tensors on the correct device. + + Args: + batch: The batch of samples to move to the correct device + device: The target device + """ + device = device or self.root_device + return move_data_to_device(batch, device) + + @contextlib.contextmanager + def module_sharded_context(self) -> Generator: + """Provide hook to create modules in a distributed aware context. This is useful for when we'd like to + shard the model instantly, which is useful for extremely large models which can save memory and + initialization time. + + Returns: Model parallel context. + """ + yield + + def backward(self, tensor: Tensor, module: Optional[Module], *args: Any, **kwargs: Any) -> None: + r"""Forwards backward-calls to the precision plugin.""" + self.precision_plugin.pre_backward(tensor, module) + self.precision_plugin.backward(tensor, module, *args, **kwargs) + self.precision_plugin.post_backward(tensor, module) + + def optimizer_step( + self, + optimizer: Optimizer, + model: Optional[Module] = None, + **kwargs: Any, + ) -> Any: + """Performs the actual optimizer step. 
+ + Args: + optimizer: the optimizer performing the step + model: reference to the model, optionally defining optimizer step related hooks + **kwargs: Any extra arguments to ``optimizer.step`` + """ + return self.precision_plugin.optimizer_step(optimizer, model=model, **kwargs) + + @abstractmethod + def reduce( + self, + tensor: Union[Tensor, Any], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = "mean", + ) -> Union[Tensor, Any]: + """Reduces the given tensor (e.g. across GPUs/processes). + + Args: + tensor: the tensor to sync and reduce + group: the process group to reduce + reduce_op: the reduction operation. Defaults to 'mean'. + Can also be a string 'sum' or ReduceOp. + """ + + @abstractmethod + def barrier(self, name: Optional[str] = None) -> None: + """Synchronizes all processes which blocks processes until the whole group enters this function. + + Args: + name: an optional name to pass into barrier. + """ + + @abstractmethod + def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast: + """Broadcasts an object to all processes. + + Args: + obj: the object to broadcast + src: source rank + """ + + @abstractmethod + def all_gather(self, tensor: Tensor, group: Optional[Any] = None, sync_grads: bool = False) -> Tensor: + """Perform an all_gather on all processes. + + Args: + tensor: the tensor to all_gather + group: the process group to gather results from + sync_grads: flag that allows users to synchronize gradients for all_gather op + """ + + def reduce_boolean_decision(self, decision: bool) -> bool: + """Reduce a boolean decision across all processes.""" + return decision + + def save_checkpoint( + self, checkpoint: Dict[str, Any], filepath: _PATH, storage_options: Optional[Any] = None + ) -> None: + """Save model/training states as a checkpoint file through state-dump and file-write. + + Args: + checkpoint: dict containing model and trainer state + filepath: write-target file's path + storage_options: parameter for how to save to storage, passed to ``CheckpointIO`` plugin + """ + if self.is_global_zero: + self.checkpoint_io.save_checkpoint(checkpoint, filepath, storage_options=storage_options) + + def get_module_state_dict(self, module: Module) -> Dict[str, Union[Any, Tensor]]: + """Returns model state.""" + return module.state_dict() + + def get_optimizer_state(self, optimizer: Optimizer) -> Dict[str, Tensor]: + """Returns state of an optimizer. + + Allows for syncing/collating optimizer state from processes in custom plugins. + """ + return optimizer.state_dict() + + def load_checkpoint(self, checkpoint_path: _PATH) -> Dict[str, Any]: + torch.cuda.empty_cache() + return self.checkpoint_io.load_checkpoint(checkpoint_path) + + def load_module_state_dict(self, module: Module, checkpoint: Mapping[str, Any]) -> None: + module.load_state_dict(checkpoint["state_dict"]) + + def load_optimizer_state_dict( + self, optimizers: Union[Optimizer, Iterable[Optimizer]], checkpoint: Mapping[str, Any] + ) -> None: + if not isinstance(optimizers, Iterable): + optimizers = [optimizers] + optimizer_states = checkpoint["optimizer_states"] + for optimizer, opt_state in zip(optimizers, optimizer_states): + optimizer.load_state_dict(opt_state) + optimizer_to_device(optimizer, self.root_device) + + def remove_checkpoint(self, filepath: _PATH) -> None: + """Remove checkpoint filepath from the filesystem. 
+ + Args: + filepath: Path to checkpoint + """ + if self.is_global_zero: + self.checkpoint_io.remove_checkpoint(filepath) + + def teardown(self) -> None: + """This method is called to teardown the training process. + + It is the right place to release memory and free other resources. + """ + self.precision_plugin.teardown() + assert self.accelerator is not None + self.accelerator.teardown() + self.checkpoint_io.teardown() + + @classmethod + def register_strategies(cls, strategy_registry: Dict[str, Any]) -> None: + pass diff --git a/src/pytorch_lightning/strategies/__init__.py b/src/pytorch_lightning/strategies/__init__.py index a85d1064e988d..63bb4354e00d1 100644 --- a/src/pytorch_lightning/strategies/__init__.py +++ b/src/pytorch_lightning/strategies/__init__.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from lightning_lite.strategies.registry import _StrategyRegistry from pytorch_lightning.strategies.bagua import BaguaStrategy # noqa: F401 from pytorch_lightning.strategies.ddp import DDPStrategy # noqa: F401 from pytorch_lightning.strategies.ddp_spawn import DDPSpawnStrategy # noqa: F401 @@ -29,9 +30,9 @@ from pytorch_lightning.strategies.single_hpu import SingleHPUStrategy # noqa: F401 from pytorch_lightning.strategies.single_tpu import SingleTPUStrategy # noqa: F401 from pytorch_lightning.strategies.strategy import Strategy # noqa: F401 -from pytorch_lightning.strategies.strategy_registry import call_register_strategies, StrategyRegistry # noqa: F401 from pytorch_lightning.strategies.tpu_spawn import TPUSpawnStrategy # noqa: F401 +from pytorch_lightning.strategies.utils import _call_register_strategies -STRATEGIES_BASE_MODULE = "pytorch_lightning.strategies" - -call_register_strategies(STRATEGIES_BASE_MODULE) +_STRATEGIES_BASE_MODULE = "pytorch_lightning.strategies" +StrategyRegistry = _StrategyRegistry() +_call_register_strategies(StrategyRegistry, _STRATEGIES_BASE_MODULE) diff --git a/src/pytorch_lightning/strategies/utils.py b/src/pytorch_lightning/strategies/utils.py index 3c3ebbe241811..6a8f8ae19f4e4 100644 --- a/src/pytorch_lightning/strategies/utils.py +++ b/src/pytorch_lightning/strategies/utils.py @@ -11,11 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
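The ``Strategy`` base class introduced above keeps its abstract surface small: a concrete strategy only has to provide ``root_device``, ``is_global_zero``, ``module_to_device`` and the four collectives (``reduce``, ``barrier``, ``broadcast``, ``all_gather``); everything else has a default. The following sketch is illustrative only (``IdentityStrategy`` is a made-up name, not part of this patch) and simply mirrors the single-process behaviour of ``SingleDeviceStrategy``:

.. code-block:: python

    from typing import Any, Optional, Union

    import torch
    from torch import Tensor
    from torch.nn import Module

    from lightning_lite.strategies import Strategy
    from lightning_lite.utilities.distributed import ReduceOp


    class IdentityStrategy(Strategy):
        """Hypothetical single-process strategy; every collective is a no-op."""

        @property
        def root_device(self) -> torch.device:
            return torch.device("cpu")

        @property
        def is_global_zero(self) -> bool:
            return True

        def module_to_device(self, module: Module) -> None:
            module.to(self.root_device)

        def reduce(
            self,
            tensor: Union[Tensor, Any],
            group: Optional[Any] = None,
            reduce_op: Optional[Union[ReduceOp, str]] = "mean",
        ) -> Union[Tensor, Any]:
            return tensor  # nothing to reduce in a single process

        def barrier(self, name: Optional[str] = None) -> None:
            pass

        def broadcast(self, obj: Any, src: int = 0) -> Any:
            return obj

        def all_gather(self, tensor: Tensor, group: Optional[Any] = None, sync_grads: bool = False) -> Tensor:
            return tensor

Instantiating such a subclass and calling ``setup_module_and_optimizers`` then exercises only the defaults inherited from ``Strategy``.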
+import importlib import os +from inspect import getmembers, isclass import torch +from lightning_lite.strategies import _StrategyRegistry from lightning_lite.utilities.enums import PrecisionType +from lightning_lite.utilities.registry import _is_register_method_overridden +from pytorch_lightning.strategies.strategy import Strategy from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation @@ -34,3 +39,11 @@ def _fp_to_half(tensor: torch.Tensor, precision: PrecisionType) -> torch.Tensor: return tensor.bfloat16() return tensor + + +def _call_register_strategies(registry: _StrategyRegistry, base_module: str) -> None: + # TODO(lite): Remove this function once PL strategies inherit from Lite's Strategy base class + module = importlib.import_module(base_module) + for _, mod in getmembers(module, isclass): + if issubclass(mod, Strategy) and _is_register_method_overridden(mod, Strategy, "register_strategies"): + mod.register_strategies(registry) diff --git a/tests/tests_lite/strategies/test_registry.py b/tests/tests_lite/strategies/test_registry.py new file mode 100644 index 0000000000000..7d6edfd449b8d --- /dev/null +++ b/tests/tests_lite/strategies/test_registry.py @@ -0,0 +1,44 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from lightning_lite.strategies import STRATEGY_REGISTRY + + +def test_strategy_registry_with_new_strategy(): + class TestStrategy: + + strategy_name = "test_strategy" + + def __init__(self, param1, param2): + self.param1 = param1 + self.param2 = param2 + + strategy_name = "test_strategy" + strategy_description = "Test Strategy" + + # TODO(lite): Registering classes that do not inherit from Strategy should not be allowed + STRATEGY_REGISTRY.register(strategy_name, TestStrategy, description=strategy_description, param1="abc", param2=123) + + assert strategy_name in STRATEGY_REGISTRY + assert STRATEGY_REGISTRY[strategy_name]["description"] == strategy_description + assert STRATEGY_REGISTRY[strategy_name]["init_params"] == {"param1": "abc", "param2": 123} + assert STRATEGY_REGISTRY[strategy_name]["strategy_name"] == "test_strategy" + assert isinstance(STRATEGY_REGISTRY.get(strategy_name), TestStrategy) + + STRATEGY_REGISTRY.remove(strategy_name) + assert strategy_name not in STRATEGY_REGISTRY + + +def test_available_strategies_in_registry(): + assert STRATEGY_REGISTRY.available_strategies() == [] diff --git a/tests/tests_lite/strategies/test_single_device.py b/tests/tests_lite/strategies/test_single_device.py new file mode 100644 index 0000000000000..ccf34d43b25b9 --- /dev/null +++ b/tests/tests_lite/strategies/test_single_device.py @@ -0,0 +1,54 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
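``_call_register_strategies`` above walks the classes of the base module and lets every subclass that overrides ``register_strategies`` add itself to the registry, which is what the ``STRATEGY_REGISTRY`` tests exercise. A rough sketch of that hand-shake, assuming ``SingleDeviceStrategy`` accepts a ``device`` keyword as its tests suggest (``MyCPUStrategy`` and the entry name are hypothetical):

.. code-block:: python

    from typing import Dict

    from lightning_lite.strategies import STRATEGY_REGISTRY
    from lightning_lite.strategies.single_device import SingleDeviceStrategy


    class MyCPUStrategy(SingleDeviceStrategy):  # hypothetical example class
        @classmethod
        def register_strategies(cls, strategy_registry: Dict) -> None:
            strategy_registry.register("my_cpu", cls, description="example entry", device="cpu")


    # Classes inside ``lightning_lite.strategies`` are picked up automatically by
    # ``_call_register_strategies``; anything defined elsewhere can register manually:
    MyCPUStrategy.register_strategies(STRATEGY_REGISTRY)
    assert "my_cpu" in STRATEGY_REGISTRY
    assert STRATEGY_REGISTRY["my_cpu"]["description"] == "example entry"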
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from unittest.mock import Mock + +import pytest +import torch + +from lightning_lite.strategies import SingleDeviceStrategy + + +def test_single_device_default_device(): + assert SingleDeviceStrategy().root_device == torch.device("cpu") + + +@pytest.mark.parametrize("device", ["cpu", torch.device("cpu"), "cuda:1", torch.device("cuda")]) +def test_single_device_root_device(device): + assert SingleDeviceStrategy(device).root_device == torch.device(device) + + +@pytest.mark.parametrize("device", [torch.device("cpu"), torch.device("cuda", 3)]) +def test_single_device_ranks(device): + strategy = SingleDeviceStrategy(device) + assert strategy.world_size == 1 + assert strategy.local_rank == 0 + assert strategy.global_rank == 0 + assert strategy.is_global_zero + + +def test_single_device_collectives(): + """Test that collectives in the single-device strategy act as the identity.""" + strategy = SingleDeviceStrategy() + tensor = Mock() + assert strategy.all_gather(tensor) == tensor + assert strategy.reduce(tensor) == tensor + assert strategy.broadcast(tensor) == tensor + + +def test_single_device_module_to_device(): + strategy = SingleDeviceStrategy() + strategy._root_device = Mock() + module = Mock(spec=torch.nn.Module) + strategy.module_to_device(module) + module.to.assert_called_with(strategy.root_device) diff --git a/tests/tests_pytorch/strategies/test_strategy_registry.py b/tests/tests_pytorch/strategies/test_registry.py similarity index 85% rename from tests/tests_pytorch/strategies/test_strategy_registry.py rename to tests/tests_pytorch/strategies/test_registry.py index 9cabe33b9f037..dcb182b657c49 100644 --- a/tests/tests_pytorch/strategies/test_strategy_registry.py +++ b/tests/tests_pytorch/strategies/test_registry.py @@ -28,30 +28,6 @@ from tests_pytorch.helpers.runif import RunIf -def test_strategy_registry_with_new_strategy(): - class TestStrategy: - - strategy_name = "test_strategy" - - def __init__(self, param1, param2): - self.param1 = param1 - self.param2 = param2 - - strategy_name = "test_strategy" - strategy_description = "Test Strategy" - - StrategyRegistry.register(strategy_name, TestStrategy, description=strategy_description, param1="abc", param2=123) - - assert strategy_name in StrategyRegistry - assert StrategyRegistry[strategy_name]["description"] == strategy_description - assert StrategyRegistry[strategy_name]["init_params"] == {"param1": "abc", "param2": 123} - assert StrategyRegistry[strategy_name]["strategy_name"] == "test_strategy" - assert isinstance(StrategyRegistry.get(strategy_name), TestStrategy) - - StrategyRegistry.remove(strategy_name) - assert strategy_name not in StrategyRegistry - - @pytest.mark.parametrize( "strategy_name, init_params", [ From 48e783dd0d17b813a6389b53baf182f0aa20347b Mon Sep 17 00:00:00 2001 From: Manan Goel Date: Wed, 14 Sep 2022 07:11:52 -0700 Subject: [PATCH 147/193] Added support for downloading wandb artifacts in the WandbLogger (#14551) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Added functions to the WandbLogger to download and use artifacts without having to access the 
experiment object * Updated CHANGLELOG.md * Added suggested changes * Delete test_script Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Carlos Mocholí Co-authored-by: awaelchli --- src/pytorch_lightning/CHANGELOG.md | 3 + src/pytorch_lightning/loggers/wandb.py | 69 +++++++++++++++++++++++ tests/tests_pytorch/loggers/test_wandb.py | 17 ++++++ 3 files changed, 89 insertions(+) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 2c58a555a8d6f..128a7b7ee0d6d 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -37,6 +37,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). [#14620](https://github.com/Lightning-AI/lightning/issues/14620)) +- Added `WandbLogger.download_artifact` and `WandbLogger.use_artifact` for managing artifacts with Weights and Biases ([#14551](https://github.com/Lightning-AI/lightning/issues/14551)) + + ### Changed - The `Trainer.{fit,validate,test,predict,tune}` methods now raise a useful error message if the input is not a `LightningModule` ([#13892](https://github.com/Lightning-AI/lightning/pull/13892)) diff --git a/src/pytorch_lightning/loggers/wandb.py b/src/pytorch_lightning/loggers/wandb.py index 396bc49ea0e70..6284062ebacc5 100644 --- a/src/pytorch_lightning/loggers/wandb.py +++ b/src/pytorch_lightning/loggers/wandb.py @@ -217,6 +217,36 @@ def __init__(self, *args, **kwarg): data = [["cheese", wandb.Image(img_1), wandb.Audio(snd_1)], ["wine", wandb.Image(img_2), wandb.Audio(snd_2)]] wandb_logger.log_table(key="samples", columns=columns, data=data) + + **Downloading and Using Artifacts** + + To download an artifact without starting a run, call the ``download_artifact`` + function on the class: + + .. code-block:: python + + from pytorch_lightning.loggers import WandbLogger + + artifact_dir = WandbLogger.download_artifact(artifact="path/to/artifact") + + To download an artifact and link it to an ongoing run call the ``download_artifact`` + function on the logger instance: + + .. code-block:: python + + class MyModule(LightningModule): + def any_lightning_module_function_or_hook(self): + self.logger.download_artifact(artifact="path/to/artifact") + + To link an artifact from a previous run you can use ``use_artifact`` function: + + .. code-block:: python + + from pytorch_lightning.loggers import WandbLogger + + wandb_logger = WandbLogger(project="my_project", name="my_run") + wandb_logger.use_artifact(artifact="path/to/artifact") + See Also: - `Demo in Google Colab `__ with hyperparameter search and model logging - `W&B Documentation `__ @@ -481,6 +511,45 @@ def after_save_checkpoint(self, checkpoint_callback: "ReferenceType[Checkpoint]" elif self._log_model is True: self._checkpoint_callback = checkpoint_callback + @staticmethod + @rank_zero_only + def download_artifact( + artifact: str, + save_dir: Optional[str] = None, + artifact_type: Optional[str] = None, + use_artifact: Optional[bool] = True, + ) -> str: + """Downloads an artifact from the wandb server. + + Args: + artifact: The path of the artifact to download. + save_dir: The directory to save the artifact to. + artifact_type: The type of artifact to download. + use_artifact: Whether to add an edge between the artifact graph. + + Returns: + The path to the downloaded artifact. 
+ """ + if wandb.run is not None and use_artifact: + artifact = wandb.run.use_artifact(artifact) + else: + api = wandb.Api() + artifact = api.artifact(artifact, type=artifact_type) + + return artifact.download(root=save_dir) + + def use_artifact(self, artifact: str, artifact_type: Optional[str] = None) -> "wandb.Artifact": + """Logs to the wandb dashboard that the mentioned artifact is used by the run. + + Args: + artifact: The path of the artifact. + artifact_type: The type of artifact being used. + + Returns: + wandb Artifact object for the artifact. + """ + return self.experiment.use_artifact(artifact, type=artifact_type) + @rank_zero_only def finalize(self, status: str) -> None: # log checkpoints as artifacts diff --git a/tests/tests_pytorch/loggers/test_wandb.py b/tests/tests_pytorch/loggers/test_wandb.py index b408046c9e5d2..8b87af59c6f7e 100644 --- a/tests/tests_pytorch/loggers/test_wandb.py +++ b/tests/tests_pytorch/loggers/test_wandb.py @@ -322,3 +322,20 @@ def test_wandb_logger_offline_log_model(wandb, tmpdir): """Test that log_model=True raises an error in offline mode.""" with pytest.raises(MisconfigurationException, match="checkpoints cannot be uploaded in offline mode"): _ = WandbLogger(save_dir=str(tmpdir), offline=True, log_model=True) + + +@mock.patch("pytorch_lightning.loggers.wandb.Run", object) +@mock.patch("pytorch_lightning.loggers.wandb.wandb") +def test_wandb_logger_download_artifact(wandb, tmpdir): + """Test that download_artifact works.""" + + wandb.run = wandb.init() + logger = WandbLogger() + logger.download_artifact("test_artifact", str(tmpdir), "model", True) + wandb.run.use_artifact.assert_called_once_with("test_artifact") + + wandb.run = None + + WandbLogger.download_artifact("test_artifact", str(tmpdir), "model", True) + + wandb.Api().artifact.assert_called_once_with("test_artifact", type="model") From 32cb774a5c0cdbc4e4594cd4b99a71174637aa4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 14 Sep 2022 16:22:07 +0200 Subject: [PATCH 148/193] Standalone Lite: Single Device TPU Strategy (#14663) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí Co-authored-by: Jirka Borovec --- src/lightning_lite/strategies/__init__.py | 1 + src/lightning_lite/strategies/single_tpu.py | 58 ++++++++++++++++++++ tests/tests_lite/strategies/test_registry.py | 4 +- 3 files changed, 62 insertions(+), 1 deletion(-) create mode 100644 src/lightning_lite/strategies/single_tpu.py diff --git a/src/lightning_lite/strategies/__init__.py b/src/lightning_lite/strategies/__init__.py index b76af7a22df17..cdbcb6bad1f00 100644 --- a/src/lightning_lite/strategies/__init__.py +++ b/src/lightning_lite/strategies/__init__.py @@ -14,6 +14,7 @@ from lightning_lite.strategies.parallel import ParallelStrategy # noqa: F401 from lightning_lite.strategies.registry import _call_register_strategies, _StrategyRegistry from lightning_lite.strategies.single_device import SingleDeviceStrategy # noqa: F401 +from lightning_lite.strategies.single_tpu import SingleTPUStrategy # noqa: F401 from lightning_lite.strategies.strategy import Strategy # noqa: F401 STRATEGY_REGISTRY = _StrategyRegistry() diff --git a/src/lightning_lite/strategies/single_tpu.py b/src/lightning_lite/strategies/single_tpu.py new file mode 100644 index 0000000000000..fe1dad21a7d50 --- /dev/null +++ b/src/lightning_lite/strategies/single_tpu.py @@ -0,0 +1,58 @@ +# Copyright The PyTorch Lightning team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Dict, Optional + +from lightning_lite.accelerators import Accelerator +from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO +from lightning_lite.plugins.io.xla_plugin import XLACheckpointIO +from lightning_lite.plugins.precision import Precision +from lightning_lite.strategies.single_device import SingleDeviceStrategy + + +class SingleTPUStrategy(SingleDeviceStrategy): + """Strategy for training on a single TPU device.""" + + def __init__( + self, + device: int, + accelerator: Optional[Accelerator] = None, + checkpoint_io: Optional[CheckpointIO] = None, + precision_plugin: Optional[Precision] = None, + ): + import torch_xla.core.xla_model as xm + + super().__init__( + accelerator=accelerator, + device=xm.xla_device(device), + checkpoint_io=checkpoint_io, + precision_plugin=precision_plugin, + ) + + @property + def checkpoint_io(self) -> CheckpointIO: + if self._checkpoint_io is None: + self._checkpoint_io = XLACheckpointIO() + return self._checkpoint_io + + @checkpoint_io.setter + def checkpoint_io(self, io: Optional[CheckpointIO]) -> None: + self._checkpoint_io = io + + @property + def is_distributed(self) -> bool: + return False + + @classmethod + def register_strategies(cls, strategy_registry: Dict) -> None: + strategy_registry.register("single_tpu", cls, description=f"{cls.__class__.__name__}") diff --git a/tests/tests_lite/strategies/test_registry.py b/tests/tests_lite/strategies/test_registry.py index 7d6edfd449b8d..d94198d5710ce 100644 --- a/tests/tests_lite/strategies/test_registry.py +++ b/tests/tests_lite/strategies/test_registry.py @@ -41,4 +41,6 @@ def __init__(self, param1, param2): def test_available_strategies_in_registry(): - assert STRATEGY_REGISTRY.available_strategies() == [] + assert STRATEGY_REGISTRY.available_strategies() == [ + "single_tpu", + ] From c2378bd3b17fbfd3d5ce5fdfb72e258262ecfd2a Mon Sep 17 00:00:00 2001 From: Benjamin Krala <56834877+KralaBenjamin@users.noreply.github.com> Date: Wed, 14 Sep 2022 17:46:34 +0200 Subject: [PATCH 149/193] Add documentation for trainer.datamodule and dataloaders of a trainer object (#14600) * Update trainer.rst * Update datamodule.rst --- docs/source-pytorch/common/trainer.rst | 55 +++++++++++++++++++++++++ docs/source-pytorch/data/datamodule.rst | 4 ++ 2 files changed, 59 insertions(+) diff --git a/docs/source-pytorch/common/trainer.rst b/docs/source-pytorch/common/trainer.rst index cc8b57a182988..caad2a9f21a42 100644 --- a/docs/source-pytorch/common/trainer.rst +++ b/docs/source-pytorch/common/trainer.rst @@ -1597,6 +1597,16 @@ The number of epochs run. if trainer.current_epoch >= 10: ... + +datamodule +********** + +The current datamodule, which is used by the trainer. + +.. code-block:: python + + used_datamodule = trainer.datamodule + is_last_batch ************* @@ -1694,6 +1704,17 @@ The metrics sent to the progress bar. 
assert progress_bar_metrics["a_val"] == 2 +predict_dataloaders +******************* + +The current predict dataloaders of the trainer. +Note that property returns a list of predict dataloaders. + +.. code-block:: python + + used_predict_dataloaders = trainer.predict_dataloaders + + estimated_stepping_batches ************************** @@ -1773,3 +1794,37 @@ both conditions are met. If any of these arguments is not set, it won't be consi trainer = Trainer(min_steps=5, min_epochs=5, max_epochs=100) model = LitModel() trainer.fit(model) + + +train_dataloader +**************** + +The current train dataloader of the trainer. + +.. code-block:: python + + used_train_dataloader = trainer.train_dataloader + + +test_dataloaders +**************** + +The current test dataloaders of the trainer. +Note that property returns a list of test dataloaders. + + +.. code-block:: python + + used_test_dataloaders = trainer.test_dataloaders + +val_dataloaders +*************** + + +The current val dataloaders of the trainer. +Note that property returns a list of val dataloaders. + + +.. code-block:: python + + used_val_dataloaders = trainer.val_dataloaders diff --git a/docs/source-pytorch/data/datamodule.rst b/docs/source-pytorch/data/datamodule.rst index fbee2e80e4ea2..4a6bce702937e 100644 --- a/docs/source-pytorch/data/datamodule.rst +++ b/docs/source-pytorch/data/datamodule.rst @@ -412,6 +412,10 @@ the method runs on the correct devices). dm.setup(stage="test") trainer.test(datamodule=dm) +You can access the current used datamodule of a trainer via ``trainer.datamodule`` and the current used +dataloaders via ``trainer.train_dataloader``, ``trainer.val_dataloaders`` and ``trainer.test_dataloaders``. + + ---------------- ***************************** From 7867d152b343d0bcd867f357dfd59312fff32802 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 15 Sep 2022 01:27:53 +0200 Subject: [PATCH 150/193] Standalone Lite: DataParallel Strategy (#14681) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí Co-authored-by: Jirka Borovec --- src/lightning_lite/strategies/__init__.py | 1 + src/lightning_lite/strategies/dp.py | 84 ++++++++++++++++++++ tests/tests_lite/strategies/test_dp.py | 50 ++++++++++++ tests/tests_lite/strategies/test_registry.py | 5 +- 4 files changed, 138 insertions(+), 2 deletions(-) create mode 100644 src/lightning_lite/strategies/dp.py create mode 100644 tests/tests_lite/strategies/test_dp.py diff --git a/src/lightning_lite/strategies/__init__.py b/src/lightning_lite/strategies/__init__.py index cdbcb6bad1f00..eb02cb01b977b 100644 --- a/src/lightning_lite/strategies/__init__.py +++ b/src/lightning_lite/strategies/__init__.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from lightning_lite.strategies.dp import DataParallelStrategy # noqa: F401 from lightning_lite.strategies.parallel import ParallelStrategy # noqa: F401 from lightning_lite.strategies.registry import _call_register_strategies, _StrategyRegistry from lightning_lite.strategies.single_device import SingleDeviceStrategy # noqa: F401 diff --git a/src/lightning_lite/strategies/dp.py b/src/lightning_lite/strategies/dp.py new file mode 100644 index 0000000000000..8ecc239356415 --- /dev/null +++ b/src/lightning_lite/strategies/dp.py @@ -0,0 +1,84 @@ +# Copyright The PyTorch Lightning team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any, Dict, List, Optional, Union + +import torch +from torch import Tensor +from torch.nn import DataParallel, Module + +from lightning_lite.accelerators import Accelerator +from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO +from lightning_lite.plugins.precision import Precision +from lightning_lite.strategies.parallel import ParallelStrategy +from lightning_lite.strategies.strategy import TBroadcast, TReduce +from lightning_lite.utilities.apply_func import apply_to_collection +from lightning_lite.utilities.distributed import ReduceOp + + +class DataParallelStrategy(ParallelStrategy): + """Implements data-parallel training in a single process, i.e., the model gets replicated to each device and + each gets a split of the data.""" + + def __init__( + self, + accelerator: Optional[Accelerator] = None, + parallel_devices: Optional[List[torch.device]] = None, + checkpoint_io: Optional[CheckpointIO] = None, + precision_plugin: Optional[Precision] = None, + ): + super().__init__( + accelerator=accelerator, + parallel_devices=parallel_devices, + cluster_environment=None, + checkpoint_io=checkpoint_io, + precision_plugin=precision_plugin, + ) + + @property + def root_device(self) -> torch.device: + assert self.parallel_devices is not None + return self.parallel_devices[0] + + def setup_module(self, module: Module) -> DataParallel: + """Wraps the given model into a :class:`~torch.nn.parallel.DataParallel` module.""" + return DataParallel(module=module, device_ids=self.parallel_devices) + + def module_to_device(self, module: Module) -> None: + module.to(self.root_device) + + def batch_to_device(self, batch: Any, device: Optional[torch.device] = None) -> Any: + # DataParallel handles the transfer of batch to the device + return batch + + def reduce( + self, collection: TReduce, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = "mean" + ) -> TReduce: + def mean(t: Tensor) -> Tensor: + original_dtype = t.dtype + return t.float().mean().to(original_dtype) + + return apply_to_collection(collection, Tensor, mean) + + def barrier(self, *args: Any, **kwargs: Any) -> None: + pass + + def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast: + return obj + + def reduce_boolean_decision(self, decision: bool) -> bool: + return decision + + @classmethod + def register_strategies(cls, strategy_registry: Dict) -> None: + strategy_registry.register("dp", cls, description=cls.__class__.__name__) diff --git a/tests/tests_lite/strategies/test_dp.py b/tests/tests_lite/strategies/test_dp.py new file mode 100644 index 0000000000000..12a98d8e463b4 --- /dev/null +++ b/tests/tests_lite/strategies/test_dp.py @@ -0,0 +1,50 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
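For orientation, the ``reduce`` of the new ``DataParallelStrategy`` above is a purely local operation: it takes the mean of every tensor in a collection and casts it back to the original dtype. A minimal illustrative check, assuming only the behaviour visible in the diff above:

.. code-block:: python

    import torch

    from lightning_lite.strategies import DataParallelStrategy

    strategy = DataParallelStrategy()
    reduced = strategy.reduce({"loss": torch.tensor([2.0, 4.0]), "step": torch.tensor([1, 3])})
    print(reduced["loss"])  # tensor(3.)  float tensors are averaged
    print(reduced["step"])  # tensor(2)   the mean is cast back to the original dtype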
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from unittest import mock +from unittest.mock import Mock + +import torch + +from lightning_lite.strategies import DataParallelStrategy + + +def test_data_parallel_root_device(): + strategy = DataParallelStrategy() + strategy.parallel_devices = [torch.device("cuda", 2), torch.device("cuda", 0), torch.device("cuda", 1)] + assert strategy.root_device == torch.device("cuda", 2) + + +def test_data_parallel_ranks(): + strategy = DataParallelStrategy() + assert strategy.world_size == 1 + assert strategy.local_rank == 0 + assert strategy.global_rank == 0 + assert strategy.is_global_zero + + +@mock.patch("lightning_lite.strategies.dp.DataParallel") +def test_data_parallel_setup_module(data_parallel_mock): + strategy = DataParallelStrategy() + strategy.parallel_devices = [0, 2, 1] + module = torch.nn.Linear(2, 2) + wrapped_module = strategy.setup_module(module) + assert wrapped_module == data_parallel_mock(module=module, device_ids=[0, 2, 1]) + + +def test_data_parallel_module_to_device(): + strategy = DataParallelStrategy() + strategy.parallel_devices = [torch.device("cuda", 2)] + module = Mock() + strategy.module_to_device(module) + module.to.assert_called_with(torch.device("cuda", 2)) diff --git a/tests/tests_lite/strategies/test_registry.py b/tests/tests_lite/strategies/test_registry.py index d94198d5710ce..76a6bea00f249 100644 --- a/tests/tests_lite/strategies/test_registry.py +++ b/tests/tests_lite/strategies/test_registry.py @@ -41,6 +41,7 @@ def __init__(self, param1, param2): def test_available_strategies_in_registry(): - assert STRATEGY_REGISTRY.available_strategies() == [ + assert set(STRATEGY_REGISTRY.available_strategies()) == { + "dp", "single_tpu", - ] + } From 8b3d6d8febd943483a7ce30d7e507c565ff883da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 15 Sep 2022 01:29:23 +0200 Subject: [PATCH 151/193] Add easy access to `state_dict` in Lite module wrapper (#14629) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jirka Borovec Co-authored-by: Carlos Mocholí --- src/pytorch_lightning/CHANGELOG.md | 3 +++ src/pytorch_lightning/lite/wrappers.py | 25 ++++++++++++++++++- tests/tests_pytorch/lite/test_wrappers.py | 29 ++++++++++++++++++++++- 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 128a7b7ee0d6d..6c28bb3a92efa 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -69,6 +69,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- When using multiple loggers, by default checkpoints and profiler output now get saved to the log dir of the first logger in the list ([#14325](https://github.com/Lightning-AI/lightning/pull/14325)) +- In Lightning Lite, state-dict access to the module wrapper now gets passed through to the original module reference ([#14629](https://github.com/Lightning-AI/lightning/pull/14629)) + + - Improved the error messaging when passing `Trainer.method(model, x_dataloader=None)` with no module-method implementations available ([#14614](https://github.com/Lightning-AI/lightning/pull/14614)) diff --git a/src/pytorch_lightning/lite/wrappers.py b/src/pytorch_lightning/lite/wrappers.py index f6d75941aab45..29a0c17341666 100644 --- a/src/pytorch_lightning/lite/wrappers.py +++ b/src/pytorch_lightning/lite/wrappers.py @@ -11,12 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Callable, Dict, Generator, Iterator, Optional, Union +from typing import Any, Callable, Dict, Generator, Iterator, Mapping, Optional, overload, TypeVar, Union import torch from lightning_utilities.core.apply_func import apply_to_collection from torch import nn as nn from torch import Tensor +from torch.nn.modules.module import _IncompatibleKeys from torch.optim import Optimizer from torch.utils.data import DataLoader @@ -25,6 +26,8 @@ from pytorch_lightning.plugins import PrecisionPlugin from pytorch_lightning.strategies import Strategy +T_destination = TypeVar("T_destination", bound=Dict[str, Any]) + def _do_nothing_closure() -> None: return None @@ -115,6 +118,26 @@ def _convert_float_tensor(t: Tensor) -> Tensor: output = apply_to_collection(output, function=_convert_float_tensor, dtype=Tensor) return output + @overload + def state_dict(self, *, destination: T_destination, prefix: str = ..., keep_vars: bool = ...) -> T_destination: + ... + + @overload + def state_dict(self, *, prefix: str = ..., keep_vars: bool = ...) -> T_destination: + ... 
+ + def state_dict( + self, destination: Optional[T_destination] = None, prefix: str = "", keep_vars: bool = False + ) -> Optional[T_destination]: + return self._original_module.state_dict( + destination=destination, # type: ignore[type-var] + prefix=prefix, + keep_vars=keep_vars, + ) + + def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True) -> _IncompatibleKeys: + return self._original_module.load_state_dict(state_dict=state_dict, strict=strict) + def __getattr__(self, item: Any) -> Any: try: # __getattr__ gets called as a last resort if the attribute does not exist diff --git a/tests/tests_pytorch/lite/test_wrappers.py b/tests/tests_pytorch/lite/test_wrappers.py index 957c321dd1b40..acc05cfdcda8f 100644 --- a/tests/tests_pytorch/lite/test_wrappers.py +++ b/tests/tests_pytorch/lite/test_wrappers.py @@ -39,7 +39,7 @@ def test_lite_module_wraps(): def test_lite_module_attribute_lookup(): - """Test that attribute lookup passes through to the original model when possible.""" + """Test that attribute lookup passes through to the original module when possible.""" class OriginalModule(torch.nn.Module): def __init__(self): @@ -69,6 +69,33 @@ def __init__(self): _ = lite_module.not_exists +def test_lite_module_state_dict_access(): + """Test that state_dict access passes through to the original module.""" + + class OriginalModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.layer = torch.nn.Linear(2, 3) + + original_module = OriginalModule() + + class ModuleWrapper(torch.nn.Module): + def __init__(self): + super().__init__() + self.wrapped = original_module + + wrapped_module = ModuleWrapper() + + lite_module = _LiteModule(wrapped_module, Mock(), original_module=original_module) + state_dict = lite_module.state_dict() + assert set(state_dict.keys()) == {"layer.weight", "layer.bias"} + + weight, bias = torch.rand(3, 2), torch.rand(3) + lite_module.load_state_dict({"layer.weight": weight, "layer.bias": bias}) + assert torch.equal(lite_module.layer.weight, weight) + assert torch.equal(lite_module.layer.bias, bias) + + @pytest.mark.parametrize( "precision, input_type, expected_type, accelerator, device_str", [ From 98cf134bb3d80c40ab07b0dcd3f1aa42537c5e78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 15 Sep 2022 02:08:11 +0200 Subject: [PATCH 152/193] Add the probot check-group action (#14621) Co-authored-by: Jirka Borovec --- .github/workflows/probot-check-group.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 .github/workflows/probot-check-group.yml diff --git a/.github/workflows/probot-check-group.yml b/.github/workflows/probot-check-group.yml new file mode 100644 index 0000000000000..2e69ff59b0779 --- /dev/null +++ b/.github/workflows/probot-check-group.yml @@ -0,0 +1,20 @@ +name: Probot + +on: + pull_request: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} + cancel-in-progress: true + +jobs: + required-jobs: + runs-on: ubuntu-latest + # if this timeout triggers, then the job needs to be manually restarted through the GitHub interface + timeout-minutes: 60 + steps: + - uses: carmocca/probot@v2 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + interval: 180 # seconds From deca6cc5c454752d88586db1d29bc95c48eae4a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 15 Sep 2022 03:36:17 +0200 Subject: [PATCH 153/193] Standalone Lite: DDP Strategy Family (#14670) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí Co-authored-by: Jirka Borovec --- src/lightning_lite/plugins/precision/utils.py | 27 + src/lightning_lite/strategies/__init__.py | 3 + src/lightning_lite/strategies/ddp.py | 180 +++++ src/lightning_lite/strategies/deepspeed.py | 630 ++++++++++++++++++ src/lightning_lite/strategies/fairscale.py | 122 ++++ .../strategies/launchers/base.py | 2 +- src/lightning_lite/utilities/imports.py | 3 - src/pytorch_lightning/strategies/deepspeed.py | 2 +- src/pytorch_lightning/strategies/ipu.py | 2 +- src/pytorch_lightning/strategies/utils.py | 13 - tests/tests_lite/helpers/runif.py | 10 +- tests/tests_lite/strategies/test_ddp.py | 42 ++ tests/tests_lite/strategies/test_deepspeed.py | 76 +++ tests/tests_lite/strategies/test_fairscale.py | 28 + tests/tests_lite/strategies/test_registry.py | 11 + tests/tests_lite/utilities/test_imports.py | 16 +- 16 files changed, 1139 insertions(+), 28 deletions(-) create mode 100644 src/lightning_lite/plugins/precision/utils.py create mode 100644 src/lightning_lite/strategies/ddp.py create mode 100644 src/lightning_lite/strategies/deepspeed.py create mode 100644 src/lightning_lite/strategies/fairscale.py create mode 100644 tests/tests_lite/strategies/test_ddp.py create mode 100644 tests/tests_lite/strategies/test_deepspeed.py create mode 100644 tests/tests_lite/strategies/test_fairscale.py diff --git a/src/lightning_lite/plugins/precision/utils.py b/src/lightning_lite/plugins/precision/utils.py new file mode 100644 index 0000000000000..f9af7de5baf75 --- /dev/null +++ b/src/lightning_lite/plugins/precision/utils.py @@ -0,0 +1,27 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +from lightning_lite.utilities.enums import PrecisionType + + +def _fp_to_half(tensor: torch.Tensor, precision: PrecisionType) -> torch.Tensor: + if torch.is_floating_point(tensor): + if precision == PrecisionType.HALF: + return tensor.half() + if precision == PrecisionType.BFLOAT: + return tensor.bfloat16() + + return tensor diff --git a/src/lightning_lite/strategies/__init__.py b/src/lightning_lite/strategies/__init__.py index eb02cb01b977b..8ced098e3a8dd 100644 --- a/src/lightning_lite/strategies/__init__.py +++ b/src/lightning_lite/strategies/__init__.py @@ -11,7 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
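The small ``_fp_to_half`` helper added above only downcasts floating-point tensors and leaves everything else untouched. A quick illustrative check, with the expected dtypes read directly from the function body:

.. code-block:: python

    import torch

    from lightning_lite.plugins.precision.utils import _fp_to_half
    from lightning_lite.utilities.enums import PrecisionType

    assert _fp_to_half(torch.rand(2), PrecisionType.HALF).dtype == torch.half
    assert _fp_to_half(torch.rand(2), PrecisionType.BFLOAT).dtype == torch.bfloat16
    # Integer (non floating-point) tensors are returned unchanged.
    assert _fp_to_half(torch.arange(2), PrecisionType.HALF).dtype == torch.long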
+from lightning_lite.strategies.ddp import DDPStrategy # noqa: F401 +from lightning_lite.strategies.deepspeed import DeepSpeedStrategy # noqa: F401 from lightning_lite.strategies.dp import DataParallelStrategy # noqa: F401 +from lightning_lite.strategies.fairscale import DDPShardedStrategy # noqa: F401 from lightning_lite.strategies.parallel import ParallelStrategy # noqa: F401 from lightning_lite.strategies.registry import _call_register_strategies, _StrategyRegistry from lightning_lite.strategies.single_device import SingleDeviceStrategy # noqa: F401 diff --git a/src/lightning_lite/strategies/ddp.py b/src/lightning_lite/strategies/ddp.py new file mode 100644 index 0000000000000..bd229be91934b --- /dev/null +++ b/src/lightning_lite/strategies/ddp.py @@ -0,0 +1,180 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from datetime import timedelta +from typing import Any, Dict, List, Optional, Union + +import torch +import torch.distributed +from torch import Tensor +from torch.distributed.constants import default_pg_timeout +from torch.nn import Module +from torch.nn.parallel.distributed import DistributedDataParallel + +from lightning_lite.accelerators.accelerator import Accelerator +from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment +from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO +from lightning_lite.plugins.precision import Precision +from lightning_lite.strategies.launchers.subprocess_script import _SubprocessScriptLauncher +from lightning_lite.strategies.parallel import ParallelStrategy +from lightning_lite.strategies.strategy import TBroadcast +from lightning_lite.utilities.distributed import distributed_available, get_default_process_group_backend_for_device +from lightning_lite.utilities.distributed import group as _group +from lightning_lite.utilities.distributed import init_dist_connection, ReduceOp, sync_ddp_if_available +from lightning_lite.utilities.rank_zero import rank_zero_only +from lightning_lite.utilities.seed import reset_seed + + +class DDPStrategy(ParallelStrategy): + """Strategy for multi-process single-device training on one or multiple nodes.""" + + def __init__( + self, + accelerator: Optional[Accelerator] = None, + parallel_devices: Optional[List[torch.device]] = None, + cluster_environment: Optional[ClusterEnvironment] = None, + checkpoint_io: Optional[CheckpointIO] = None, + precision_plugin: Optional[Precision] = None, + process_group_backend: Optional[str] = None, + timeout: Optional[timedelta] = default_pg_timeout, + **kwargs: Any, + ) -> None: + super().__init__( + accelerator=accelerator, + parallel_devices=parallel_devices, + cluster_environment=cluster_environment, + checkpoint_io=checkpoint_io, + precision_plugin=precision_plugin, + ) + self._num_nodes = 1 + self._process_group_backend: Optional[str] = process_group_backend + self._timeout: Optional[timedelta] = timeout + self._ddp_kwargs = kwargs + + @property + def root_device(self) -> torch.device: + assert 
self.parallel_devices is not None + return self.parallel_devices[self.local_rank] + + @property + def is_distributed(self) -> bool: + return True + + @property + def num_nodes(self) -> int: + return self._num_nodes + + @num_nodes.setter + def num_nodes(self, num_nodes: int) -> None: + # note that world ranks is related to num_nodes, when resetting it, need to reset world ranks + self._num_nodes = num_nodes + + @property + def num_processes(self) -> int: + return len(self.parallel_devices) if self.parallel_devices is not None else 0 + + @property + def distributed_sampler_kwargs(self) -> Dict[str, Any]: + distributed_sampler_kwargs = dict(num_replicas=(self.num_nodes * self.num_processes), rank=self.global_rank) + return distributed_sampler_kwargs + + @property + def process_group_backend(self) -> Optional[str]: + return self._process_group_backend + + def _configure_launcher(self) -> None: + assert self.cluster_environment is not None + if not self.cluster_environment.creates_processes_externally: + self._launcher = _SubprocessScriptLauncher(self.cluster_environment, self.num_processes, self.num_nodes) + + def setup_environment(self) -> None: + self._setup_distributed() + super().setup_environment() + + def setup_module(self, module: Module) -> DistributedDataParallel: + """Wraps the model into a :class:`~torch.nn.parallel.distributed.DistributedDataParallel` module.""" + device_ids = self._determine_ddp_device_ids() + return DistributedDataParallel(module=module, device_ids=device_ids, **self._ddp_kwargs) + + def module_to_device(self, module: Module) -> None: + module.to(self.root_device) + + def reduce( + self, tensor: Tensor, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = "mean" + ) -> Tensor: + """Reduces a tensor from several distributed processes to one aggregated tensor. + + Args: + tensor: the tensor to sync and reduce + group: the process group to gather results from. Defaults to all processes (world) + reduce_op: the reduction operation. Defaults to 'mean'/'avg'. + Can also be a string 'sum' to calculate the sum during reduction. 
+ + Return: + reduced value, except when the input was not a tensor the output remains is unchanged + """ + if isinstance(tensor, Tensor): + tensor = sync_ddp_if_available(tensor, group, reduce_op=reduce_op) + return tensor + + def barrier(self, *args: Any, **kwargs: Any) -> None: + if not distributed_available(): + return + if torch.distributed.get_backend() == "nccl": + torch.distributed.barrier(device_ids=self._determine_ddp_device_ids()) + else: + torch.distributed.barrier() + + def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast: + obj = [obj] + if self.global_rank != src: + obj = [None] # type: ignore[list-item] + torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD) + return obj[0] + + @classmethod + def register_strategies(cls, strategy_registry: Dict) -> None: + strategy_registry.register( + "ddp_find_unused_parameters_false", + cls, + description="DDP Strategy with `find_unused_parameters` as False", + find_unused_parameters=False, + ) + strategy_registry.register( + "ddp", + cls, + description=cls.__class__.__name__, + ) + + def _setup_distributed(self) -> None: + reset_seed() + self._set_world_ranks() + rank_zero_only.rank = self.global_rank + self._process_group_backend = self._get_process_group_backend() + assert self.cluster_environment is not None + init_dist_connection(self.cluster_environment, self._process_group_backend, timeout=self._timeout) + + def _get_process_group_backend(self) -> str: + return self._process_group_backend or get_default_process_group_backend_for_device(self.root_device) + + def _set_world_ranks(self) -> None: + if self.cluster_environment is None: + return + self.cluster_environment.set_global_rank(self.node_rank * self.num_processes + self.local_rank) + self.cluster_environment.set_world_size(self.num_nodes * self.num_processes) + rank_zero_only.rank = self.cluster_environment.global_rank() + + def _determine_ddp_device_ids(self) -> Optional[List[int]]: + if self.root_device.type == "cpu": + return None + return [self.root_device.index] diff --git a/src/lightning_lite/strategies/deepspeed.py b/src/lightning_lite/strategies/deepspeed.py new file mode 100644 index 0000000000000..5241d30f97976 --- /dev/null +++ b/src/lightning_lite/strategies/deepspeed.py @@ -0,0 +1,630 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
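``DDPStrategy.register_strategies`` above adds two registry entries, ``ddp`` and ``ddp_find_unused_parameters_false``, the latter pre-seeding the ``find_unused_parameters=False`` keyword that is later forwarded to :class:`~torch.nn.parallel.DistributedDataParallel`. A hedged sketch of looking one up, assuming the registry's ``get`` instantiates the class with the stored init parameters as the registry tests earlier in this series suggest:

.. code-block:: python

    from lightning_lite.strategies import STRATEGY_REGISTRY, DDPStrategy

    assert "ddp" in STRATEGY_REGISTRY
    assert "ddp_find_unused_parameters_false" in STRATEGY_REGISTRY

    # ``get`` is assumed to instantiate the class with the registered init parameters.
    strategy = STRATEGY_REGISTRY.get("ddp_find_unused_parameters_false")
    assert isinstance(strategy, DDPStrategy)
    # Extra kwargs are kept in ``_ddp_kwargs`` and passed to DistributedDataParallel in setup_module.
    assert strategy._ddp_kwargs == {"find_unused_parameters": False}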
+import argparse +import contextlib +import json +import logging +import os +import platform +from pathlib import Path +from typing import Any, Dict, Generator, Iterable, List, Mapping, Optional, Tuple, Union + +import torch +from lightning_utilities.core.imports import RequirementCache +from lightning_utilities.core.rank_zero import rank_zero_only +from torch import Tensor +from torch.nn import Module +from torch.optim import Optimizer + +from lightning_lite.accelerators import Accelerator +from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment +from lightning_lite.plugins.precision import Precision +from lightning_lite.plugins.precision.utils import _fp_to_half +from lightning_lite.strategies.ddp import DDPStrategy +from lightning_lite.utilities.apply_func import apply_to_collection +from lightning_lite.utilities.distributed import get_default_process_group_backend_for_device, log +from lightning_lite.utilities.enums import AMPType, PrecisionType +from lightning_lite.utilities.rank_zero import rank_zero_info +from lightning_lite.utilities.seed import reset_seed +from lightning_lite.utilities.types import _LRScheduler, _PATH, ReduceLROnPlateau + +_DEEPSPEED_AVAILABLE = RequirementCache("deepspeed") +if _DEEPSPEED_AVAILABLE: + import deepspeed + + +# TODO(lite): Links in the docstrings to PL-specific deepspeed user docs need to be replaced. +class DeepSpeedStrategy(DDPStrategy): + DEEPSPEED_ENV_VAR = "PL_DEEPSPEED_CONFIG_PATH" + + def __init__( + self, + accelerator: Optional[Accelerator] = None, + zero_optimization: bool = True, + stage: int = 2, + remote_device: str = "cpu", + offload_optimizer: bool = False, + offload_parameters: bool = False, + offload_params_device: str = "cpu", + nvme_path: str = "/local_nvme", + params_buffer_count: int = 5, + params_buffer_size: int = 100_000_000, + max_in_cpu: int = 1_000_000_000, + offload_optimizer_device: str = "cpu", + optimizer_buffer_count: int = 4, + block_size: int = 1048576, + queue_depth: int = 8, + single_submit: bool = False, + overlap_events: bool = True, + thread_count: int = 1, + pin_memory: bool = False, + sub_group_size: int = 1_000_000_000_000, + contiguous_gradients: bool = True, + overlap_comm: bool = True, + allgather_partitions: bool = True, + reduce_scatter: bool = True, + allgather_bucket_size: int = 200_000_000, + reduce_bucket_size: int = 200_000_000, + zero_allow_untested_optimizer: bool = True, + logging_batch_size_per_gpu: Union[str, int] = "auto", + config: Optional[Union[_PATH, Dict[str, Any]]] = None, + logging_level: int = logging.WARN, + parallel_devices: Optional[List[torch.device]] = None, + cluster_environment: Optional[ClusterEnvironment] = None, + loss_scale: float = 0, + initial_scale_power: int = 16, + loss_scale_window: int = 1000, + hysteresis: int = 2, + min_loss_scale: int = 1, + partition_activations: bool = False, + cpu_checkpointing: bool = False, + contiguous_memory_optimization: bool = False, + synchronize_checkpoint_boundary: bool = False, + load_full_weights: bool = False, + precision_plugin: Optional[Precision] = None, + process_group_backend: Optional[str] = None, + ) -> None: + """Provides capabilities to run training using the DeepSpeed library, with training optimizations for large + billion parameter models. `For more information: https://pytorch- + lightning.readthedocs.io/en/stable/advanced/model_parallel.html#deepspeed`. + + .. warning:: ``DeepSpeedStrategy`` is in beta and subject to change. 
+ + Defaults have been set to enable ZeRO-Offload and some have been taken from the link below. + These defaults have been set generally, but may require tuning for optimum performance based on your model size. + `For more information: https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training`. + + Arguments: + + zero_optimization: Enable ZeRO optimization. This is compatible with either ``precision=16`` or + ``precision="bf16"``. + + stage: Different stages of the ZeRO Optimizer. 0 is disabled, + 1 is optimizer state partitioning, 2 is optimizer+gradient state partitioning, + 3 is optimizer+gradient_parameter partitioning using the infinity engine. + + remote_device: Device to instantiate the model on initially (``cpu`` or ``nvme``). + + offload_optimizer: Enable offloading optimizer memory and computation to CPU or NVMe + based on ``offload_optimizer_device``. + + offload_parameters: When using ZeRO Stage 3, Enable offloading parameter memory and computation + to CPU or NVMe based on ``offload_params_device``. + + offload_params_device: When offloading parameters choose the device to offload to, ``cpu`` or ``nvme``. + + offload_optimizer_device: When offloading optimizer state choose the device to offload to, + ``cpu`` or ``nvme``. + + params_buffer_count: Number of buffers in buffer pool for + parameter offloading when ``offload_params_device`` is ``nvme``. + + params_buffer_size: Size of buffers in buffer pool for parameter offloading + when ``offload_params_device`` is ``nvme``. + + max_in_cpu: Number of parameter elements to maintain in CPU memory when offloading to NVMe is enabled. + + nvme_path: Filesystem path for NVMe device for optimizer/parameter state offloading. + + optimizer_buffer_count: Number of buffers in buffer pool for optimizer state offloading + when ``offload_optimizer_device`` is set to to ``nvme``. + This should be at least the number of states maintained per parameter by the optimizer. + For example, Adam optimizer has 4 states (parameter, gradient, momentum, and variance). + + block_size: When using NVMe Offloading, the I/O block size in bytes. + + queue_depth: When using NVMe Offloading, the I/O queue depth. + + single_submit: When using NVMe Offloading, + submit requests to storage device as multiple individual requests, + as opposed to one block of requests. + + overlap_events: When using NVMe Offloading, + submit requests to storage device in an overlapped fashion + without waiting for completion of earlier requests. + + thread_count: When using NVMe Offloading, + Intra-request parallelism for each read/write submitted by a user thread. + + pin_memory: When using ZeRO stage 3, pin optimizer state memory on CPU. + This could boost throughput at the cost of extra memory overhead. + + sub_group_size: When using ZeRO stage 3, defines the number of parameters + within a sub group to offload at a time. + Smaller numbers require more communication, but improve memory efficiency. + + contiguous_gradients: Copies gradients to a continuous buffer as they are produced. + Avoids memory fragmentation during backwards. Useful when training large models. + + overlap_comm: Overlap the reduction (synchronization) of gradients with the backwards computation. + This is a speed optimization when training across multiple GPUs/machines. + + allgather_partitions: All gather updated parameters at the end of training step, + instead of using a series of broadcast collectives. + + reduce_scatter: Use reduce/scatter instead of allreduce to average gradients. 
+ + allgather_bucket_size: Number of elements to allgather at once. + Used to limit the memory required for larger model sizes, with a tradeoff with speed. + + reduce_bucket_size: Number of elements to reduce at once. + Used to limit the memory required for larger model sizes, with a tradeoff with speed. + + zero_allow_untested_optimizer: Allow untested optimizers to be used with ZeRO. Currently only Adam is a + DeepSpeed supported optimizer when using ZeRO. + + logging_batch_size_per_gpu: Config used in DeepSpeed to calculate verbose timing for logging + on a per sample per second basis (only displayed if logging=logging.INFO). + If set to "auto", the plugin tries to infer this from + the train DataLoader's BatchSampler, else defaults to 1. + To obtain accurate logs when using datasets that do not support batch samplers, + set this to the actual per gpu batch size. + + config: Pass in a deepspeed formatted config dict, + or path to a deepspeed config: https://www.deepspeed.ai/docs/config-json. + All defaults will be ignored if a config is passed in. + + logging_level: Set logging level for deepspeed. + + loss_scale: Loss scaling value for FP16 training. + 0.0 results in dynamic loss scaling, otherwise static. + + initial_scale_power: Power of the initial dynamic loss scale value. Loss scale is computed + by ``2^initial_scale_power``. + + loss_scale_window: Window in which to raise/lower the dynamic FP16 loss scaling value. + + hysteresis: FP16 Delay shift in Dynamic Loss scaling. + + min_loss_scale: The minimum FP16 dynamic loss scaling value. + + partition_activations: Enables partition activation when used with ZeRO stage 3 and model parallelism. + Still requires you to wrap your forward functions in deepspeed.checkpointing.checkpoint. + See `deepspeed tutorial + `_. + + cpu_checkpointing: Offloads partitioned activations to CPU if ``partition_activations`` is enabled. + + contiguous_memory_optimization: Copies partitioned activations so that they are contiguous in memory. + Not supported by all models. + + synchronize_checkpoint_boundary: Insert :func:`torch.cuda.synchronize` at each checkpoint boundary. + + load_full_weights: True when loading a single checkpoint file containing the model state dict + when using ZeRO Stage 3. This differs from the DeepSpeed checkpoint which contains shards + per worker. + """ + if not _DEEPSPEED_AVAILABLE: + raise ImportError( + "To use the `DeepSpeedStrategy`, you must have DeepSpeed installed." + " Install it by running `pip install -U deepspeed`." 
+ ) + + super().__init__( + accelerator=accelerator, + parallel_devices=parallel_devices, + cluster_environment=cluster_environment, + precision_plugin=precision_plugin, + process_group_backend=process_group_backend, + ) + + self.config = self._load_config(config) + if self.config is None: + # User has not overridden config, set defaults + self.config = self._create_default_config( + zero_optimization, + zero_allow_untested_optimizer, + logging_batch_size_per_gpu, + offload_optimizer=offload_optimizer, + offload_parameters=offload_parameters, + nvme_path=nvme_path, + offload_params_device=offload_params_device, + params_buffer_count=params_buffer_count, + params_buffer_size=params_buffer_size, + max_in_cpu=max_in_cpu, + pin_memory=pin_memory, + offload_optimizer_device=offload_optimizer_device, + optimizer_buffer_count=optimizer_buffer_count, + block_size=block_size, + queue_depth=queue_depth, + single_submit=single_submit, + overlap_events=overlap_events, + thread_count=thread_count, + partition_activations=partition_activations, + cpu_checkpointing=cpu_checkpointing, + contiguous_memory_optimization=contiguous_memory_optimization, + synchronize_checkpoint_boundary=synchronize_checkpoint_boundary, + stage=stage, + contiguous_gradients=contiguous_gradients, + overlap_comm=overlap_comm, + allgather_partitions=allgather_partitions, + reduce_scatter=reduce_scatter, + allgather_bucket_size=allgather_bucket_size, + reduce_bucket_size=reduce_bucket_size, + sub_group_size=sub_group_size, + ) + self._config_initialized = False + deepspeed.utils.logging.logger.setLevel(logging_level) + + self.remote_device = remote_device + self.load_full_weights = load_full_weights + + # default FP16 parameters. + self.loss_scale = loss_scale + self.initial_scale_power = initial_scale_power + self.loss_scale_window = loss_scale_window + self.hysteresis = hysteresis + self.min_loss_scale = min_loss_scale + + self._deepspeed_engine: Optional["deepspeed.DeepSpeedEngine"] = None + + @property + def zero_stage_3(self) -> bool: + assert isinstance(self.config, dict) + zero_optimization = self.config.get("zero_optimization") + return zero_optimization is not None and zero_optimization.get("stage") == 3 + + @property + def distributed_sampler_kwargs(self) -> Dict[str, int]: + distributed_sampler_kwargs = dict(num_replicas=self.world_size, rank=self.global_rank) + return distributed_sampler_kwargs + + @property + def model(self) -> "deepspeed.DeepSpeedEngine": + return self._deepspeed_engine + + def setup_module_and_optimizers( + self, model: Module, optimizers: List[Optimizer] + ) -> Tuple["deepspeed.DeepSpeedEngine", List[Optimizer]]: + """Setup a model and multiple optimizers together. + + Currently only a single optimizer is supported. + + Return: + The model wrapped into a :class:`deepspeed.DeepSpeedEngine` and a list with a single + deepspeed optimizer. + """ + if len(optimizers) != 1: + raise ValueError( + f"Currently only one optimizer is supported with DeepSpeed." + f" Got {len(optimizers)} optimizers instead." 
+ ) + + # train_micro_batch_size_per_gpu is used for throughput logging purposes + # normally we set this to the batch size, but it is not available here unless the user provides it + # as part of the config + assert self.config is not None + self.config.setdefault("train_micro_batch_size_per_gpu", 1) + self._deepspeed_engine, optimizer = self._setup_module_and_optimizer(model, optimizers[0]) + self._set_deepspeed_activation_checkpointing() + return self._deepspeed_engine, [optimizer] + + @contextlib.contextmanager + def module_sharded_context(self) -> Generator[None, None, None]: + if self.zero_stage_3: + assert self._config_initialized + + if self.precision_plugin.precision == PrecisionType.HALF: + dtype = torch.float16 + elif self.precision_plugin.precision == PrecisionType.BFLOAT: + dtype = torch.bfloat16 + else: + dtype = torch.float32 + + model_parallel_context = deepspeed.zero.Init( + remote_device=self.remote_device, pin_memory=True, config_dict_or_path=self.config, dtype=dtype + ) + else: + model_parallel_context = super().module_sharded_context() + + with model_parallel_context: + yield + + def save_checkpoint(self, checkpoint: Dict, filepath: _PATH, storage_options: Optional[Any] = None) -> None: + raise NotImplementedError + + def load_checkpoint(self, checkpoint_path: _PATH) -> Dict[str, Any]: + raise NotImplementedError + + def load_optimizer_state_dict( + self, optimizers: Union[Optimizer, Iterable[Optimizer]], checkpoint: Mapping[str, Any] + ) -> None: + # override to do nothing, deepspeed engine already loaded the states in `load_checkpoint()` + pass + + def load_module_state_dict(self, module: Module, checkpoint: Mapping[str, Any]) -> None: + # override to do nothing, deepspeed engine already loaded the weights in `load_checkpoint()` + if self.load_full_weights and self.zero_stage_3: + self.module_to_device(module) + self._restore_zero_state(module, checkpoint) + + def batch_to_device(self, batch: Any, device: Optional[torch.device] = None) -> Any: + batch = apply_to_collection(batch, Tensor, function=_fp_to_half, precision=self.precision_plugin.precision) + return super().batch_to_device(batch, device) + + @classmethod + def register_strategies(cls, strategy_registry: Dict) -> None: + strategy_registry.register("deepspeed", cls, description="Default DeepSpeed Strategy") + strategy_registry.register("deepspeed_stage_1", cls, description="DeepSpeed with ZeRO Stage 1 enabled", stage=1) + strategy_registry.register("deepspeed_stage_2", cls, description="DeepSpeed with ZeRO Stage 2 enabled", stage=2) + strategy_registry.register( + "deepspeed_stage_2_offload", + cls, + description="DeepSpeed ZeRO Stage 2 and CPU Offload", + stage=2, + offload_optimizer=True, + ) + strategy_registry.register("deepspeed_stage_3", cls, description="DeepSpeed ZeRO Stage 3", stage=3) + strategy_registry.register( + "deepspeed_stage_3_offload", + cls, + description="DeepSpeed ZeRO Stage 3 and CPU Offload", + stage=3, + offload_optimizer=True, + offload_parameters=True, + ) + strategy_registry.register( + "deepspeed_stage_3_offload_nvme", + cls, + description="DeepSpeed ZeRO Stage 3 and NVMe Offload", + stage=3, + offload_optimizer=True, + offload_parameters=True, + remote_device="nvme", + offload_params_device="nvme", + offload_optimizer_device="nvme", + ) + + def _setup_module_and_optimizer( + self, + model: Module, + optimizer: Optional[Optimizer], + lr_scheduler: Optional[Union[_LRScheduler, ReduceLROnPlateau]] = None, + ) -> Tuple["deepspeed.DeepSpeedEngine", Optimizer]: + """Initialize one 
model and one optimizer with an optional learning rate scheduler. + + This calls :func:`deepspeed.initialize` internally. + """ + model_parameters = filter(lambda p: p.requires_grad, model.parameters()) + deepspeed_engine, deepspeed_optimizer, _, _ = deepspeed.initialize( + args=argparse.Namespace(device_rank=self.root_device.index), + config=self.config, + model=model, + model_parameters=model_parameters, + optimizer=optimizer, + lr_scheduler=lr_scheduler, + dist_init_required=False, + ) + return deepspeed_engine, deepspeed_optimizer + + def _setup_distributed(self) -> None: + reset_seed() + self._set_world_ranks() + rank_zero_only.rank = self.global_rank + self._init_deepspeed_distributed() + if not self._config_initialized: + self._format_config() + self._config_initialized = True + + def _init_deepspeed_distributed(self) -> None: + assert self.cluster_environment is not None + if platform.system() != "Windows": + # do not set env variables on windows, allow deepspeed to control setup + self._set_node_environment_variables() + log.info( + "initializing deepspeed distributed: " + f"GLOBAL_RANK: {self.global_rank}, " + f"MEMBER: {self.global_rank + 1}/{self.world_size}" + ) + self._process_group_backend = self._get_process_group_backend() + deepspeed.init_distributed(self._process_group_backend, distributed_port=self.cluster_environment.main_port) + + def _get_process_group_backend(self) -> str: + return self._process_group_backend or get_default_process_group_backend_for_device(self.root_device) + + def _set_node_environment_variables(self) -> None: + assert self.cluster_environment is not None + os.environ["MASTER_ADDR"] = self.cluster_environment.main_address + os.environ["MASTER_PORT"] = str(self.cluster_environment.main_port) + os.environ["RANK"] = str(self.global_rank) + os.environ["WORLD_SIZE"] = str(self.world_size) + os.environ["LOCAL_RANK"] = str(self.local_rank) + + def _set_deepspeed_activation_checkpointing(self) -> None: + assert isinstance(self.config, dict) + if self.config.get("activation_checkpointing"): + checkpoint_config = self.config["activation_checkpointing"] + deepspeed.checkpointing.configure( + mpu_=None, + partition_activations=checkpoint_config.get("partition_activations"), + contiguous_checkpointing=checkpoint_config.get("contiguous_memory_optimization"), + checkpoint_in_cpu=checkpoint_config.get("cpu_checkpointing"), + profile=checkpoint_config.get("profile"), + ) + + def _format_config(self) -> None: + if self.config is None: + raise ValueError( + "To use DeepSpeed you must pass in a DeepSpeed config dict, or a path to a JSON config." 
+ " See: https://pytorch-lightning.readthedocs.io/en/stable/advanced/model_parallel.html#deepspeed" + ) + self._format_precision_config() + + def _format_precision_config(self) -> None: + assert isinstance(self.config, dict) + if self.precision_plugin.precision == PrecisionType.HALF: + if "fp16" not in self.config and self.precision_plugin.amp_type == AMPType.NATIVE: + # FP16 is a DeepSpeed standalone AMP implementation + rank_zero_info("Enabling DeepSpeed FP16.") + self.config["fp16"] = { + "enabled": True, + "loss_scale": self.loss_scale, + "initial_scale_power": self.initial_scale_power, + "loss_scale_window": self.loss_scale_window, + "hysteresis": self.hysteresis, + "min_loss_scale": self.min_loss_scale, + } + elif "amp" not in self.config and self.precision_plugin.amp_type == AMPType.APEX: + rank_zero_info("Enabling DeepSpeed APEX Implementation.") + self.config["amp"] = {"enabled": True, "opt_level": self.precision_plugin.amp_level} + elif "bf16" not in self.config and self.precision_plugin.precision == PrecisionType.BFLOAT: + rank_zero_info("Enabling DeepSpeed BF16.") + self.config["bf16"] = {"enabled": True} + + def _create_default_config( + self, + zero_optimization: bool, + zero_allow_untested_optimizer: bool, + logging_batch_size_per_gpu: Union[str, int], + partition_activations: bool, + cpu_checkpointing: bool, + contiguous_memory_optimization: bool, + synchronize_checkpoint_boundary: bool, + offload_optimizer: bool, + offload_parameters: bool, + nvme_path: str, + offload_params_device: str, + params_buffer_count: int, + params_buffer_size: int, + max_in_cpu: int, + offload_optimizer_device: str, + optimizer_buffer_count: int, + pin_memory: bool, + block_size: int, + queue_depth: int, + single_submit: bool, + overlap_events: bool, + thread_count: int, + **zero_kwargs: Any, + ) -> Dict: + cfg = { + "activation_checkpointing": { + "partition_activations": partition_activations, + "cpu_checkpointing": cpu_checkpointing, + "contiguous_memory_optimization": contiguous_memory_optimization, + "synchronize_checkpoint_boundary": synchronize_checkpoint_boundary, + }, + "aio": { + "block_size": block_size, + "queue_depth": queue_depth, + "single_submit": single_submit, + "overlap_events": overlap_events, + "thread_count": thread_count, + }, + } + if zero_optimization: + zero_config = zero_kwargs + + if offload_optimizer: + zero_config["offload_optimizer"] = { + "device": offload_optimizer_device, + "nvme_path": nvme_path, + "buffer_count": optimizer_buffer_count, + "pin_memory": pin_memory, + } + if offload_parameters: + zero_config["offload_param"] = { + "device": offload_params_device, + "nvme_path": nvme_path, + "buffer_count": params_buffer_count, + "buffer_size": params_buffer_size, + "max_in_cpu": max_in_cpu, + "pin_memory": pin_memory, + } + cfg = { + "zero_allow_untested_optimizer": zero_allow_untested_optimizer, + "zero_optimization": zero_config, + **cfg, + } + if logging_batch_size_per_gpu != "auto": + cfg = {"train_micro_batch_size_per_gpu": logging_batch_size_per_gpu, **cfg} + return cfg + + def _restore_zero_state(self, module: Module, ckpt: Mapping[str, Any]) -> None: + """Overrides the normal load_state_dict behaviour in PyTorch to ensure we gather parameters that may be + sharded across processes before loading the state dictionary when using ZeRO stage 3. This is then + automatically synced across processes. + + Args: + ckpt: The ckpt file. 
+ """ + + def load(module: torch.nn.Module, prefix: str = "") -> None: + + missing_keys: List[str] = [] + unexpected_keys: List[str] = [] + error_msgs: List[str] = [] + state_dict = ckpt["state_dict"] + + # copy state_dict so _load_from_state_dict can modify it + metadata = getattr(state_dict, "_metadata", None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) + # because zero3 puts placeholders in model params, this context + # manager gathers (unpartitions) the params of the current layer, then loads from + # the state dict and then re-partitions them again + with deepspeed.zero.GatheredParameters(list(module.parameters(recurse=False)), modifier_rank=0): + if self.is_global_zero: + module._load_from_state_dict( + state_dict=state_dict, + prefix=prefix, + local_metadata=local_metadata, + strict=True, + missing_keys=missing_keys, + unexpected_keys=unexpected_keys, + error_msgs=error_msgs, + ) + + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + ".") + + load(module, prefix="") + + def _load_config(self, config: Optional[Union[_PATH, Dict[str, Any]]]) -> Optional[Dict[str, Any]]: + if config is None and self.DEEPSPEED_ENV_VAR in os.environ: + rank_zero_info(f"Loading DeepSpeed config from set {self.DEEPSPEED_ENV_VAR} environment variable") + config = os.environ[self.DEEPSPEED_ENV_VAR] + if isinstance(config, (str, Path)): + if not os.path.isfile(config): + raise FileNotFoundError( + f"You passed in a path to a DeepSpeed config but the path does not exist: {config}" + ) + with open(config) as f: + config = json.load(f) + assert isinstance(config, dict) or config is None + return config diff --git a/src/lightning_lite/strategies/fairscale.py b/src/lightning_lite/strategies/fairscale.py new file mode 100644 index 0000000000000..b2c630a4dbd44 --- /dev/null +++ b/src/lightning_lite/strategies/fairscale.py @@ -0,0 +1,122 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
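Referring back to the DeepSpeed strategy above: when no ``config`` is passed, ``_create_default_config`` assembles one from the constructor arguments, and the result can be inspected directly. A minimal sketch, assuming ``deepspeed`` is installed (the constructor raises ``ImportError`` otherwise):

.. code-block:: python

    from lightning_lite.strategies import DeepSpeedStrategy

    strategy = DeepSpeedStrategy(stage=3, offload_optimizer=True, offload_parameters=True)
    zero = strategy.config["zero_optimization"]
    assert zero["stage"] == 3 and strategy.zero_stage_3
    assert zero["offload_optimizer"]["device"] == "cpu"  # default offload_optimizer_device
    assert zero["offload_param"]["device"] == "cpu"  # default offload_params_device
    assert "activation_checkpointing" in strategy.config and "aio" in strategy.config
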
+from contextlib import contextmanager +from datetime import timedelta +from typing import Any, Dict, Generator, List, Optional, Tuple + +import torch +from lightning_utilities.core.imports import module_available +from torch.distributed.constants import default_pg_timeout +from torch.nn import Module +from torch.optim import Optimizer + +from lightning_lite.accelerators import Accelerator +from lightning_lite.plugins import CheckpointIO, ClusterEnvironment, Precision +from lightning_lite.strategies.ddp import DDPStrategy +from lightning_lite.utilities.enums import PrecisionType +from lightning_lite.utilities.imports import _IS_WINDOWS + +_FAIRSCALE_AVAILABLE = not _IS_WINDOWS and module_available("fairscale.nn") + +if _FAIRSCALE_AVAILABLE: + from fairscale.nn.data_parallel.sharded_ddp import ShardedDataParallel + from fairscale.optim import OSS +else: + OSS = ShardedDataParallel = object + + +class DDPShardedStrategy(DDPStrategy): + """Optimizer and gradient sharded training provided by FairScale.""" + + _REDUCE_BUFFER_SIZE_DEFAULT: int = 2**23 # 8M + + def __init__( + self, + accelerator: Optional[Accelerator] = None, + parallel_devices: Optional[List[torch.device]] = None, + cluster_environment: Optional[ClusterEnvironment] = None, + checkpoint_io: Optional[CheckpointIO] = None, + precision_plugin: Optional[Precision] = None, + process_group_backend: Optional[str] = None, + timeout: Optional[timedelta] = default_pg_timeout, + **kwargs: Any, + ) -> None: + super().__init__( + accelerator=accelerator, + parallel_devices=parallel_devices, + cluster_environment=cluster_environment, + checkpoint_io=checkpoint_io, + precision_plugin=precision_plugin, + process_group_backen=process_group_backend, + timeout=timeout, + **kwargs, + ) + super().__init__() + if "reduce_buffer_size" not in self._ddp_kwargs: + # For multi-node training, enabling bucketing will improve performance. + self._ddp_kwargs["reduce_buffer_size"] = self._REDUCE_BUFFER_SIZE_DEFAULT if self.num_nodes > 1 else 0 + + def setup_module_and_optimizers( + self, module: Module, optimizers: List[Optimizer] + ) -> Tuple[Module, List[Optimizer]]: + """Wraps the model and optimizers with fairscale components. + + Return: + The model wrapped into a :class:`~fairscale.nn.data_parallel.ShardedDataParallel` module + and a list of optimizer wrapped in :class:~`fairscale.optim.OSS`. + """ + optimizers = self._reinit_optimizers_with_oss(optimizers) + model = ShardedDataParallel(module, sharded_optimizer=optimizers, **self._ddp_kwargs) + return model, optimizers + + @contextmanager + def block_backward_sync(self, module: Module) -> Generator: + """Blocks syncing gradients behaviour on backwards pass. 
+ + This is useful for skipping sync when accumulating gradients, reducing communication overhead + Returns: context manager with sync behaviour off + """ + if isinstance(module, ShardedDataParallel): + with module.no_sync(): + yield None + else: + yield None + + @classmethod + def register_strategies(cls, strategy_registry: Dict) -> None: + strategy_registry.register( + "ddp_sharded_find_unused_parameters_false", + cls, + description="DDP Sharded Strategy with `find_unused_parameters` as False", + find_unused_parameters=False, + ) + strategy_registry.register( + "ddp_sharded", + cls, + description=cls.__class__.__name__, + ) + + def _reinit_optimizers_with_oss(self, optimizers: List[Optimizer]) -> List["OSS"]: + for x, optimizer in enumerate(optimizers): + if not isinstance(optimizer, OSS): + optim_class = type(optimizer) + zero_optimizer = OSS(params=optimizer.param_groups, optim=optim_class, **optimizer.defaults) + is_fp16 = self.precision_plugin.precision in (PrecisionType.MIXED, PrecisionType.HALF) + # For multi-node training, compressing the model shards in fp16 before broadcasting + # improves performance. When using PyTorch AMP, it will not degrade + # the model performance. + zero_optimizer.broadcast_fp16 = is_fp16 and self.num_nodes > 1 + optimizers[x] = zero_optimizer + del optimizer + return optimizers diff --git a/src/lightning_lite/strategies/launchers/base.py b/src/lightning_lite/strategies/launchers/base.py index 2acf54afef245..f2d02973a203e 100644 --- a/src/lightning_lite/strategies/launchers/base.py +++ b/src/lightning_lite/strategies/launchers/base.py @@ -20,7 +20,7 @@ class _Launcher(ABC): Abstract base class for all Launchers. Launchers are responsible for the creation and instrumentation of new processes so that the - :class:`~pytorch_lightning.strategies.base.Strategy` can set up communication between all them. + :class:`~lightning_lite.strategies.strategy.Strategy` can set up communication between all them. Subclass this class and override any of the relevant methods to provide a custom implementation depending on cluster environment, hardware, strategy, etc. 
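The ``register_strategies`` hook above makes the sharded variant selectable by name; a small sketch of querying the registry (registration itself should not require fairscale to be installed, since the imports are guarded):

.. code-block:: python

    from lightning_lite.strategies import STRATEGY_REGISTRY

    available = STRATEGY_REGISTRY.available_strategies()
    assert "ddp_sharded" in available
    assert "ddp_sharded_find_unused_parameters_false" in available
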
diff --git a/src/lightning_lite/utilities/imports.py b/src/lightning_lite/utilities/imports.py index 70d8549368ea5..aa9a1fed3726b 100644 --- a/src/lightning_lite/utilities/imports.py +++ b/src/lightning_lite/utilities/imports.py @@ -37,9 +37,6 @@ _POPTORCH_AVAILABLE = package_available("poptorch") _XLA_AVAILABLE: bool = package_available("torch_xla") -# TODO(lite): import this from the fairscale files once they move to lite package -_FAIRSCALE_AVAILABLE = not _IS_WINDOWS and module_available("fairscale.nn") - from lightning_lite.utilities.xla_device import XLADeviceUtils # noqa: E402 diff --git a/src/pytorch_lightning/strategies/deepspeed.py b/src/pytorch_lightning/strategies/deepspeed.py index ad8d4da106ec8..4116448ab95fc 100644 --- a/src/pytorch_lightning/strategies/deepspeed.py +++ b/src/pytorch_lightning/strategies/deepspeed.py @@ -31,6 +31,7 @@ import pytorch_lightning as pl from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment +from lightning_lite.plugins.precision.utils import _fp_to_half from lightning_lite.utilities.distributed import ( _get_process_group_backend_from_env, get_default_process_group_backend_for_device, @@ -45,7 +46,6 @@ from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.ddp import DDPStrategy -from pytorch_lightning.strategies.utils import _fp_to_half from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities import GradClipAlgorithmType from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/src/pytorch_lightning/strategies/ipu.py b/src/pytorch_lightning/strategies/ipu.py index 966789a07feaa..dffe47be0e2e1 100644 --- a/src/pytorch_lightning/strategies/ipu.py +++ b/src/pytorch_lightning/strategies/ipu.py @@ -23,13 +23,13 @@ import pytorch_lightning as pl from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO +from lightning_lite.plugins.precision.utils import _fp_to_half from lightning_lite.utilities.cloud_io import get_filesystem from lightning_lite.utilities.enums import PrecisionType from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.parallel import ParallelStrategy from pytorch_lightning.strategies.strategy import TBroadcast -from pytorch_lightning.strategies.utils import _fp_to_half from pytorch_lightning.trainer.states import RunningStage, TrainerFn from pytorch_lightning.utilities import _IPU_AVAILABLE, _POPTORCH_AVAILABLE, rank_zero_warn from pytorch_lightning.utilities.data import _get_dataloader_init_args_and_kwargs, _reinstantiate_wrapped_cls diff --git a/src/pytorch_lightning/strategies/utils.py b/src/pytorch_lightning/strategies/utils.py index 6a8f8ae19f4e4..fa360d3770cd7 100644 --- a/src/pytorch_lightning/strategies/utils.py +++ b/src/pytorch_lightning/strategies/utils.py @@ -15,10 +15,7 @@ import os from inspect import getmembers, isclass -import torch - from lightning_lite.strategies import _StrategyRegistry -from lightning_lite.utilities.enums import PrecisionType from lightning_lite.utilities.registry import _is_register_method_overridden from pytorch_lightning.strategies.strategy import Strategy from pytorch_lightning.utilities.rank_zero 
import rank_zero_deprecation @@ -31,16 +28,6 @@ def on_colab_kaggle() -> bool: return bool(os.getenv("COLAB_GPU") or os.getenv("KAGGLE_URL_BASE")) -def _fp_to_half(tensor: torch.Tensor, precision: PrecisionType) -> torch.Tensor: - if torch.is_floating_point(tensor): - if precision == PrecisionType.HALF: - return tensor.half() - if precision == PrecisionType.BFLOAT: - return tensor.bfloat16() - - return tensor - - def _call_register_strategies(registry: _StrategyRegistry, base_module: str) -> None: # TODO(lite): Remove this function once PL strategies inherit from Lite's Strategy base class module = importlib.import_module(base_module) diff --git a/tests/tests_lite/helpers/runif.py b/tests/tests_lite/helpers/runif.py index 00a31d0f48b4e..e996aec4d315b 100644 --- a/tests/tests_lite/helpers/runif.py +++ b/tests/tests_lite/helpers/runif.py @@ -21,7 +21,9 @@ from pkg_resources import get_distribution from lightning_lite.accelerators.mps import MPSAccelerator -from lightning_lite.utilities.imports import _FAIRSCALE_AVAILABLE, _TPU_AVAILABLE +from lightning_lite.strategies.deepspeed import _DEEPSPEED_AVAILABLE +from lightning_lite.strategies.fairscale import _FAIRSCALE_AVAILABLE +from lightning_lite.utilities.imports import _TPU_AVAILABLE class RunIf: @@ -45,6 +47,7 @@ def __new__( skip_windows: bool = False, standalone: bool = False, fairscale: bool = False, + deepspeed: bool = False, **kwargs, ): """ @@ -61,6 +64,7 @@ def __new__( standalone: Mark the test as standalone, our CI will run it in a separate process. This requires that the ``PL_RUN_STANDALONE_TESTS=1`` environment variable is set. fairscale: Require that facebookresearch/fairscale is installed. + deepspeed: Require that microsoft/DeepSpeed is installed. **kwargs: Any :class:`pytest.mark.skipif` keyword arguments. """ conditions = [] @@ -120,6 +124,10 @@ def __new__( conditions.append(not _FAIRSCALE_AVAILABLE) reasons.append("Fairscale") + if deepspeed: + conditions.append(not _DEEPSPEED_AVAILABLE) + reasons.append("Deepspeed") + reasons = [rs for cond, rs in zip(conditions, reasons) if cond] return pytest.mark.skipif( *args, condition=any(conditions), reason=f"Requires: [{' + '.join(reasons)}]", **kwargs diff --git a/tests/tests_lite/strategies/test_ddp.py b/tests/tests_lite/strategies/test_ddp.py new file mode 100644 index 0000000000000..178f8529fb40c --- /dev/null +++ b/tests/tests_lite/strategies/test_ddp.py @@ -0,0 +1,42 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
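With the new ``deepspeed`` flag in ``RunIf``, a test can be skipped on machines without the package, mirroring the existing ``fairscale`` marker; a hypothetical example:

.. code-block:: python

    from tests_lite.helpers.runif import RunIf


    @RunIf(deepspeed=True)
    def test_something_that_needs_deepspeed():  # hypothetical test name
        ...
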
+import pytest +import torch + +from lightning_lite.strategies import DDPStrategy + + +@pytest.mark.parametrize( + ["process_group_backend", "device_str", "expected_process_group_backend"], + [ + pytest.param("foo", "cpu", "foo"), + pytest.param("foo", "cuda:0", "foo"), + pytest.param(None, "cuda:0", "nccl"), + pytest.param(None, "cpu", "gloo"), + ], +) +def test_ddp_process_group_backend(process_group_backend, device_str, expected_process_group_backend): + """Test settings for process group backend.""" + + class MockDDPStrategy(DDPStrategy): + def __init__(self, root_device, process_group_backend): + self._root_device = root_device + super().__init__(process_group_backend=process_group_backend) + + @property + def root_device(self): + return self._root_device + + strategy = MockDDPStrategy(process_group_backend=process_group_backend, root_device=torch.device(device_str)) + assert strategy._get_process_group_backend() == expected_process_group_backend diff --git a/tests/tests_lite/strategies/test_deepspeed.py b/tests/tests_lite/strategies/test_deepspeed.py new file mode 100644 index 0000000000000..1e3d7c30738c3 --- /dev/null +++ b/tests/tests_lite/strategies/test_deepspeed.py @@ -0,0 +1,76 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
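The parametrization above pins down the fallback: with ``process_group_backend=None`` the backend is derived from the root device, "nccl" for CUDA and "gloo" for CPU. A sketch of calling the helper directly:

.. code-block:: python

    import torch

    from lightning_lite.utilities.distributed import get_default_process_group_backend_for_device

    assert get_default_process_group_backend_for_device(torch.device("cpu")) == "gloo"
    assert get_default_process_group_backend_for_device(torch.device("cuda", 0)) == "nccl"
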
+import json +import os + +import pytest +from tests_lite.helpers.runif import RunIf + +from lightning_lite.strategies import DeepSpeedStrategy + + +@pytest.fixture +def deepspeed_config(): + return { + "optimizer": {"type": "SGD", "params": {"lr": 3e-5}}, + "scheduler": { + "type": "WarmupLR", + "params": {"last_batch_iteration": -1, "warmup_min_lr": 0, "warmup_max_lr": 3e-5, "warmup_num_steps": 100}, + }, + } + + +@RunIf(deepspeed=True) +def test_deepspeed_with_invalid_config_path(): + """Test to ensure if we pass an invalid config path we throw an exception.""" + + with pytest.raises( + FileNotFoundError, match="You passed in a path to a DeepSpeed config but the path does not exist" + ): + DeepSpeedStrategy(config="invalid_path.json") + + +@RunIf(deepspeed=True) +def test_deepspeed_with_env_path(tmpdir, monkeypatch, deepspeed_config): + """Test to ensure if we pass an env variable, we load the config from the path.""" + config_path = os.path.join(tmpdir, "temp.json") + with open(config_path, "w") as f: + f.write(json.dumps(deepspeed_config)) + monkeypatch.setenv("PL_DEEPSPEED_CONFIG_PATH", config_path) + strategy = DeepSpeedStrategy() + assert strategy.config == deepspeed_config + + +@RunIf(deepspeed=True) +def test_deepspeed_defaults(): + """Ensure that defaults are correctly set as a config for DeepSpeed if no arguments are passed.""" + strategy = DeepSpeedStrategy() + assert strategy.config is not None + assert isinstance(strategy.config["zero_optimization"], dict) + + +@RunIf(deepspeed=True) +def test_deepspeed_custom_activation_checkpointing_params(tmpdir): + """Ensure if we modify the activation checkpointing parameters, the deepspeed config contains these changes.""" + ds = DeepSpeedStrategy( + partition_activations=True, + cpu_checkpointing=True, + contiguous_memory_optimization=True, + synchronize_checkpoint_boundary=True, + ) + checkpoint_config = ds.config["activation_checkpointing"] + assert checkpoint_config["partition_activations"] + assert checkpoint_config["cpu_checkpointing"] + assert checkpoint_config["contiguous_memory_optimization"] + assert checkpoint_config["synchronize_checkpoint_boundary"] diff --git a/tests/tests_lite/strategies/test_fairscale.py b/tests/tests_lite/strategies/test_fairscale.py new file mode 100644 index 0000000000000..77ee5cb3d0f53 --- /dev/null +++ b/tests/tests_lite/strategies/test_fairscale.py @@ -0,0 +1,28 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
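The two config-loading paths exercised above can be condensed into one sketch, assuming ``deepspeed`` is installed and that writing ``ds_config.json`` to the working directory is acceptable:

.. code-block:: python

    import json
    import os

    from lightning_lite.strategies import DeepSpeedStrategy

    config = {"optimizer": {"type": "SGD", "params": {"lr": 3e-5}}}
    with open("ds_config.json", "w") as f:
        json.dump(config, f)

    # pass the path explicitly ...
    assert DeepSpeedStrategy(config="ds_config.json").config == config

    # ... or point the environment variable at it
    os.environ["PL_DEEPSPEED_CONFIG_PATH"] = "ds_config.json"
    assert DeepSpeedStrategy().config == config
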
+from unittest import mock + +from tests_lite.helpers.runif import RunIf + +from lightning_lite.strategies import DDPShardedStrategy +from lightning_lite.strategies.fairscale import ShardedDataParallel + + +@RunIf(fairscale=True) +def test_block_backward_sync(): + strategy = DDPShardedStrategy() + model = mock.MagicMock(spec=ShardedDataParallel) + with strategy.block_backward_sync(model): + pass + model.no_sync.assert_called_once() diff --git a/tests/tests_lite/strategies/test_registry.py b/tests/tests_lite/strategies/test_registry.py index 76a6bea00f249..627837b4524b7 100644 --- a/tests/tests_lite/strategies/test_registry.py +++ b/tests/tests_lite/strategies/test_registry.py @@ -42,6 +42,17 @@ def __init__(self, param1, param2): def test_available_strategies_in_registry(): assert set(STRATEGY_REGISTRY.available_strategies()) == { + "ddp_sharded_find_unused_parameters_false", + "ddp_sharded", + "ddp_find_unused_parameters_false", + "ddp", + "deepspeed", + "deepspeed_stage_1", + "deepspeed_stage_2", + "deepspeed_stage_2_offload", + "deepspeed_stage_3", + "deepspeed_stage_3_offload", + "deepspeed_stage_3_offload_nvme", "dp", "single_tpu", } diff --git a/tests/tests_lite/utilities/test_imports.py b/tests/tests_lite/utilities/test_imports.py index 3a8444ef728ff..0c20d2172c901 100644 --- a/tests/tests_lite/utilities/test_imports.py +++ b/tests/tests_lite/utilities/test_imports.py @@ -27,9 +27,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +from lightning_lite.strategies.deepspeed import _DEEPSPEED_AVAILABLE +from lightning_lite.strategies.fairscale import _FAIRSCALE_AVAILABLE from lightning_lite.utilities.imports import ( _APEX_AVAILABLE, - _FAIRSCALE_AVAILABLE, _HOROVOD_AVAILABLE, _OMEGACONF_AVAILABLE, _POPTORCH_AVAILABLE, @@ -44,13 +45,12 @@ def test_imports(): else: assert _APEX_AVAILABLE - # TODO(lite): re-enable these once deepspeed strategy has moved - # try: - # import deepspeed - # except ModuleNotFoundError: - # assert not _DEEPSPEED_AVAILABLE - # else: - # assert _DEEPSPEED_AVAILABLE + try: + import deepspeed # noqa + except ModuleNotFoundError: + assert not _DEEPSPEED_AVAILABLE + else: + assert _DEEPSPEED_AVAILABLE try: import fairscale.nn # noqa From 692f0f3d5d4f4d638f557eedf6625928655a73b7 Mon Sep 17 00:00:00 2001 From: Laverne Henderson Date: Thu, 15 Sep 2022 01:37:50 -0700 Subject: [PATCH 154/193] Resolve minor formatting issue (#14706) --- docs/source-app/workflows/build_rest_api/add_api.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source-app/workflows/build_rest_api/add_api.rst b/docs/source-app/workflows/build_rest_api/add_api.rst index 9538c7792f8d0..c5d427a30ba9b 100644 --- a/docs/source-app/workflows/build_rest_api/add_api.rst +++ b/docs/source-app/workflows/build_rest_api/add_api.rst @@ -28,7 +28,7 @@ Execute the following command in a terminal: .. 
code-block:: -lightning run app app.py + lightning run app app.py The following appears: From d3dcd688526c27744c751ca8b6802e35d781cc52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 15 Sep 2022 12:51:12 +0200 Subject: [PATCH 155/193] Standalone Lite: DDP Spawn Strategy Family (#14675) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí Co-authored-by: Jirka Borovec --- src/lightning_lite/strategies/__init__.py | 3 + src/lightning_lite/strategies/ddp_spawn.py | 214 ++++++++++++++++++ src/lightning_lite/strategies/fairscale.py | 97 +++++++- .../strategies/launchers/multiprocessing.py | 3 +- .../strategies/launchers/xla.py | 3 +- src/lightning_lite/strategies/xla.py | 204 +++++++++++++++++ src/lightning_lite/utilities/distributed.py | 2 +- tests/tests_lite/strategies/test_registry.py | 12 +- 8 files changed, 526 insertions(+), 12 deletions(-) create mode 100644 src/lightning_lite/strategies/ddp_spawn.py create mode 100644 src/lightning_lite/strategies/xla.py diff --git a/src/lightning_lite/strategies/__init__.py b/src/lightning_lite/strategies/__init__.py index 8ced098e3a8dd..f9cf74e30e4c0 100644 --- a/src/lightning_lite/strategies/__init__.py +++ b/src/lightning_lite/strategies/__init__.py @@ -12,14 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. from lightning_lite.strategies.ddp import DDPStrategy # noqa: F401 +from lightning_lite.strategies.ddp_spawn import DDPSpawnStrategy # noqa: F401 from lightning_lite.strategies.deepspeed import DeepSpeedStrategy # noqa: F401 from lightning_lite.strategies.dp import DataParallelStrategy # noqa: F401 from lightning_lite.strategies.fairscale import DDPShardedStrategy # noqa: F401 +from lightning_lite.strategies.fairscale import DDPSpawnShardedStrategy # noqa: F401 from lightning_lite.strategies.parallel import ParallelStrategy # noqa: F401 from lightning_lite.strategies.registry import _call_register_strategies, _StrategyRegistry from lightning_lite.strategies.single_device import SingleDeviceStrategy # noqa: F401 from lightning_lite.strategies.single_tpu import SingleTPUStrategy # noqa: F401 from lightning_lite.strategies.strategy import Strategy # noqa: F401 +from lightning_lite.strategies.xla import XLAStrategy # noqa: F401 STRATEGY_REGISTRY = _StrategyRegistry() _STRATEGIES_BASE_MODULE = "lightning_lite.strategies" diff --git a/src/lightning_lite/strategies/ddp_spawn.py b/src/lightning_lite/strategies/ddp_spawn.py new file mode 100644 index 0000000000000..3e8b48b2a6b43 --- /dev/null +++ b/src/lightning_lite/strategies/ddp_spawn.py @@ -0,0 +1,214 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
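The availability flags now live next to the strategies that need them, and each flag is truthy only when the corresponding package can be imported; a short sketch of the relocated imports:

.. code-block:: python

    from lightning_lite.strategies.deepspeed import _DEEPSPEED_AVAILABLE
    from lightning_lite.strategies.fairscale import _FAIRSCALE_AVAILABLE

    if _DEEPSPEED_AVAILABLE:
        import deepspeed  # noqa: F401
    if _FAIRSCALE_AVAILABLE:
        import fairscale.nn  # noqa: F401
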
+from datetime import timedelta +from typing import Any, Dict, List, Optional, Union + +import torch +import torch.distributed +from torch import Tensor +from torch.distributed.constants import default_pg_timeout +from torch.nn import Module +from torch.nn.parallel.distributed import DistributedDataParallel +from typing_extensions import Literal + +from lightning_lite.accelerators.accelerator import Accelerator +from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment +from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO +from lightning_lite.plugins.precision import Precision +from lightning_lite.strategies.launchers.multiprocessing import _MultiProcessingLauncher +from lightning_lite.strategies.parallel import ParallelStrategy +from lightning_lite.strategies.strategy import TBroadcast +from lightning_lite.utilities.distributed import distributed_available, get_default_process_group_backend_for_device +from lightning_lite.utilities.distributed import group as _group +from lightning_lite.utilities.distributed import init_dist_connection, ReduceOp, sync_ddp_if_available +from lightning_lite.utilities.rank_zero import rank_zero_only + +_DDP_FORK_ALIASES = ( + "ddp_fork", + "ddp_fork_find_unused_parameters_false", + "ddp_notebook", + "ddp_notebook_find_unused_parameters_false", +) + + +class DDPSpawnStrategy(ParallelStrategy): + """Spawns processes using the :func:`torch.multiprocessing.spawn` method and joins processes after training + finishes.""" + + def __init__( + self, + accelerator: Optional[Accelerator] = None, + parallel_devices: Optional[List[torch.device]] = None, + cluster_environment: Optional[ClusterEnvironment] = None, + checkpoint_io: Optional[CheckpointIO] = None, + precision_plugin: Optional[Precision] = None, + process_group_backend: Optional[str] = None, + timeout: Optional[timedelta] = default_pg_timeout, + start_method: Literal["spawn", "fork", "forkserver"] = "spawn", + **kwargs: Any, + ): + super().__init__( + accelerator=accelerator, + parallel_devices=parallel_devices, + cluster_environment=cluster_environment, + checkpoint_io=checkpoint_io, + precision_plugin=precision_plugin, + ) + self._num_nodes = 1 + self._process_group_backend: Optional[str] = process_group_backend + self._timeout: Optional[timedelta] = timeout + self._start_method = start_method + self._ddp_kwargs = kwargs + self._local_rank = 0 + + @property + def root_device(self) -> torch.device: + assert self.parallel_devices is not None + return self.parallel_devices[self.local_rank] + + @property + def num_nodes(self) -> int: + return self._num_nodes + + @num_nodes.setter + def num_nodes(self, num_nodes: int) -> None: + # note that world ranks is related to num_nodes, when resetting it, need to reset world ranks + self._num_nodes = num_nodes + + @property + def num_processes(self) -> int: + return len(self.parallel_devices) if self.parallel_devices is not None else 0 + + @property + def distributed_sampler_kwargs(self) -> Dict[str, int]: + distributed_sampler_kwargs = dict(num_replicas=(self.num_nodes * self.num_processes), rank=self.global_rank) + return distributed_sampler_kwargs + + @property + def process_group_backend(self) -> Optional[str]: + return self._process_group_backend + + @property + def local_rank(self) -> int: + return self._local_rank + + def _configure_launcher(self) -> None: + self._launcher = _MultiProcessingLauncher(self, start_method=self._start_method) + + def setup_environment(self) -> None: + self._setup_distributed() + 
super().setup_environment() + + def setup_module(self, module: Module) -> Module: + return DistributedDataParallel(module=module, device_ids=self._determine_ddp_device_ids(), **self._ddp_kwargs) + + def module_to_device(self, module: Module) -> None: + if self.root_device.type == "cuda": + # TODO(lite): This should be handled outside module_to_device, by a call to accelerator.setup_device() + # set the device on the spawned subprocesses + torch.cuda.set_device(self.root_device) + module.to(self.root_device) + + def reduce( + self, tensor: Tensor, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = "mean" + ) -> Tensor: + """Reduces a tensor from several distributed processes to one aggregated tensor. + + Args: + tensor: the tensor to sync and reduce + group: the process group to gather results from. Defaults to all processes (world) + reduce_op: the reduction operation. Defaults to 'mean'/'avg'. + Can also be a string 'sum' to calculate the sum during reduction. + + Return: + reduced value, except when the input was not a tensor the output remains is unchanged + """ + if isinstance(tensor, Tensor): + tensor = sync_ddp_if_available(tensor, group, reduce_op=reduce_op) + return tensor + + def barrier(self, *args: Any, **kwargs: Any) -> None: + if not distributed_available(): + return + if torch.distributed.get_backend() == "nccl": + torch.distributed.barrier(device_ids=self._determine_ddp_device_ids()) + else: + torch.distributed.barrier() + + def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast: + if not distributed_available(): + return obj + obj = [obj] + if self.global_rank != src: + obj = [None] # type: ignore[list-item] + torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD) + return obj[0] + + @classmethod + def register_strategies(cls, strategy_registry: Dict) -> None: + entries = ( + ("ddp_spawn", "spawn"), + ("ddp_fork", "fork"), + ("ddp_notebook", "fork"), + ) + for name, start_method in entries: + strategy_registry.register( + name, + cls, + description=f"DDP strategy with `start_method` '{start_method}'", + start_method=start_method, + ) + + entries = ( + ("ddp_spawn_find_unused_parameters_false", "spawn"), + ("ddp_fork_find_unused_parameters_false", "fork"), + ("ddp_notebook_find_unused_parameters_false", "fork"), + ) + for name, start_method in entries: + strategy_registry.register( + name, + cls, + description=f"DDP strategy with `find_unused_parameters` as False and `start_method` '{start_method}'", + find_unused_parameters=False, + start_method=start_method, + ) + + def _setup_distributed(self) -> None: + self._set_world_ranks() + rank_zero_only.rank = self.global_rank + self._process_group_backend = self._get_process_group_backend() + assert self.cluster_environment is not None + init_dist_connection( + self.cluster_environment, + self._process_group_backend, + self.global_rank, + self.world_size, + timeout=self._timeout, + ) + + def _get_process_group_backend(self) -> str: + return self._process_group_backend or get_default_process_group_backend_for_device(self.root_device) + + def _set_world_ranks(self, process_idx: int = 0) -> None: + self._local_rank = process_idx + if self.cluster_environment is None: + return + self.cluster_environment.set_global_rank(self.node_rank * self.num_processes + self.local_rank) + self.cluster_environment.set_world_size(self.num_nodes * self.num_processes) + rank_zero_only.rank = self.cluster_environment.global_rank() + + def _determine_ddp_device_ids(self) -> Optional[List[int]]: + if 
self.root_device.type == "cpu": + return None + return [self.root_device.index] diff --git a/src/lightning_lite/strategies/fairscale.py b/src/lightning_lite/strategies/fairscale.py index b2c630a4dbd44..7c39f94e66969 100644 --- a/src/lightning_lite/strategies/fairscale.py +++ b/src/lightning_lite/strategies/fairscale.py @@ -23,6 +23,7 @@ from lightning_lite.accelerators import Accelerator from lightning_lite.plugins import CheckpointIO, ClusterEnvironment, Precision +from lightning_lite.strategies import DDPSpawnStrategy from lightning_lite.strategies.ddp import DDPStrategy from lightning_lite.utilities.enums import PrecisionType from lightning_lite.utilities.imports import _IS_WINDOWS @@ -76,7 +77,7 @@ def setup_module_and_optimizers( The model wrapped into a :class:`~fairscale.nn.data_parallel.ShardedDataParallel` module and a list of optimizer wrapped in :class:~`fairscale.optim.OSS`. """ - optimizers = self._reinit_optimizers_with_oss(optimizers) + optimizers = _reinit_optimizers_with_oss(optimizers, self.precision_plugin, self.num_nodes) model = ShardedDataParallel(module, sharded_optimizer=optimizers, **self._ddp_kwargs) return model, optimizers @@ -107,16 +108,100 @@ def register_strategies(cls, strategy_registry: Dict) -> None: description=cls.__class__.__name__, ) + +class DDPSpawnShardedStrategy(DDPSpawnStrategy): + """Optimizer and gradient sharded training provided by FairScale with Spawn.""" + + _REDUCE_BUFFER_SIZE_DEFAULT: int = 2**23 # 8M + + def __init__( + self, + accelerator: Optional[Accelerator] = None, + parallel_devices: Optional[List[torch.device]] = None, + cluster_environment: Optional[ClusterEnvironment] = None, + checkpoint_io: Optional[CheckpointIO] = None, + precision_plugin: Optional[Precision] = None, + process_group_backend: Optional[str] = None, + timeout: Optional[timedelta] = default_pg_timeout, + **kwargs: Any, + ) -> None: + super().__init__( + accelerator=accelerator, + parallel_devices=parallel_devices, + cluster_environment=cluster_environment, + checkpoint_io=checkpoint_io, + precision_plugin=precision_plugin, + process_group_backen=process_group_backend, + timeout=timeout, + **kwargs, + ) + super().__init__() + if "reduce_buffer_size" not in self._ddp_kwargs: + # For multi-node training, enabling bucketing will improve performance. + self._ddp_kwargs["reduce_buffer_size"] = self._REDUCE_BUFFER_SIZE_DEFAULT if self.num_nodes > 1 else 0 + + def setup_module_and_optimizers( + self, module: Module, optimizers: List[Optimizer] + ) -> Tuple[Module, List[Optimizer]]: + """Wraps the model and optimizers with fairscale components. + + Return: + The model wrapped into a :class:`~fairscale.nn.data_parallel.ShardedDataParallel` module + and a list of optimizer wrapped in :class:~`fairscale.optim.OSS`. + """ + optimizers = _reinit_optimizers_with_oss(optimizers, self.precision_plugin, self.num_nodes) + model = ShardedDataParallel(module, sharded_optimizer=optimizers, **self._ddp_kwargs) + return model, optimizers + + @contextmanager + def block_backward_sync(self, module: Module) -> Generator: + """Blocks syncing gradients behaviour on backwards pass. 
+ + This is useful for skipping sync when accumulating gradients, reducing communication overhead + Returns: context manager with sync behaviour off + """ + if isinstance(module, ShardedDataParallel): + with module.no_sync(): + yield None + else: + yield None + + @classmethod + def register_strategies(cls, strategy_registry: Dict) -> None: + strategy_registry.register( + "ddp_sharded_spawn_find_unused_parameters_false", + cls, + description="DDP Spawn Sharded Strategy with `find_unused_parameters` as False", + find_unused_parameters=False, + ) + strategy_registry.register( + "ddp_sharded_spawn", + cls, + description=cls.__class__.__name__, + ) + def _reinit_optimizers_with_oss(self, optimizers: List[Optimizer]) -> List["OSS"]: for x, optimizer in enumerate(optimizers): if not isinstance(optimizer, OSS): optim_class = type(optimizer) zero_optimizer = OSS(params=optimizer.param_groups, optim=optim_class, **optimizer.defaults) - is_fp16 = self.precision_plugin.precision in (PrecisionType.MIXED, PrecisionType.HALF) - # For multi-node training, compressing the model shards in fp16 before broadcasting - # improves performance. When using PyTorch AMP, it will not degrade - # the model performance. - zero_optimizer.broadcast_fp16 = is_fp16 and self.num_nodes > 1 optimizers[x] = zero_optimizer del optimizer return optimizers + + +def _reinit_optimizers_with_oss( + optimizers: List[Optimizer], precision_plugin: Precision, num_nodes: int +) -> List["OSS"]: + for x, optimizer in enumerate(optimizers): + if not isinstance(optimizer, OSS): + optim_class = type(optimizer) + zero_optimizer = OSS(params=optimizer.param_groups, optim=optim_class, **optimizer.defaults) + is_fp16 = precision_plugin.precision in (PrecisionType.MIXED, PrecisionType.HALF) + # For multi-node training, compressing the model shards in fp16 before broadcasting + # improves performance. When using PyTorch AMP, it will not degrade + # the model performance. 
+ zero_optimizer.broadcast_fp16 = is_fp16 and num_nodes > 1 + optimizers[x] = zero_optimizer + del optimizer + return optimizers diff --git a/src/lightning_lite/strategies/launchers/multiprocessing.py b/src/lightning_lite/strategies/launchers/multiprocessing.py index ca47efe030302..d416efee56185 100644 --- a/src/lightning_lite/strategies/launchers/multiprocessing.py +++ b/src/lightning_lite/strategies/launchers/multiprocessing.py @@ -118,8 +118,7 @@ def _wrapping_function( ) -> None: if global_states: global_states.restore() - # TODO(lite): Update worker setup once DDPSpawn strategy is in Lite - self._strategy._worker_setup(process_idx) + self._strategy._local_rank = process_idx results = function(*args, **kwargs) if self._strategy.local_rank == 0: diff --git a/src/lightning_lite/strategies/launchers/xla.py b/src/lightning_lite/strategies/launchers/xla.py index 6580fd4a01d0e..60342b344097c 100644 --- a/src/lightning_lite/strategies/launchers/xla.py +++ b/src/lightning_lite/strategies/launchers/xla.py @@ -86,8 +86,7 @@ def _wrapping_function( return_queue: SimpleQueue, global_states: Optional[_GlobalStateSnapshot] = None, ) -> None: - # TODO(lite): Update worker setup once TPUSpawn strategy is in Lite - self._strategy._worker_setup(process_idx) + self._strategy._local_rank = process_idx results = function(*args, **kwargs) if self._strategy.local_rank == 0: diff --git a/src/lightning_lite/strategies/xla.py b/src/lightning_lite/strategies/xla.py new file mode 100644 index 0000000000000..d11e05099b850 --- /dev/null +++ b/src/lightning_lite/strategies/xla.py @@ -0,0 +1,204 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
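The spawn-based strategies above store the chosen ``start_method`` and forward any extra keyword arguments to ``DistributedDataParallel`` when the module is set up; a minimal construction sketch (no worker processes are launched until the strategy is actually used):

.. code-block:: python

    from lightning_lite.strategies import DDPSpawnStrategy

    strategy = DDPSpawnStrategy(start_method="fork", find_unused_parameters=False)
    assert strategy._start_method == "fork"
    assert strategy._ddp_kwargs == {"find_unused_parameters": False}
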
+import io +import os +from typing import Any, Dict, List, Mapping, Optional, Sequence, Union + +import torch +from torch import Tensor +from torch.nn import Module +from torch.utils.data import DataLoader + +from lightning_lite.accelerators import Accelerator +from lightning_lite.plugins.environments import XLAEnvironment +from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO +from lightning_lite.plugins.io.xla_plugin import XLACheckpointIO +from lightning_lite.plugins.precision import Precision +from lightning_lite.strategies.ddp_spawn import DDPSpawnStrategy +from lightning_lite.strategies.launchers.xla import _XLALauncher +from lightning_lite.strategies.strategy import TBroadcast +from lightning_lite.utilities import _TPU_AVAILABLE +from lightning_lite.utilities.apply_func import apply_to_collection +from lightning_lite.utilities.data import has_len +from lightning_lite.utilities.distributed import ReduceOp +from lightning_lite.utilities.rank_zero import rank_zero_only +from lightning_lite.utilities.types import _PATH + +if _TPU_AVAILABLE: + import torch_xla.core.xla_env_vars as xenv + import torch_xla.core.xla_model as xm + from torch_xla.core.xla_model import rendezvous + from torch_xla.distributed.parallel_loader import MpDeviceLoader +else: + xm, xmp, MpDeviceLoader, rendezvous = [None] * 4 + + +class XLAStrategy(DDPSpawnStrategy): + """Strategy for training multiple TPU devices using the :func:`torch_xla.distributed.xla_multiprocessing.spawn` + method.""" + + def __init__( + self, + accelerator: Optional[Accelerator] = None, + parallel_devices: Optional[List[torch.device]] = None, + checkpoint_io: Optional[CheckpointIO] = None, + precision_plugin: Optional[Precision] = None, + **_: Any, + ) -> None: + super().__init__( + accelerator=accelerator, + parallel_devices=parallel_devices, + cluster_environment=XLAEnvironment(), + checkpoint_io=checkpoint_io, + precision_plugin=precision_plugin, + start_method="fork", + ) + self._checkpoint_io: Optional[CheckpointIO] + self._launched = False + + @property + def root_device(self) -> torch.device: + if not self._launched: + raise RuntimeError("Accessing the XLA device before processes have spawned is not allowed.") + return xm.xla_device() + + @property + def checkpoint_io(self) -> CheckpointIO: + if self._checkpoint_io is None: + self._checkpoint_io = XLACheckpointIO() + return self._checkpoint_io + + @checkpoint_io.setter + def checkpoint_io(self, io: Optional[CheckpointIO]) -> None: + self._checkpoint_io = io + + @property + def distributed_sampler_kwargs(self) -> Dict[str, int]: + return dict(num_replicas=self.world_size, rank=self.global_rank) + + @property + def is_distributed(self) -> bool: + # HOST_WORLD_SIZE is not set outside the xmp.spawn process + return (xenv.HOST_WORLD_SIZE in os.environ) and self.world_size != 1 + + def _configure_launcher(self) -> None: + self._launcher = _XLALauncher(self) + + def setup_environment(self) -> None: + self._launched = True + self._set_world_ranks() + rank_zero_only.rank = self.global_rank + + def setup_module(self, module: Module) -> Module: + return module + + def module_to_device(self, module: Module) -> None: + module.to(self.root_device) + + def process_dataloader(self, dataloader: DataLoader) -> MpDeviceLoader: + XLAStrategy._validate_dataloader(dataloader) + dataloader = MpDeviceLoader(dataloader, self.root_device) + # Mimic interface to torch.utils.data.DataLoader + dataloader.dataset = dataloader._loader.dataset + return dataloader + + def reduce( + self, output: 
Union[Tensor, Any], group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None + ) -> Tensor: + if not isinstance(output, Tensor): + output = torch.tensor(output, device=self.root_device) + + invalid_reduce_op = isinstance(reduce_op, ReduceOp) and reduce_op != ReduceOp.SUM + invalid_reduce_op_str = isinstance(reduce_op, str) and reduce_op.lower() not in ("sum", "mean", "avg") + if invalid_reduce_op or invalid_reduce_op_str: + raise ValueError( + "Currently, the XLAStrategy only supports `sum`, `mean`, `avg` for the reduce operation, got:" + f" {reduce_op}" + ) + + output = xm.mesh_reduce("reduce", output, sum) + + if isinstance(reduce_op, str) and reduce_op.lower() in ("avg", "mean"): + output = output / self.world_size + + return output + + def barrier(self, name: Optional[str] = None, *args: Any, **kwargs: Any) -> None: + if self.is_distributed: + rendezvous(name) + + def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast: + if not self.is_distributed: + return obj + buffer = io.BytesIO() + torch.save(obj, buffer) + data = bytearray(buffer.getbuffer()) + data_tensor = torch.tensor(data, device=self.root_device, dtype=torch.float) + data = xm.all_gather(data_tensor) + buffer = io.BytesIO(data.cpu().byte().numpy()) + obj = torch.load(buffer) + return obj + + def all_gather(self, tensor: Tensor, group: Optional[Any] = None, sync_grads: bool = False) -> Tensor: + """ + Function to gather a tensor from several distributed processes + Args: + tensor: tensor of shape (batch, ...) + group: not available with TPUs + sync_grads: not available with TPUs + Return: + A tensor of shape (world_size, batch, ...) + """ + if isinstance(tensor, Tensor) and tensor.dim() == 0: + tensor = tensor.unsqueeze(0) + return xm.all_gather(tensor) + + def save_checkpoint( + self, checkpoint: Dict[str, Any], filepath: _PATH, storage_options: Optional[Any] = None + ) -> None: + """Save model/training states as a checkpoint file through state-dump and file-write. + + Args: + checkpoint: dict containing model and trainer state + filepath: write-target file's path + storage_options: parameter for how to save to storage, passed to ``CheckpointIO`` plugin + """ + # `xla_model.save` needs to be called on all ranks. It internally checks if the local rank is 0 + self.checkpoint_io.save_checkpoint(checkpoint, filepath, storage_options=storage_options) + + def remove_checkpoint(self, filepath: _PATH) -> None: + """Remove checkpoint filepath from the filesystem. + + Args: + filepath: Path to checkpoint + """ + if self.local_rank == 0: + self.checkpoint_io.remove_checkpoint(filepath) + + @classmethod + def register_strategies(cls, strategy_registry: Dict) -> None: + # TODO(lite): Deprecate the name "tpu_spawn" through the connector + strategy_registry.register("tpu_spawn", cls, description=cls.__class__.__name__) + strategy_registry.register("xla", cls, description=cls.__class__.__name__) + + @staticmethod + def _validate_dataloader(dataloaders: DataLoader) -> None: + def check_has_len(dataloader: DataLoader) -> None: + if not has_len(dataloader): + raise TypeError( + "TPUs do not currently support IterableDataset objects, the dataset must implement `__len__`." + " HINT: You can mock the length on your dataset to bypass this MisconfigurationException." 
+ ) + + apply_to_collection(dataloaders, dtype=object, wrong_dtype=(Sequence, Mapping), function=check_has_len) diff --git a/src/lightning_lite/utilities/distributed.py b/src/lightning_lite/utilities/distributed.py index 166b28a5c948f..26fa3e1e230d0 100644 --- a/src/lightning_lite/utilities/distributed.py +++ b/src/lightning_lite/utilities/distributed.py @@ -3,12 +3,12 @@ from typing import Any, List, Optional, Tuple, Union import torch +from lightning_utilities.core.rank_zero import rank_zero_deprecation from torch import Tensor from torch.nn import functional as F from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment from lightning_lite.utilities.imports import _HPU_AVAILABLE, _TPU_AVAILABLE -from lightning_lite.utilities.rank_zero import rank_zero_deprecation from lightning_lite.utilities.rank_zero import rank_zero_info as new_rank_zero_info if _TPU_AVAILABLE: diff --git a/tests/tests_lite/strategies/test_registry.py b/tests/tests_lite/strategies/test_registry.py index 627837b4524b7..93c0071d9cd47 100644 --- a/tests/tests_lite/strategies/test_registry.py +++ b/tests/tests_lite/strategies/test_registry.py @@ -53,6 +53,16 @@ def test_available_strategies_in_registry(): "deepspeed_stage_3", "deepspeed_stage_3_offload", "deepspeed_stage_3_offload_nvme", - "dp", + "ddp_sharded_spawn_find_unused_parameters_false", + "ddp_sharded_spawn", + "ddp_spawn", + "ddp_fork", + "ddp_notebook", + "ddp_spawn_find_unused_parameters_false", + "ddp_fork_find_unused_parameters_false", + "ddp_notebook_find_unused_parameters_false", "single_tpu", + "tpu_spawn", + "xla", + "dp", } From 75e6c9109c7fe81f8e2c4937cf2e0d848546fd48 Mon Sep 17 00:00:00 2001 From: Neven Miculinic Date: Thu, 15 Sep 2022 12:05:46 +0100 Subject: [PATCH 156/193] [CLI] Fix cluster logs with over 5000 entries (#14458) --- src/lightning_app/CHANGELOG.md | 2 + src/lightning_app/cli/lightning_cli.py | 38 ++- src/lightning_app/core/constants.py | 4 + src/lightning_app/utilities/cluster_logs.py | 131 ++++---- .../utilities/data_structures.py | 36 +++ src/lightning_app/utilities/exceptions.py | 4 + .../utilities/logs_socket_api.py | 30 +- .../utilities/test_logs_socket_api.py | 22 +- tests/tests_clusters/test_cluster_logs.py | 302 ++++++++++++++++++ 9 files changed, 482 insertions(+), 87 deletions(-) create mode 100644 src/lightning_app/utilities/data_structures.py diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 3106b7a7a2683..fb4609ba2c7b4 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -30,6 +30,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Unification of app template: moved `app.py` to root dir for `lightning init app ` template ([#13853](https://github.com/Lightning-AI/lightning/pull/13853)) +- Fixing 5000 log line limitation for Lightning AI BYOC cluster logs ([#14458](https://github.com/Lightning-AI/lightning/pull/14458)) + - Fixed a bug where the uploaded command file wasn't properly parsed ([#14532](https://github.com/Lightning-AI/lightning/pull/14532)) diff --git a/src/lightning_app/cli/lightning_cli.py b/src/lightning_app/cli/lightning_cli.py index faa0146cc6d8d..51fc08e5fd49a 100644 --- a/src/lightning_app/cli/lightning_cli.py +++ b/src/lightning_app/cli/lightning_cli.py @@ -23,7 +23,7 @@ from lightning_app.cli.lightning_cli_create import create from lightning_app.cli.lightning_cli_delete import delete from lightning_app.cli.lightning_cli_list import get_list -from lightning_app.core.constants import get_lightning_cloud_url +from lightning_app.core.constants import DEBUG, get_lightning_cloud_url from lightning_app.runners.runtime import dispatch from lightning_app.runners.runtime_type import RuntimeType from lightning_app.utilities.app_helpers import Logger @@ -31,8 +31,9 @@ from lightning_app.utilities.cli_helpers import _arrow_time_callback, _format_input_env_variables from lightning_app.utilities.cloud import _get_project from lightning_app.utilities.cluster_logs import _cluster_logs_reader +from lightning_app.utilities.exceptions import LogLinesLimitExceeded from lightning_app.utilities.login import Auth -from lightning_app.utilities.logs_socket_api import _LightningLogsSocketAPI +from lightning_app.utilities.logs_socket_api import _ClusterLogsSocketAPI, _LightningLogsSocketAPI from lightning_app.utilities.network import LightningClient logger = Logger(__name__) @@ -205,7 +206,7 @@ def cluster(): help="The end timestamp / relative time increment to query logs for. This is ignored when following logs (with " "-f/--follow). The same format as --from option has.", ) -@click.option("--limit", default=1000, help="The max number of log lines returned.") +@click.option("--limit", default=10000, help="The max number of log lines returned.") @click.option("-f", "--follow", required=False, is_flag=True, help="Wait for new logs, to exit use CTRL+C.") def cluster_logs(cluster_name: str, to_time: arrow.Arrow, from_time: arrow.Arrow, limit: int, follow: bool) -> None: """Show cluster logs. 
@@ -252,21 +253,26 @@ def cluster_logs(cluster_name: str, to_time: arrow.Arrow, from_time: arrow.Arrow f" Please select one of the following: [{', '.join(clusters.keys())}]" ) - log_reader = _cluster_logs_reader( - client=client, - cluster_id=clusters[cluster_name], - start=from_time.int_timestamp, - end=to_time.int_timestamp, - limit=limit, - follow=follow, - ) + try: + log_reader = _cluster_logs_reader( + logs_api_client=_ClusterLogsSocketAPI(client.api_client), + cluster_id=clusters[cluster_name], + start=from_time.int_timestamp, + end=to_time.int_timestamp if not follow else None, + limit=limit, + follow=follow, + ) - colors = {"error": "red", "warn": "yellow", "info": "green"} + colors = {"error": "red", "warn": "yellow", "info": "green"} - for log_event in log_reader: - date = log_event.timestamp.strftime("%m/%d/%Y %H:%M:%S") - color = colors.get(log_event.labels.level, "green") - rich.print(f"[{color}]{log_event.labels.level:5}[/{color}] {date} {log_event.message.rstrip()}") + for log_event in log_reader: + date = log_event.timestamp.strftime("%m/%d/%Y %H:%M:%S") + color = colors.get(log_event.labels.level, "green") + rich.print(f"[{color}]{log_event.labels.level:5}[/{color}] {date} {log_event.message.rstrip()}") + except LogLinesLimitExceeded: + raise click.ClickException(f"Read {limit} log lines, but there may be more. Use --limit param to read more") + except Exception as error: + logger.error(f"⚡ Error while reading logs ({type(error)}), {error}", exc_info=DEBUG) @_main.command() diff --git a/src/lightning_app/core/constants.py b/src/lightning_app/core/constants.py index 85b15f4daa5f6..4713a955313c9 100644 --- a/src/lightning_app/core/constants.py +++ b/src/lightning_app/core/constants.py @@ -1,6 +1,8 @@ import os from pathlib import Path +import lightning_cloud.env + import lightning_app SUPPORTED_PRIMITIVE_TYPES = (type(None), str, int, float, bool) @@ -35,6 +37,8 @@ LIGHTNING_APPS_PUBLIC_REGISTRY = "https://lightning.ai/v1/apps" ENABLE_STATE_WEBSOCKET = bool(int(os.getenv("ENABLE_STATE_WEBSOCKET", "0"))) +DEBUG: bool = lightning_cloud.env.DEBUG + def get_lightning_cloud_url() -> str: # DO NOT CHANGE! 
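
The cluster_logs.py rewrite below is the heart of this fix: each websocket connection to the logs endpoint returns at most a fixed batch of entries, so the reader now pages through batches until the user-requested limit is reached. A condensed sketch of that pagination pattern follows, with a hypothetical `fetch_page` callable standing in for the real websocket client and log-event timestamps assumed to be `datetime` objects; it is an illustration of the approach, not the patch's actual implementation:

    from datetime import timedelta
    from typing import Any, Callable, Iterator, List


    class LogLinesLimitExceeded(Exception):
        """Raised when the requested limit was reached but more log lines may exist."""


    def read_logs(
        fetch_page: Callable[..., List[Any]], start: float, end: float, limit: int, batch_size: int = 5000
    ) -> Iterator[Any]:
        items_read = 0
        while True:
            # Each "connection" yields at most `batch_size` entries (the server-side cap).
            page = fetch_page(start=start, end=end, limit=min(limit - items_read, batch_size))
            for event in page:
                yield event
                items_read += 1
                if items_read == limit:
                    # We read exactly `limit` lines; the server may hold more.
                    raise LogLinesLimitExceeded()
            if len(page) < batch_size:
                return  # a short page means the requested range is exhausted
            # Re-open the stream just past the last timestamp seen to avoid duplicates.
            start = (page[-1].timestamp + timedelta(microseconds=1)).timestamp()

In the actual patch, `_cluster_logs_reader` applies the same idea over a background websocket thread: after every `batch_size` entries it closes the socket, reopens it one microsecond past the last received timestamp, and raises `LogLinesLimitExceeded` once `limit` entries have been read, which the CLI converts into a `ClickException` suggesting a larger `--limit`.
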
diff --git a/src/lightning_app/utilities/cluster_logs.py b/src/lightning_app/utilities/cluster_logs.py index a2241f9f63a25..977bb86423525 100644 --- a/src/lightning_app/utilities/cluster_logs.py +++ b/src/lightning_app/utilities/cluster_logs.py @@ -1,34 +1,25 @@ import json import queue from dataclasses import dataclass +from datetime import timedelta from threading import Thread -from typing import Callable, Iterator, Optional +from typing import Callable, Iterator, List, Optional import dateutil.parser from websocket import WebSocketApp -from lightning_app.utilities.log_helpers import _error_callback, _OrderedLogEntry +from lightning_app.utilities.data_structures import AttributeDict +from lightning_app.utilities.exceptions import LogLinesLimitExceeded +from lightning_app.utilities.log_helpers import _OrderedLogEntry from lightning_app.utilities.logs_socket_api import _ClusterLogsSocketAPI -from lightning_app.utilities.network import LightningClient -@dataclass -class _ClusterLogEventLabels: +class _ClusterLogEventLabels(AttributeDict): cluster_id: str grid_url: str hostname: str level: str logger: str - path: Optional[str] = None - workspace: Optional[str] = None - identifier: Optional[str] = None - issuer: Optional[str] = None - error: Optional[str] = None - errorVerbose: Optional[str] = None - dir: Optional[str] = None - bucket: Optional[str] = None - prefix: Optional[str] = None - loki_s3: Optional[str] = None @dataclass @@ -36,65 +27,101 @@ class _ClusterLogEvent(_OrderedLogEntry): labels: _ClusterLogEventLabels -def _push_log_events_to_read_queue_callback(read_queue: queue.PriorityQueue): +def _push_log_events_to_read_queue_callback(read_queue: queue.Queue): """Pushes _LogEvents from websocket to read_queue. Returns callback function used with `on_message_callback` of websocket.WebSocketApp. 
""" - def callback(ws_app: WebSocketApp, msg: str): - # We strongly trust that the contract on API will hold atm :D - event_dict = json.loads(msg) - labels = _ClusterLogEventLabels(**event_dict["labels"]) - - if "message" in event_dict: - message = event_dict["message"] - timestamp = dateutil.parser.isoparse(event_dict["timestamp"]) - event = _ClusterLogEvent( - message=message, - timestamp=timestamp, - labels=labels, - ) - read_queue.put(event) + def callback(_: WebSocketApp, msg: str): + for ev in _parse_log_event(msg): + read_queue.put(ev) return callback +def _parse_log_event(msg: str) -> List[_ClusterLogEvent]: + # We strongly trust that the contract on API will hold atm :D + event_dict = json.loads(msg) + labels = _ClusterLogEventLabels(**event_dict["labels"]) + log_events = [] + + if "message" in event_dict: + message = event_dict["message"] + timestamp = dateutil.parser.isoparse(event_dict["timestamp"]) + event = _ClusterLogEvent( + message=message, + timestamp=timestamp, + labels=labels, + ) + log_events.append(event) + return log_events + + def _cluster_logs_reader( - client: LightningClient, + logs_api_client: _ClusterLogsSocketAPI, cluster_id: str, - start: int, - end: int, + start: float, + end: Optional[float], limit: int, follow: bool, - on_error_callback: Optional[Callable] = None, + batch_size: int = 5000, ) -> Iterator[_ClusterLogEvent]: - - logs_api_client = _ClusterLogsSocketAPI(client.api_client) - read_queue = queue.PriorityQueue() + read_queue = queue.Queue(maxsize=2 * batch_size) + items_read = 0 # We will use a socket inside a thread to read logs, # to follow our typical reading pattern - log_socket = logs_api_client.create_cluster_logs_socket( - cluster_id=cluster_id, - start=start, - end=end, - limit=limit, - on_message_callback=_push_log_events_to_read_queue_callback(read_queue), - on_error_callback=on_error_callback or _error_callback, - ) - log_thread = Thread(target=log_socket.run_forever) + # helper function which will start logs streams to the read_queue from the start onwards, till the end + def start_logs(start: float) -> Callable: + log_socket = logs_api_client.create_cluster_logs_socket( + cluster_id=cluster_id, + start=start, + end=end, + limit=min(limit - items_read, batch_size), + on_message_callback=_push_log_events_to_read_queue_callback(read_queue), + on_error_callback=lambda _, ex: read_queue.put(ex), + ) + + log_thread = Thread(target=log_socket.run_forever, daemon=True) + + # Establish connection and begin pushing logs to the queue + log_thread.start() - # Establish connection and begin pushing logs to the print queue - log_thread.start() + def stop(): + # Close connection - it will cause run_forever() to finish -> thread as finishes as well + log_socket.close() + # The socket was closed, we can just wait for thread to finish. 
+ log_thread.join() + + return stop + + stop_fn = start_logs(start) # Print logs from queue when log event is available try: + items_remaining_in_batch = batch_size while True: - log_event = read_queue.get(timeout=None if follow else 1.0) + log_event: _ClusterLogEvent = read_queue.get(timeout=None if follow else 1.0) + + # Exception happened during queue processing + if isinstance(log_event, Exception): + raise log_event + yield log_event + items_read += 1 + if items_read == limit: + # We've read enough entries, just terminate and close the connection + raise LogLinesLimitExceeded() + + items_remaining_in_batch -= 1 + if items_remaining_in_batch == 0: + stop_fn() + start_logs((log_event.timestamp + timedelta(microseconds=1)).timestamp()) + items_remaining_in_batch = batch_size + except queue.Empty: # Empty is raised by queue.get if timeout is reached. Follow = False case. pass @@ -104,8 +131,4 @@ def _cluster_logs_reader( pass finally: - # Close connection - it will cause run_forever() to finish -> thread as finishes as well - log_socket.close() - - # The socket was closed, we can just wait for thread to finish. - log_thread.join() + stop_fn() diff --git a/src/lightning_app/utilities/data_structures.py b/src/lightning_app/utilities/data_structures.py new file mode 100644 index 0000000000000..d218b875ac925 --- /dev/null +++ b/src/lightning_app/utilities/data_structures.py @@ -0,0 +1,36 @@ +from typing import Any, Dict, Optional + + +class AttributeDict(Dict): + """Extended dictionary accessible with dot notation. + + >>> ad = AttributeDict({'key1': 1, 'key2': 'abc'}) + >>> ad.key1 + 1 + >>> ad.update({'my-key': 3.14}) + >>> ad.update(new_key=42) + >>> ad.key1 = 2 + >>> ad + "key1": 2 + "key2": abc + "my-key": 3.14 + "new_key": 42 + """ + + def __getattr__(self, key: str) -> Optional[Any]: + try: + return self[key] + except KeyError as exp: + raise AttributeError(f'Missing attribute "{key}"') from exp + + def __setattr__(self, key: str, val: Any) -> None: + self[key] = val + + def __repr__(self) -> str: + if not len(self): + return "" + max_key_length = max(len(str(k)) for k in self) + tmp_name = "{:" + str(max_key_length + 3) + "s} {}" + rows = [tmp_name.format(f'"{n}":', self[n]) for n in sorted(self.keys())] + out = "\n".join(rows) + return out diff --git a/src/lightning_app/utilities/exceptions.py b/src/lightning_app/utilities/exceptions.py index 93bf5b7f319e8..dee677d66248d 100644 --- a/src/lightning_app/utilities/exceptions.py +++ b/src/lightning_app/utilities/exceptions.py @@ -58,3 +58,7 @@ class LightningSigtermStateException(Exception): def __init__(self, exit_code): self.exit_code = exit_code + + +class LogLinesLimitExceeded(Exception): + """Exception to inform the user that we've reached the maximum number of log lines.""" diff --git a/src/lightning_app/utilities/logs_socket_api.py b/src/lightning_app/utilities/logs_socket_api.py index 28569a4879134..479f142cd5e12 100644 --- a/src/lightning_app/utilities/logs_socket_api.py +++ b/src/lightning_app/utilities/logs_socket_api.py @@ -1,9 +1,11 @@ +import urllib from typing import Callable, Optional from urllib.parse import urlparse from lightning_cloud.openapi import ApiClient, AuthServiceApi, V1LoginRequest from websocket import WebSocketApp +from lightning.app.core import constants from lightning_app.utilities.login import Auth @@ -101,18 +103,27 @@ def print_log_msg(ws_app, msg): class _ClusterLogsSocketAPI(_LogsSocketAPI): @staticmethod - def _cluster_logs_socket_url(host: str, cluster_id: str, start: int, end: int, limit: int, 
token: str) -> str: - return ( - f"wss://{host}/v1/core/clusters/{cluster_id}/logs?" - f"start={start}&end={end}&token={token}&limit={limit}" - f"&follow=true" - ) + def _cluster_logs_socket_url( + host: str, cluster_id: str, start: float, end: Optional[float], limit: int, token: str + ) -> str: + params = { + "start": start, + "limit": limit, + "token": token, + } + + if end: + params["end"] = end + else: + params["follow"] = "true" + + return f"wss://{host}/v1/core/clusters/{cluster_id}/logs?{urllib.parse.urlencode(params)}" def create_cluster_logs_socket( self, cluster_id: str, - start: int, # unix timestamp - end: int, # unix timestamp + start: float, # unix timestamp + end: Optional[float], # unix timestamp limit: int, on_message_callback: Callable[[WebSocketApp, str], None], on_error_callback: Optional[Callable[[Exception, str], None]] = None, @@ -170,4 +181,7 @@ def print_log_msg(ws_app, msg): end=end, ) + if constants.DEBUG: + print(f"Connecting to websocket {socket_url}") + return WebSocketApp(socket_url, on_message=on_message_callback, on_error=on_error_callback) diff --git a/tests/tests_app/utilities/test_logs_socket_api.py b/tests/tests_app/utilities/test_logs_socket_api.py index 9fc6e2ee4086b..7116cd9e45277 100644 --- a/tests/tests_app/utilities/test_logs_socket_api.py +++ b/tests/tests_app/utilities/test_logs_socket_api.py @@ -3,16 +3,24 @@ from lightning_app.utilities.logs_socket_api import _ClusterLogsSocketAPI -def test_cluster_logs_socket_api(): - websocket_url = _ClusterLogsSocketAPI._cluster_logs_socket_url( - "example.org", "my-cluster", 1661100000, 1661101000, 10, "TOKEN" +def test_cluster_logs_socket_url1(): + assert ( + _ClusterLogsSocketAPI._cluster_logs_socket_url( + "example.org", "my-cluster", 1661100000.123, 1661101000.321, 10, "TOKEN" + ) + == "wss://example.org/v1/core/clusters/my-cluster/logs?start=1661100000.123&" + "limit=10&token=TOKEN&end=1661101000.321" ) + +def test_cluster_logs_socket_url2(): assert ( - websocket_url == "wss://example.org/v1/core/clusters/my-cluster/logs?start=1661100000&end=1661101000" - "&token=TOKEN&limit=10&follow=true" + _ClusterLogsSocketAPI._cluster_logs_socket_url("example.org", "my-cluster", 1661100000, None, 10, "TOKEN") + == "wss://example.org/v1/core/clusters/my-cluster/logs?start=1661100000&limit=10&token=TOKEN&follow=true" ) + +def test_cluster_logs_socket_api(): api_client = mock.Mock() api_client.configuration.host = "https://example.com" api_client.call_api.return_value.token = "TOKEN" @@ -25,8 +33,4 @@ def on_message_func(): "my-cluster", 1661100000, 1661101000, 10, on_message_func ) - assert ( - web_socket_app.url == "wss://example.com/v1/core/clusters/my-cluster/logs?start=1661100000&end=1661101000" - "&token=TOKEN&limit=10&follow=true" - ) assert web_socket_app.on_message == on_message_func diff --git a/tests/tests_clusters/test_cluster_logs.py b/tests/tests_clusters/test_cluster_logs.py index 4855288dcf1ea..497cb0ada4fc0 100644 --- a/tests/tests_clusters/test_cluster_logs.py +++ b/tests/tests_clusters/test_cluster_logs.py @@ -1,9 +1,20 @@ import os import random import string +from datetime import datetime +from itertools import repeat +from unittest import mock import pytest +from dateutil.tz import tzutc +from lightning_app.utilities.cluster_logs import ( + _cluster_logs_reader, + _ClusterLogEvent, + _ClusterLogEventLabels, + _parse_log_event, +) +from lightning_app.utilities.exceptions import LogLinesLimitExceeded from src.lightning_app.testing.testing import run_cli @@ -101,3 +112,294 @@ def 
test_lighting_cloud_logs() -> None: ] ) as (stdout, stderr): assert "Error while reading logs" in stdout, f"stdout: {stdout}\nstderr: {stderr}" + + +def test_cluster_logs_reader(): + logs_api_client = mock.Mock() + log_socket = mock.Mock() + + def create_cluster_logs_socket( + cluster_id: str, + start: float, # unix timestamp + end: float, # unix timestamp + limit: int, + on_message_callback, + on_error_callback, + ): + assert start == 0 + assert end == 10 + assert limit == 10 + + def run_forever(): + on_message_callback( + None, + r""" + { + "message": "getting file lock", + "timestamp": "2022-08-30T00:57:59.370356800Z", + "labels": { + "cluster_id": "cluster_id", + "grid_url": "https://lightning.ai", + "hostname": "ec2-001", + "level": "info", + "logger": "test.logger", + "path": "/tmp/grid.terraform" + } + } + """, + ) + + log_socket.run_forever = run_forever + return log_socket + + logs_api_client.create_cluster_logs_socket = mock.MagicMock( + side_effect=create_cluster_logs_socket, + ) + + logs = list( + _cluster_logs_reader( + logs_api_client=logs_api_client, + cluster_id="cluster_id", + start=0, + end=10, + limit=10, + follow=False, + ) + ) + logs_api_client.create_cluster_logs_socket.assert_called_once() + + assert logs == [ + _ClusterLogEvent( + message="getting file lock", + timestamp=datetime(2022, 8, 30, 0, 57, 59, 370356, tzinfo=tzutc()), + labels=_ClusterLogEventLabels( + cluster_id="cluster_id", + grid_url="https://lightning.ai", + hostname="ec2-001", + level="info", + logger="test.logger", + path="/tmp/grid.terraform", + ), + ), + ] + + +def test_cluster_logs_reader_pagination(): + logs_api_client = mock.Mock() + log_socket = mock.Mock() + + messages = iter( + [ + r""" + { + "message": "v2", + "timestamp": "2022-08-30T00:57:59.370356800Z", + "labels": { + "cluster_id": "cluster_id", + "grid_url": "https://lightning.ai", + "hostname": "ec2-001", + "level": "info", + "logger": "test.logger", + "path": "/tmp/grid.terraform" + } + } + """, + r""" + { + "message": "v3", + "timestamp": "2022-08-30T00:58:59.370356800Z", + "labels": { + "cluster_id": "cluster_id", + "grid_url": "https://lightning.ai", + "hostname": "ec2-001", + "level": "info", + "logger": "test.logger", + "path": "/tmp/grid.terraform" + } + } + """, + ] + ) + + def create_cluster_logs_socket( + cluster_id: str, + start: int, # unix timestamp + end: int, # unix timestamp + limit: int, + on_message_callback, + on_error_callback, + ): + def run_forever(): + on_message_callback(None, next(messages)) + + log_socket.run_forever = run_forever + return log_socket + + logs_api_client.create_cluster_logs_socket = mock.Mock( + side_effect=create_cluster_logs_socket, + ) + + logs = list( + _cluster_logs_reader( + logs_api_client=logs_api_client, + cluster_id="cluster_id", + start=0, + end=10, + limit=5, + follow=False, + batch_size=1, + ) + ) + + assert logs_api_client.create_cluster_logs_socket.call_args_list[0].kwargs["start"] == 0 + assert logs_api_client.create_cluster_logs_socket.call_args_list[0].kwargs["limit"] == 1 + assert logs_api_client.create_cluster_logs_socket.call_args_list[0].kwargs["end"] == 10 + + assert logs_api_client.create_cluster_logs_socket.call_args_list[1].kwargs["start"] == 1661821079.370357 + assert logs_api_client.create_cluster_logs_socket.call_args_list[1].kwargs["limit"] == 1 + assert logs_api_client.create_cluster_logs_socket.call_args_list[1].kwargs["end"] == 10 + + assert len(logs_api_client.create_cluster_logs_socket.call_args_list) == 3 + + assert logs == [ + _ClusterLogEvent( + 
message="v2", + timestamp=datetime(2022, 8, 30, 0, 57, 59, 370356, tzinfo=tzutc()), + labels=_ClusterLogEventLabels( + cluster_id="cluster_id", + grid_url="https://lightning.ai", + hostname="ec2-001", + level="info", + logger="test.logger", + path="/tmp/grid.terraform", + ), + ), + _ClusterLogEvent( + message="v3", + timestamp=datetime(2022, 8, 30, 0, 58, 59, 370356, tzinfo=tzutc()), + labels=_ClusterLogEventLabels( + cluster_id="cluster_id", + grid_url="https://lightning.ai", + hostname="ec2-001", + level="info", + logger="test.logger", + path="/tmp/grid.terraform", + ), + ), + ] + + +def test_cluster_logs_limit_exceeded(): + logs_api_client = mock.Mock() + log_socket = mock.Mock() + + log_message = r""" + { + "message": "v2", + "timestamp": "2022-08-30T00:57:59.370356800Z", + "labels": { + "cluster_id": "cluster_id", + "grid_url": "https://lightning.ai", + "hostname": "ec2-001", + "level": "info", + "logger": "test.logger", + "path": "/tmp/grid.terraform" + } + } + """ + + messages = None + + def create_cluster_logs_socket( + cluster_id: str, + start: int, # unix timestamp + end: int, # unix timestamp + limit: int, + on_message_callback, + on_error_callback, + ): + def run_forever(): + on_message_callback(None, next(messages)) + + log_socket.run_forever = run_forever + return log_socket + + logs_api_client.create_cluster_logs_socket = mock.Mock( + side_effect=create_cluster_logs_socket, + ) + + messages = repeat(log_message, 2) + _ = list( + _cluster_logs_reader( + logs_api_client=logs_api_client, + cluster_id="cluster_id", + start=0, + end=10, + limit=3, + follow=False, + batch_size=1, + ) + ) + + messages = repeat(log_message, 2) + with pytest.raises(LogLinesLimitExceeded): + _ = list( + _cluster_logs_reader( + logs_api_client=logs_api_client, + cluster_id="cluster_id", + start=0, + end=10, + limit=2, + follow=False, + batch_size=1, + ) + ) + + messages = repeat(log_message, 2) + with pytest.raises(LogLinesLimitExceeded): + _ = list( + _cluster_logs_reader( + logs_api_client=logs_api_client, + cluster_id="cluster_id", + start=0, + end=10, + limit=1, + follow=False, + batch_size=1, + ) + ) + + +def test_parse_log_event(): + assert ( + _parse_log_event( + r""" + { + "message": "getting file lock", + "timestamp": "2022-08-30T00:57:59.370356800Z", + "labels": { + "cluster_id": "cluster_id", + "grid_url": "https://lightning.ai", + "hostname": "ec2-001", + "level": "info", + "logger": "test.logger", + "path": "/tmp/grid.terraform" + } + } + """ + ) + == [ + _ClusterLogEvent( + message="getting file lock", + timestamp=datetime(2022, 8, 30, 0, 57, 59, 370356, tzinfo=tzutc()), + labels=_ClusterLogEventLabels( + cluster_id="cluster_id", + grid_url="https://lightning.ai", + hostname="ec2-001", + level="info", + logger="test.logger", + path="/tmp/grid.terraform", + ), + ), + ] + ) From df640f7d870e3eb28320514fcaba2f656b572383 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 15 Sep 2022 13:29:47 +0200 Subject: [PATCH 157/193] Drop gatekeeper CI checks (#14717) --- .github/checkgroup.yml | 1 - .github/gatekeeper-config_app.yml | 12 --------- .github/gatekeeper-config_pytorch.yml | 15 ----------- .github/workflows/README.md | 2 +- .github/workflows/ci-pr-gatekeeper.yml | 35 -------------------------- 5 files changed, 1 insertion(+), 64 deletions(-) delete mode 100644 .github/gatekeeper-config_app.yml delete mode 100644 .github/gatekeeper-config_pytorch.yml delete mode 100644 .github/workflows/ci-pr-gatekeeper.yml diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml 
index 6acef1517738c..11e3a427bd612 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -47,7 +47,6 @@ subprojects: - "make-doctest (pytorch)" - "make-html (pytorch)" - "mypy" - - "PR Gatekeeper (pytorch)" - "pytorch-lightning (GPUs)" - "pytorch-lightning (HPUs)" - "pytorch-lightning (IPUs)" diff --git a/.github/gatekeeper-config_app.yml b/.github/gatekeeper-config_app.yml deleted file mode 100644 index 9e51c23458fc0..0000000000000 --- a/.github/gatekeeper-config_app.yml +++ /dev/null @@ -1,12 +0,0 @@ -approvals: - groups: - - name: 'Lightning Apps' - minimum: 1 - from: - - alecmerdler - - awaelchli - - hhsecond - - lantiga - - manskx - - nohalon - - tchaton diff --git a/.github/gatekeeper-config_pytorch.yml b/.github/gatekeeper-config_pytorch.yml deleted file mode 100644 index 75201f796c9ac..0000000000000 --- a/.github/gatekeeper-config_pytorch.yml +++ /dev/null @@ -1,15 +0,0 @@ -approvals: - groups: - - name: 'PyTorch Lightning' - minimum: 1 - from: - - awaelchli - - Borda - - carmocca - - ethanwharris - - kaushikb11 - - krshrimali - - otaj - - rohitgr7 - - tchaton - - williamFalcon diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 4ed903c0f3a93..2067c244f1684 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -59,5 +59,5 @@ | .github/mergify.yml | Label PRs as conflicts or ready, and request reviews if needed. | | .github/stale.yml | Close inactive issues/PRs sometimes after adding the "won't fix" label to them. | | .github/workflows/probot-auto-cc.yml, .github/lightning-probot.yml | Notify maintainers of interest depending on labels added to an issue We utilize lightning-probot forked from PyTorch’s probot. | +| .github/workflows/probot-check-group.yml, .github/checkgroup.yml | Checks whether the relevant jobs were successfully run based on the changed files in the PR | | .pre-commit-config.yaml | pre-commit.ci runs a set of linters and formatters, such as black, flake8 and isort. When formatting is applied, the bot pushes a commit with its change. This configuration is also used for running pre-commit locally. | -| .github/workflows/ci-pr-gatekeeper.yml | Prevent PRs from merging into master without any Grid.ai employees’ approval. | diff --git a/.github/workflows/ci-pr-gatekeeper.yml b/.github/workflows/ci-pr-gatekeeper.yml deleted file mode 100644 index f13aa98f87dca..0000000000000 --- a/.github/workflows/ci-pr-gatekeeper.yml +++ /dev/null @@ -1,35 +0,0 @@ -name: 'PR Gatekeeper' - -on: - pull_request_review: - types: [submitted] - -# todo: simplify after adding https://github.com/octodemo/pr-gatekeeper/issues/293 - -jobs: - gatekeeper: - name: PR Gatekeeper - runs-on: ubuntu-20.04 - strategy: - fail-fast: false - matrix: - pkg: ["app", "pytorch"] - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: "2" # To retrieve the preceding commit. 
- - name: Get changed files using defaults - id: changed-files - uses: tj-actions/changed-files@v29.0.4 - - name: Determine changes - id: touched - run: | - patterns = ('docs/source-${{ matrix.pkg }}', 'src/lightning_${{ matrix.pkg }}', 'tests/tests_${{ matrix.pkg }}') - changed = any(p in "${{steps.changed-files.outputs.all_changed_and_modified_files}}" for p in patterns) - print(f'::set-output name=files::{int(changed)}') - shell: python - - uses: octodemo/pr-gatekeeper@main - if: steps.touched.outputs.files == 1 - with: - token: ${{ secrets.GITHUB_TOKEN }} - config-file: './.github/gatekeeper-config_${{ matrix.pkg }}.yml' From 63014a80cb009dbd0727a1a72d21ce27b34fd19d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 15 Sep 2022 13:41:25 +0200 Subject: [PATCH 158/193] Tune the checkgroup config (#14712) * Tune the checkgroup config * Lite does not support HPU and IPU atm * Skip HPU as the server is down --- .azure/hpu-tests.yml | 1 + .github/checkgroup.yml | 26 +++++++++++++++++++--- .github/workflows/ci-pytorch-test-slow.yml | 5 ++++- .github/workflows/probot-check-group.yml | 2 ++ 4 files changed, 30 insertions(+), 4 deletions(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index c445cc92dff46..fbe30f7539632 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -16,6 +16,7 @@ trigger: - "requirements/pytorch/**" - "src/pytorch_lightning/**" - "tests/tests_pytorch/**" + - "setup.cfg" # includes pytest config pr: branches: diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 11e3a427bd612..099b05f3a2c24 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -31,6 +31,7 @@ subprojects: - "setup.cfg" # includes pytest config - ".github/workflows/docs-*.yml" checks: + # Note: updates here should be applied to the lightning_lite group - "pl-conda (3.8, 1.10)" - "pl-conda (3.8, 1.9)" - "pl-conda (3.9, 1.11)" @@ -48,12 +49,14 @@ subprojects: - "make-html (pytorch)" - "mypy" - "pytorch-lightning (GPUs)" - - "pytorch-lightning (HPUs)" + # TODO: the HPU server is down + #- "pytorch-lightning (HPUs)" - "pytorch-lightning (IPUs)" - "pl-slow (macOS-11, 3.7, 1.11)" - "pl-slow (ubuntu-20.04, 3.7, 1.11)" - "pl-slow (windows-2022, 3.7, 1.11)" - - "test-on-tpus" + # TODO: since this job cannot run on forks, it cannot be required or it will block all PL PRs from forks + #- "test-on-tpus" - id: "pytorch_lightning: Conda" paths: @@ -162,8 +165,25 @@ subprojects: - "lite-cpu (windows-2022, 3.10, latest, stable)" - "lite-cpu (windows-2022, 3.7, latest, stable)" - "lite-cpu (windows-2022, 3.7, oldest, stable)" + - "lightning-lite (GPUs)" - "mypy" - # TODO: lite should also require (some?) pl checks. 
this also requires that the path filters are modified + # Lite also requires PL checks as it depends on Lite + - "pl-cpu (macOS-11, 3.10, latest, stable)" + - "pl-cpu (macOS-11, 3.7, latest, stable)" + - "pl-cpu (macOS-11, 3.7, oldest, stable)" + - "pl-cpu (ubuntu-20.04, 3.10, latest, stable)" + - "pl-cpu (ubuntu-20.04, 3.7, latest, stable)" + - "pl-cpu (ubuntu-20.04, 3.7, oldest, stable)" + - "pl-cpu (windows-2022, 3.10, latest, stable)" + - "pl-cpu (windows-2022, 3.7, latest, stable)" + - "pl-cpu (windows-2022, 3.7, oldest, stable)" + - "make-doctest (pytorch)" + - "make-html (pytorch)" + - "pytorch-lightning (GPUs)" + - "pl-slow (macOS-11, 3.7, 1.11)" + - "pl-slow (ubuntu-20.04, 3.7, 1.11)" + - "pl-slow (windows-2022, 3.7, 1.11)" + #- "test-on-tpus" - id: "lightning_lite: Azure GPU" paths: diff --git a/.github/workflows/ci-pytorch-test-slow.yml b/.github/workflows/ci-pytorch-test-slow.yml index c1b2ab2292009..8c65f50fcb30f 100644 --- a/.github/workflows/ci-pytorch-test-slow.yml +++ b/.github/workflows/ci-pytorch-test-slow.yml @@ -8,11 +8,14 @@ on: branches: [master, "release/*"] types: [opened, reopened, ready_for_review, synchronize] # add `ready_for_review` since draft is skipped paths: + - ".github/workflows/ci-pytorch-test-slow.yml" - "requirements/pytorch/**" - "src/pytorch_lightning/**" - "tests/tests_pytorch/**" - "setup.cfg" # includes pytest config - - ".github/workflows/ci-pytorch-test-slow.yml" + - "requirements/lite/**" + - "src/lightning_lite/**" + - "tests/tests_lite/**" concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} diff --git a/.github/workflows/probot-check-group.yml b/.github/workflows/probot-check-group.yml index 2e69ff59b0779..28d4b3994db02 100644 --- a/.github/workflows/probot-check-group.yml +++ b/.github/workflows/probot-check-group.yml @@ -2,6 +2,7 @@ name: Probot on: pull_request: + types: [opened, reopened, ready_for_review, synchronize] # add `ready_for_review` since draft is skipped concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} @@ -10,6 +11,7 @@ concurrency: jobs: required-jobs: runs-on: ubuntu-latest + if: github.event.pull_request.draft == false # if this timeout triggers, then the job needs to be manually restarted through the GitHub interface timeout-minutes: 60 steps: From 7dbd038a9455f3c67da3fe19ef579058feef51b2 Mon Sep 17 00:00:00 2001 From: Sherin Thomas Date: Thu, 15 Sep 2022 17:59:12 +0530 Subject: [PATCH 159/193] Bump lightning cloud for memory leak fix (#14697) Bump lightning cloud for memory leak fix (#14697) --- requirements/app/base.txt | 2 +- src/lightning_app/CHANGELOG.md | 2 ++ src/lightning_app/cli/cmd_clusters.py | 4 +++- src/lightning_app/utilities/packaging/app_config.py | 3 ++- tests/tests_app/cli/test_cmd_clusters.py | 5 ++++- 5 files changed, 12 insertions(+), 4 deletions(-) diff --git a/requirements/app/base.txt b/requirements/app/base.txt index d19c4ca207117..3e5ec44be652d 100644 --- a/requirements/app/base.txt +++ b/requirements/app/base.txt @@ -1,4 +1,4 @@ -lightning-cloud==0.5.3 +lightning-cloud==0.5.6 packaging deepdiff>=5.7.0, <=5.8.1 starsessions>=1.2.1, <2.0 # strict diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index fb4609ba2c7b4..ae80e00e7c98a 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -27,6 +27,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Resolved a bug where the wrong client was passed to collect cloud logs ([#14684](https://github.com/Lightning-AI/lightning/pull/14684)) +- Resolved the memory leak issue with Lightning Cloud package and bumped the requirements to use the latest version ([#14697](https://github.com/Lightning-AI/lightning/pull/14697) + - Unification of app template: moved `app.py` to root dir for `lightning init app ` template ([#13853](https://github.com/Lightning-AI/lightning/pull/13853)) diff --git a/src/lightning_app/cli/cmd_clusters.py b/src/lightning_app/cli/cmd_clusters.py index a41d44ceba383..5c00acffb8051 100644 --- a/src/lightning_app/cli/cmd_clusters.py +++ b/src/lightning_app/cli/cmd_clusters.py @@ -5,15 +5,17 @@ import click from lightning_cloud.openapi import ( + Externalv1Cluster, V1AWSClusterDriverSpec, V1ClusterDriver, V1ClusterPerformanceProfile, V1ClusterSpec, + V1ClusterState, + V1ClusterType, V1CreateClusterRequest, V1InstanceSpec, V1KubernetesClusterDriver, ) -from lightning_cloud.openapi.models import Externalv1Cluster, V1ClusterState, V1ClusterType from rich.console import Console from rich.table import Table from rich.text import Text diff --git a/src/lightning_app/utilities/packaging/app_config.py b/src/lightning_app/utilities/packaging/app_config.py index 894a874daea15..9c4939432f14f 100644 --- a/src/lightning_app/utilities/packaging/app_config.py +++ b/src/lightning_app/utilities/packaging/app_config.py @@ -3,7 +3,8 @@ from typing import Optional, Union import yaml -from lightning_cloud.utils.name_generator import get_unique_name + +from lightning_app.utilities.name_generator import get_unique_name _APP_CONFIG_FILENAME = ".lightning" diff --git a/tests/tests_app/cli/test_cmd_clusters.py b/tests/tests_app/cli/test_cmd_clusters.py index e835643fd94fa..3063ffe742cb1 100644 --- a/tests/tests_app/cli/test_cmd_clusters.py +++ b/tests/tests_app/cli/test_cmd_clusters.py @@ -4,16 +4,19 @@ import click import pytest from lightning_cloud.openapi import ( + Externalv1Cluster, V1AWSClusterDriverSpec, V1ClusterDriver, V1ClusterPerformanceProfile, V1ClusterSpec, + V1ClusterState, + V1ClusterStatus, V1ClusterType, V1CreateClusterRequest, V1InstanceSpec, V1KubernetesClusterDriver, + V1ListClustersResponse, ) -from lightning_cloud.openapi.models import Externalv1Cluster, V1ClusterState, V1ClusterStatus, V1ListClustersResponse from lightning_app.cli import cmd_clusters from lightning_app.cli.cmd_clusters import AWSClusterManager From 6585f5e9625254255ecf7807dcb60ef53dc4174b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 15 Sep 2022 15:15:08 +0200 Subject: [PATCH 160/193] Add code-owners for standalone Lite package (#14694) * Add Lite codeowners * remove Borda on request --- .github/CODEOWNERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 2460e2a71d761..086f9e6b09e49 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -56,12 +56,18 @@ /src/lightning_app/testing @tchaton @manskx /src/lightning_app/__about__.py @nohalon @edenlightning @lantiga +/src/lightning_lite @awaelchli @carmocca @justusschock + # Examples /examples/app_* @tchaton @awaelchli @manskx @hhsecond + # App tests /tests/tests_app @tchaton @awaelchli @manskx @hhsecond /tests/tests_app_examples @tchaton @awaelchli @manskx @hhsecond +# Lite tests +/tests/tests_lite @awaelchli @carmocca @justusschock + # Specifics /src/pytorch_lightning/trainer/connectors/logger_connector @tchaton @carmocca /src/pytorch_lightning/trainer/progress.py @tchaton 
@awaelchli @carmocca From d4afab2243fd95febbe1a06b8a9910ccb49ce19d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 15 Sep 2022 15:15:27 +0200 Subject: [PATCH 161/193] Remove mentions of @awaelchli from source code (#14425) * Remove a todo from trainer regarding exception handling * Remove mentions of TODO(@awaelchli) from code Co-authored-by: Jirka Borovec --- _notebooks | 2 +- src/pytorch_lightning/lite/wrappers.py | 2 +- src/pytorch_lightning/loops/epoch/training_epoch_loop.py | 2 +- src/pytorch_lightning/loops/fit_loop.py | 6 +++--- src/pytorch_lightning/strategies/strategy.py | 6 +++--- .../trainer/connectors/accelerator_connector.py | 6 +++--- src/pytorch_lightning/trainer/trainer.py | 2 +- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/_notebooks b/_notebooks index 6d5634b794218..8a36a41548f34 160000 --- a/_notebooks +++ b/_notebooks @@ -1 +1 @@ -Subproject commit 6d5634b7942180e6ba4a30bfbd74926d1c22f1eb +Subproject commit 8a36a41548f34c44ac455d515a72994487e85813 diff --git a/src/pytorch_lightning/lite/wrappers.py b/src/pytorch_lightning/lite/wrappers.py index 29a0c17341666..0c3924694cc06 100644 --- a/src/pytorch_lightning/lite/wrappers.py +++ b/src/pytorch_lightning/lite/wrappers.py @@ -103,7 +103,7 @@ def forward(self, *args: Any, **kwargs: Any) -> Any: 32: torch.float32, 64: torch.float64, } - # TODO (@awaelchli): let the precision plugin handle the conversion + # TODO: let the precision plugin handle the conversion to_type = precision_to_type[precision] def _convert_float_tensor(t: Tensor) -> Tensor: diff --git a/src/pytorch_lightning/loops/epoch/training_epoch_loop.py b/src/pytorch_lightning/loops/epoch/training_epoch_loop.py index a633a7edf6309..0877bb7347ff7 100644 --- a/src/pytorch_lightning/loops/epoch/training_epoch_loop.py +++ b/src/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -490,7 +490,7 @@ def _should_check_val_fx(self) -> bool: if self.trainer.should_stop: return True - # TODO(@awaelchli): let training/eval loop handle logic around limit_*_batches and val_check_batch + # TODO: let training/eval loop handle logic around limit_*_batches and val_check_batch is_val_check_batch = is_last_batch if isinstance(self.trainer.limit_train_batches, int) and is_infinite_dataset: is_val_check_batch = (self.batch_idx + 1) % self.trainer.limit_train_batches == 0 diff --git a/src/pytorch_lightning/loops/fit_loop.py b/src/pytorch_lightning/loops/fit_loop.py index e6dcc581e247d..48a5d1ef124e2 100644 --- a/src/pytorch_lightning/loops/fit_loop.py +++ b/src/pytorch_lightning/loops/fit_loop.py @@ -91,7 +91,7 @@ def min_steps(self) -> Optional[int]: @min_steps.setter def min_steps(self, value: Optional[int]) -> None: """Sets the minimum number of steps (forwards to epoch_loop)""" - # TODO(@awaelchli): This setter is required by debugging connector (fast dev run), should be avoided + # TODO: This setter is required by debugging connector (fast dev run), should be avoided self.epoch_loop.min_steps = value @property @@ -102,7 +102,7 @@ def max_steps(self) -> int: @max_steps.setter def max_steps(self, value: int) -> None: """Sets the maximum number of steps (forwards to epoch_loop)""" - # TODO(@awaelchli): This setter is required by debugging connector (fast dev run), should be avoided + # TODO: This setter is required by debugging connector (fast dev run), should be avoided if value < -1: raise MisconfigurationException( f"`max_steps` must be a non-negative integer or -1 (infinite steps). You passed in {value}." 
@@ -159,7 +159,7 @@ def done(self) -> bool: rank_zero_info("`Trainer.fit` stopped: No training batches.") return True - # TODO(@awaelchli): Move track steps inside training loop and move part of these condition inside training loop + # TODO: Move track steps inside training loop and move part of these condition inside training loop stop_steps = _is_max_limit_reached(self.epoch_loop.global_step, self.max_steps) if stop_steps: rank_zero_info(f"`Trainer.fit` stopped: `max_steps={self.max_steps!r}` reached.") diff --git a/src/pytorch_lightning/strategies/strategy.py b/src/pytorch_lightning/strategies/strategy.py index 2b85bbb88cc9d..508d5d41019bb 100644 --- a/src/pytorch_lightning/strategies/strategy.py +++ b/src/pytorch_lightning/strategies/strategy.py @@ -237,19 +237,19 @@ def _setup_model_and_optimizers(self, model: Module, optimizers: List[Optimizer] The returned objects are expected to be in the same order they were passed in. The default implementation will call :meth:`_setup_model` and :meth:`_setup_optimizer` on the inputs. """ - # TODO (@awaelchli): standardize this across all plugins in Lightning and Lite. Related refactor: #7324 + # TODO: standardize this across all plugins in Lightning and Lite. Related refactor: #7324 model = self._setup_model(model) optimizers = [self._setup_optimizer(optimizer) for optimizer in optimizers] return model, optimizers def _setup_model(self, model: Module) -> Module: """Performs setup for the model, e.g., by wrapping it by another class.""" - # TODO (@awaelchli): standardize this across all plugins in Lightning and Lite. Related refactor: #7324 + # TODO: standardize this across all plugins in Lightning and Lite. Related refactor: #7324 return model def _setup_optimizer(self, optimizer: Optimizer) -> Optimizer: """Performs setup for the optimizer, e.g., by wrapping it by another class.""" - # TODO (@awaelchli): standardize this across all plugins in Lightning and Lite. Related refactor: #7324 + # TODO: standardize this across all plugins in Lightning and Lite. 
Related refactor: #7324 return optimizer def batch_to_device(self, batch: Any, device: Optional[torch.device] = None, dataloader_idx: int = 0) -> Any: diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index c7432ff298c88..8e73d13458238 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -336,7 +336,7 @@ def _check_config_and_set_final_flags( # handle the case when the user passes in a strategy instance which has an accelerator, precision, # checkpoint io or cluster env set up - # TODO: @awaelchli improve the error messages below + # TODO: improve the error messages below if self._strategy_flag and isinstance(self._strategy_flag, Strategy): if self._strategy_flag._accelerator: if self._accelerator_flag: @@ -461,7 +461,7 @@ def _map_deprecated_devices_specific_info_to_accelerator_and_device_flag( deprecated_devices_specific_flag = num_processes or gpus or ipus or tpu_cores if deprecated_devices_specific_flag and deprecated_devices_specific_flag not in ([], 0, "0"): if devices: - # TODO: @awaelchli improve error message + # TODO improve error message rank_zero_warn( f"The flag `devices={devices}` will be ignored, " f"instead the device specific number {deprecated_devices_specific_flag} will be used" @@ -470,7 +470,7 @@ def _map_deprecated_devices_specific_info_to_accelerator_and_device_flag( if [(num_processes is not None), (gpus is not None), (ipus is not None), (tpu_cores is not None)].count( True ) > 1: - # TODO: @awaelchli improve error message + # TODO: improve error message rank_zero_warn("more than one device specific flag has been set") self._devices_flag = deprecated_devices_specific_flag diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index aeb36f52639ac..3789e7612aa14 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -637,7 +637,7 @@ def _call_and_handle_interrupt(self, trainer_fn: Callable, *args: Any, **kwargs: return self.strategy.launcher.launch(trainer_fn, *args, trainer=self, **kwargs) else: return trainer_fn(*args, **kwargs) - # TODO(awaelchli): Unify both exceptions below, where `KeyboardError` doesn't re-raise + # TODO: Unify both exceptions below, where `KeyboardError` doesn't re-raise except KeyboardInterrupt as exception: rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...") # user could press Ctrl+c many times... only shutdown once From 71719b9960403322cdb43aa2dc13a2e62d840be9 Mon Sep 17 00:00:00 2001 From: Alec Merdler Date: Thu, 15 Sep 2022 06:15:59 -0700 Subject: [PATCH 162/193] Support Injecting Secrets into Apps Running in the Cloud (#14612) Adds a new '--secret' flag to 'lightning run app': lightning run app --cloud --secret MY_SECRET=my-secret-name app.py When the Lightning App runs in the cloud, the 'MY_SECRET' environment variable will be populated with the value of the referenced Secret. The value of the Secret is encrypted in the database, and will only be decrypted and accessible to the Flow/Work processes in the cloud. 
Co-authored-by: Sherin Thomas Co-authored-by: Noha Alon Co-authored-by: thomas chaton --- .../glossary/environment_variables.rst | 8 +-- docs/source-app/glossary/secrets.rst | 57 +++++++++++++++++++ docs/source-app/index.rst | 1 + docs/source-lit/glossary/secrets.rst | 1 + docs/source-lit/index.rst | 1 + .../commands/notebook/run.py | 1 + src/lightning_app/CHANGELOG.md | 1 + src/lightning_app/cli/lightning_cli.py | 17 +++++- src/lightning_app/runners/cloud.py | 11 +++- src/lightning_app/runners/runtime.py | 16 +++++- src/lightning_app/utilities/secrets.py | 26 +++++++++ tests/tests_app/cli/test_run_app.py | 25 +++++++- tests/tests_app/utilities/test_secrets.py | 49 ++++++++++++++++ 13 files changed, 202 insertions(+), 12 deletions(-) create mode 100644 docs/source-app/glossary/secrets.rst create mode 120000 docs/source-lit/glossary/secrets.rst create mode 120000 docs/source-lit/workflows/build_command_line_interface/commands/notebook/run.py create mode 100644 src/lightning_app/utilities/secrets.py create mode 100644 tests/tests_app/utilities/test_secrets.py diff --git a/docs/source-app/glossary/environment_variables.rst b/docs/source-app/glossary/environment_variables.rst index fd41594656b0f..10c3e9ae2efac 100644 --- a/docs/source-app/glossary/environment_variables.rst +++ b/docs/source-app/glossary/environment_variables.rst @@ -4,9 +4,9 @@ Environment Variables ********************* -If your app is using secrets or values you don't want to expose in your app code such as API keys or access tokens, you can use environment variables. +If your App is using configuration values you don't want to commit with your App source code, you can use environment variables. -Lightning allows you to set environment variables when running the app from the CLI with the `lightning run app` command. You can use environment variables to pass any value such as API keys or other similar configurations to the app, avoiding having to stick them in the source code. +Lightning allows you to set environment variables when running the App from the CLI with the `lightning run app` command. You can use environment variables to pass any values to the App, and avoiding sticking those values in the source code. Set one or multiple variables using the **--env** option: @@ -14,7 +14,7 @@ Set one or multiple variables using the **--env** option: lightning run app app.py --cloud --env FOO=BAR --env BAZ=FAZ -The environment variables are available in all flows and works, and can be accessed as follows: +Environment variables are available in all Flows and Works, and can be accessed as follows: .. code:: python @@ -24,4 +24,4 @@ The environment variables are available in all flows and works, and can be acces print(os.environ["BAZ"]) # FAZ .. note:: - Environment variables are currently not encrypted. + Environment variables are not encrypted. For sensitive values, we recommend using :ref:`Encrypted Secrets `. diff --git a/docs/source-app/glossary/secrets.rst b/docs/source-app/glossary/secrets.rst new file mode 100644 index 0000000000000..9e1191beca6e5 --- /dev/null +++ b/docs/source-app/glossary/secrets.rst @@ -0,0 +1,57 @@ +.. _secrets: + +################# +Encrypted Secrets +################# + +We understand that many Apps require access to private data like API keys, access tokens, database passwords, or other credentials. And that you need to protect this data. + +Secrets provie a secure way to make private data like API keys or passwords accessible to your app, without hardcoding. 
You can use secrets to authenticate third-party services/solutions.
+
+.. tip::
+   For non-sensitive configuration values, we recommend using :ref:`plain-text Environment Variables `.
+
+*******************
+Overview of Secrets
+*******************
+
+The ``--secret`` option has been added to the **lightning run app** command. ``--secret`` can be used by itself or alongside ``--env``.
+
+When a Lightning App (App) **runs in the cloud**, the Secret can be exposed to the App using environment variables.
+The value of the Secret is encrypted in the Lightning.ai database, and is only decrypted and accessible to
+LightningFlow (Flow) or LightningWork (Work) processes in the cloud (when you use the ``--cloud`` option running your App).
+
+----
+
+*********************
+Use Encrypted Secrets
+*********************
+
+First, a Secret must be created using the admin web UI. Once you create a Secret, you can bind it to any of your Apps. You do not need to create a new Secret for each App if the Secret value is the same.
+
+.. note::
+   Secret names must start with a letter and can only contain letters, numbers, dashes, and periods. The Secret names must comply with `RFC1123 naming conventions `_. The Secret value has no restrictions.
+
+In the example below, we already used the admin UI to create a Secret named ``my-secret`` with the value ``some-value`` and will bind it to the environment variable ``MY_APP_SECRET`` within our App. The binding is accomplished by using the ``--secret`` option when running the App from the Lightning CLI.
+
+The ``--secret`` option works similarly to ``--env``, but instead of providing a value, you provide the name of the Secret, which will be replaced with the value that you want to bind to the environment variable:
+
+.. 
code:: bash + + lightning run app app.py --cloud --env FOO=bar --secret MY_APP_SECRET=my-secret --secret ANOTHER_SECRET=another-secret diff --git a/docs/source-app/index.rst b/docs/source-app/index.rst index e9381860eae9a..55b1ac79d06c4 100644 --- a/docs/source-app/index.rst +++ b/docs/source-app/index.rst @@ -270,6 +270,7 @@ Keep Learning DAG Event Loop Environment Variables + Encrypted Secrets Frontend Apple and Android mobile devices with Lighting Apps REST API diff --git a/docs/source-lit/glossary/secrets.rst b/docs/source-lit/glossary/secrets.rst new file mode 120000 index 0000000000000..7f2765c434c2a --- /dev/null +++ b/docs/source-lit/glossary/secrets.rst @@ -0,0 +1 @@ +../../source-app/glossary/secrets.rst \ No newline at end of file diff --git a/docs/source-lit/index.rst b/docs/source-lit/index.rst index d62cc72d1a836..48fa845f8daf8 100644 --- a/docs/source-lit/index.rst +++ b/docs/source-lit/index.rst @@ -113,6 +113,7 @@ Welcome to ⚡ Lightning Apps DAG Event Loop Environment Variables + Encrypted Secrets Frontend Sharing Components Scheduling diff --git a/docs/source-lit/workflows/build_command_line_interface/commands/notebook/run.py b/docs/source-lit/workflows/build_command_line_interface/commands/notebook/run.py new file mode 120000 index 0000000000000..76cf1a9718e1a --- /dev/null +++ b/docs/source-lit/workflows/build_command_line_interface/commands/notebook/run.py @@ -0,0 +1 @@ +../../../../../source-app/workflows/build_command_line_interface/commands/notebook/run.py \ No newline at end of file diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index ae80e00e7c98a..5b05863b15fbf 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -10,6 +10,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added - Add `load_state_dict` and `state_dict` ([#14100](https://github.com/Lightning-AI/lightning/pull/14100)) +- Add `--secret` option to CLI to allow binding Secrets to app environment variables when running in the cloud ([#14612](https://github.com/Lightning-AI/lightning/pull/14612)) ### Changed diff --git a/src/lightning_app/cli/lightning_cli.py b/src/lightning_app/cli/lightning_cli.py index 51fc08e5fd49a..4cd38f4cdbf4b 100644 --- a/src/lightning_app/cli/lightning_cli.py +++ b/src/lightning_app/cli/lightning_cli.py @@ -305,6 +305,7 @@ def _run_app( blocking: bool, open_ui: bool, env: tuple, + secret: tuple, ): file = _prepare_file(file) @@ -320,10 +321,17 @@ def _run_app( "Caching is a property of apps running in cloud. " "Using the flag --no-cache in local execution is not supported." ) + if secret: + raise click.ClickException( + "Secrets can only be used for apps running in cloud. " + "Using the option --secret in local execution is not supported." 
+ ) env_vars = _format_input_env_variables(env) os.environ.update(env_vars) + secrets = _format_input_env_variables(secret) + def on_before_run(*args, **kwargs): if open_ui and not without_server: click.launch(get_app_url(runtime_type, *args, **kwargs)) @@ -342,6 +350,7 @@ def on_before_run(*args, **kwargs): on_before_run=on_before_run, name=name, env_vars=env_vars, + secrets=secrets, cluster_id=cluster_id, ) if runtime_type == RuntimeType.CLOUD: @@ -377,7 +386,8 @@ def run(): default=True, help="Decide whether to launch the app UI in a web browser", ) -@click.option("--env", type=str, default=[], multiple=True, help="Env variables to be set for the app.") +@click.option("--env", type=str, default=[], multiple=True, help="Environment variables to be set for the app.") +@click.option("--secret", type=str, default=[], multiple=True, help="Secret variables to be set for the app.") @click.option("--app_args", type=str, default=[], multiple=True, help="Collection of arguments for the app.") def run_app( file: str, @@ -389,10 +399,11 @@ def run_app( blocking: bool, open_ui: bool, env: tuple, - app_args: List[str], + secret: tuple, + app_args: tuple, ): """Run an app from a file.""" - _run_app(file, cloud, cluster_id, without_server, no_cache, name, blocking, open_ui, env) + _run_app(file, cloud, cluster_id, without_server, no_cache, name, blocking, open_ui, env, secret) @_main.group(hidden=True) diff --git a/src/lightning_app/runners/cloud.py b/src/lightning_app/runners/cloud.py index 949665de8d28b..b873f6db48ab6 100644 --- a/src/lightning_app/runners/cloud.py +++ b/src/lightning_app/runners/cloud.py @@ -50,6 +50,7 @@ from lightning_app.utilities.dependency_caching import get_hash from lightning_app.utilities.packaging.app_config import AppConfig, find_config_file from lightning_app.utilities.packaging.lightning_utils import _prepare_lightning_wheels_and_requirements +from lightning_app.utilities.secrets import _names_to_ids logger = Logger(__name__) @@ -98,8 +99,16 @@ def dispatch( print(f"The name of the app is: {app_config.name}") - work_reqs: List[V1Work] = [] v1_env_vars = [V1EnvVar(name=k, value=v) for k, v in self.env_vars.items()] + + if len(self.secrets.values()) > 0: + secret_names_to_ids = _names_to_ids(self.secrets.values()) + env_vars_from_secrets = [ + V1EnvVar(name=k, from_secret=secret_names_to_ids[v]) for k, v in self.secrets.items() + ] + v1_env_vars.extend(env_vars_from_secrets) + + work_reqs: List[V1Work] = [] for flow in self.app.flows: for work in flow.works(recurse=False): work_requirements = "\n".join(work.cloud_build_config.requirements) diff --git a/src/lightning_app/runners/runtime.py b/src/lightning_app/runners/runtime.py index 59387238843cf..64d5c1050214b 100644 --- a/src/lightning_app/runners/runtime.py +++ b/src/lightning_app/runners/runtime.py @@ -27,7 +27,8 @@ def dispatch( blocking: bool = True, on_before_run: Optional[Callable] = None, name: str = "", - env_vars: Dict[str, str] = {}, + env_vars: Dict[str, str] = None, + secrets: Dict[str, str] = None, cluster_id: str = None, ) -> Optional[Any]: """Bootstrap and dispatch the application to the target. @@ -43,6 +44,7 @@ def dispatch( on_before_run: Callable to be executed before run. name: Name of app execution env_vars: Dict of env variables to be set on the app + secrets: Dict of secrets to be passed as environment variables to the app cluster_id: the Lightning AI cluster to run the app on. 
Defaults to managed Lightning AI cloud """ from lightning_app.runners.runtime_type import RuntimeType @@ -54,11 +56,20 @@ def dispatch( runtime_cls: Type[Runtime] = runtime_type.get_runtime() app = load_app_from_file(str(entrypoint_file)) + env_vars = {} if env_vars is None else env_vars + secrets = {} if secrets is None else secrets + if blocking: app.stage = AppStage.BLOCKING runtime = runtime_cls( - app=app, entrypoint_file=entrypoint_file, start_server=start_server, host=host, port=port, env_vars=env_vars + app=app, + entrypoint_file=entrypoint_file, + start_server=start_server, + host=host, + port=port, + env_vars=env_vars, + secrets=secrets, ) # a cloud dispatcher will return the result while local # dispatchers will be running the app in the main process @@ -78,6 +89,7 @@ class Runtime: done: bool = False backend: Optional[Union[str, Backend]] = "multiprocessing" env_vars: Dict[str, str] = field(default_factory=dict) + secrets: Dict[str, str] = field(default_factory=dict) def __post_init__(self): if isinstance(self.backend, str): diff --git a/src/lightning_app/utilities/secrets.py b/src/lightning_app/utilities/secrets.py new file mode 100644 index 0000000000000..28d4d62fc5074 --- /dev/null +++ b/src/lightning_app/utilities/secrets.py @@ -0,0 +1,26 @@ +from typing import Dict, List + +from lightning_app.utilities.cloud import _get_project +from lightning_app.utilities.network import LightningClient + + +def _names_to_ids(secret_names: List[str]) -> Dict[str, str]: + """Returns the name/ID pair for each given Secret name. + + Raises a `ValueError` if any of the given Secret names do not exist. + """ + lightning_client = LightningClient() + + project = _get_project(lightning_client) + secrets = lightning_client.secret_service_list_secrets(project_id=project.project_id) + + secret_names_to_ids: Dict[str, str] = {} + for secret in secrets.secrets: + if secret.name in secret_names: + secret_names_to_ids[secret.name] = secret.id + + for secret_name in secret_names: + if secret_name not in secret_names_to_ids.keys(): + raise ValueError(f"Secret with name '{secret_name}' not found") + + return secret_names_to_ids diff --git a/tests/tests_app/cli/test_run_app.py b/tests/tests_app/cli/test_run_app.py index 221e3e2ab3a6f..014c3a70ec096 100644 --- a/tests/tests_app/cli/test_run_app.py +++ b/tests/tests_app/cli/test_run_app.py @@ -33,7 +33,6 @@ def _lightning_app_run_and_logging(self, *args, **kwargs): with caplog.at_level(logging.INFO): with mock.patch("lightning_app.LightningApp._run", _lightning_app_run_and_logging): - runner = CliRunner() result = runner.invoke( run_app, @@ -70,6 +69,7 @@ def test_lightning_run_cluster_without_cloud(monkeypatch): open_ui=False, no_cache=True, env=("FOO=bar",), + secret=(), ) @@ -80,7 +80,7 @@ def test_lightning_run_app_cloud(mock_dispatch: mock.MagicMock, open_ui, caplog, """This test validates the command has ran properly when --cloud argument is passed. It tests it by checking if the click.launch is called with the right url if --open-ui was true and also checks the - call to `dispatch` for the right arguments + call to `dispatch` for the right arguments. """ monkeypatch.setattr("lightning_app.runners.cloud.logger", logging.getLogger()) @@ -95,6 +95,7 @@ def test_lightning_run_app_cloud(mock_dispatch: mock.MagicMock, open_ui, caplog, open_ui=open_ui, no_cache=True, env=("FOO=bar",), + secret=("BAR=my-secret",), ) # capture logs. 
# TODO(yurij): refactor the test, check if the actual HTTP request is being sent and that the proper admin @@ -108,5 +109,25 @@ def test_lightning_run_app_cloud(mock_dispatch: mock.MagicMock, open_ui, caplog, name="", no_cache=True, env_vars={"FOO": "bar"}, + secrets={"BAR": "my-secret"}, cluster_id="", ) + + +def test_lightning_run_app_secrets(monkeypatch): + """Validates that running apps only supports the `--secrets` argument if the `--cloud` argument is passed.""" + monkeypatch.setattr("lightning_app.runners.cloud.logger", logging.getLogger()) + + with pytest.raises(click.exceptions.ClickException): + _run_app( + file=os.path.join(_PROJECT_ROOT, "tests/tests_app/core/scripts/app_metadata.py"), + cloud=False, + cluster_id="test-cluster", + without_server=False, + name="", + blocking=False, + open_ui=False, + no_cache=True, + env=(), + secret=("FOO=my-secret"), + ) diff --git a/tests/tests_app/utilities/test_secrets.py b/tests/tests_app/utilities/test_secrets.py new file mode 100644 index 0000000000000..2ab17689f6c2c --- /dev/null +++ b/tests/tests_app/utilities/test_secrets.py @@ -0,0 +1,49 @@ +from typing import Dict, List +from unittest import mock +from unittest.mock import MagicMock + +import pytest +from lightning_cloud.openapi import V1ListMembershipsResponse, V1ListSecretsResponse, V1Membership, V1Secret + +from lightning_app.utilities.secrets import _names_to_ids + + +@pytest.mark.parametrize( + "secret_names, secrets, expected, expected_exception", + [ + ([], [], {}, False), + ( + ["first-secret", "second-secret"], + [ + V1Secret(name="first-secret", id="1234"), + ], + {}, + True, + ), + ( + ["first-secret", "second-secret"], + [V1Secret(name="first-secret", id="1234"), V1Secret(name="second-secret", id="5678")], + {"first-secret": "1234", "second-secret": "5678"}, + False, + ), + ], +) +@mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) +@mock.patch("lightning_app.utilities.network.LightningClient.secret_service_list_secrets") +@mock.patch("lightning_app.utilities.network.LightningClient.projects_service_list_memberships") +def test_names_to_ids( + list_memberships: MagicMock, + list_secrets: MagicMock, + secret_names: List[str], + secrets: List[V1Secret], + expected: Dict[str, str], + expected_exception: bool, +): + list_memberships.return_value = V1ListMembershipsResponse(memberships=[V1Membership(project_id="default-project")]) + list_secrets.return_value = V1ListSecretsResponse(secrets=secrets) + + if expected_exception: + with pytest.raises(ValueError): + _names_to_ids(secret_names) + else: + assert _names_to_ids(secret_names) == expected From 3c5e03e0357605b8cbc6c935165d7c8d3bc66b97 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 15 Sep 2022 22:16:14 +0900 Subject: [PATCH 163/193] docs: Clarify versioning and API stability (#14549) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * mv releases to a standalone page * Include release_policy in index * Update policy * mv releases to a standalone page * Include release_policy in index * Update policy * Update title * remove release_policy.rst * Update versioning * syntax * simplify wording * Include examples that don't follow X+2 rule * syntax * update * consistency * rm noninformative statement * . * Reduce redundancy in the deprecation process * grammar? 
* consistency * Update docs/source-pytorch/versioning.rst Co-authored-by: Adrian Wälchli Co-authored-by: Jirka Borovec Co-authored-by: Adrian Wälchli --- docs/source-pytorch/governance.rst | 32 --------------- docs/source-pytorch/index.rst | 1 + docs/source-pytorch/versioning.rst | 66 ++++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+), 32 deletions(-) create mode 100644 docs/source-pytorch/versioning.rst diff --git a/docs/source-pytorch/governance.rst b/docs/source-pytorch/governance.rst index b9673a6985031..f884b63f09d24 100644 --- a/docs/source-pytorch/governance.rst +++ b/docs/source-pytorch/governance.rst @@ -57,15 +57,6 @@ Alumni - Peter Yu (`yukw777 `_) - Roger Shieh (`s-rog `_) - -Releases -******** - -We release a new minor version (e.g., 1.5.0) every three months and bugfix releases every week. -The minor versions contain new features, API changes, deprecations, removals, potential backward-incompatible -changes and also all previous bugfixes included in any bugfix release. With every release, we publish a changelog -where we list additions, removals, changed functionality and fixes. - Project Management and Decision Making ************************************** @@ -80,26 +71,3 @@ review them. However, reviews submitted by `code owners `_ have higher weight and it is necessary to get the approval of code owners before a pull request can be merged. Additional requirements may apply case by case. - -API Evolution -************* - -Lightning's development is driven by research and best practices in a rapidly developing field of AI and machine -learning. Change is inevitable and when it happens, the Lightning team is committed to minimizing user friction and -maximizing ease of transition from one version to the next. We take backward compatibility and reproducibility very -seriously. - -For API removal, renaming or other forms of backward-incompatible changes, the procedure is: - -#. A deprecation process is initiated at version X, producing warning messages at runtime and in the documentation. -#. Calls to the deprecated API remain unchanged in their function during the deprecation phase. -#. Two minor versions in the future at version X+2 the breaking change takes effect. - -The "X+2" rule is a recommendation and not a strict requirement. Longer deprecation cycles may apply for some cases. - -New API and features are declared as: - -- *Experimental*: Anything labelled as *experimental* or *beta* in the documentation is considered unstable and should - not be used in production. The community is encouraged to test the feature and report issues directly on GitHub. -- *Stable*: Everything not specifically labelled as experimental should be considered stable. Reported issues will be - treated with priority. diff --git a/docs/source-pytorch/index.rst b/docs/source-pytorch/index.rst index 73f5291d6b56b..215199ec224dc 100644 --- a/docs/source-pytorch/index.rst +++ b/docs/source-pytorch/index.rst @@ -287,6 +287,7 @@ Current Lightning Users generated/CONTRIBUTING.md generated/BECOMING_A_CORE_CONTRIBUTOR.md governance + versioning generated/CHANGELOG.md .. raw:: html diff --git a/docs/source-pytorch/versioning.rst b/docs/source-pytorch/versioning.rst new file mode 100644 index 0000000000000..b3491dd0d1de7 --- /dev/null +++ b/docs/source-pytorch/versioning.rst @@ -0,0 +1,66 @@ +.. _versioning: + +Versioning Policy +################# + +PyTorch Lightning follows its own versioning policy but not `semantic versioning (SemVer) `_. 
+ +Versioning +********** + +A Lightning release number is in the format of ``MAJOR.MINOR.PATCH``. + +- A patch release contains only bug fixes. Since it introduces no breaking changes, we recommend users always update the package to the latest version within the minor version whenever possible. +- A minor release, unlike SemVer, contains backwards-incompatible changes, such as API changes and removals, as well as new features, deprecations and all bugfixes since the last release. + +With every release, we publish a changelog where we list additions, removals, deprecations, changed functionality and fixes. + +API Stability +************* + +In Lightning, all APIs and features are marked as either stable or experimental. + +Experimental API +---------------- + +Experimental APIs are labelled as experimental or beta in the documentation and/or in the release notes. They are considered unstable and should not be used in production. + +For experimental features, any of the following may be true: + +- The feature has unstable dependencies. +- The API may change without notice in future versions. +- The performance of the feature has not been verified. +- The docs for this feature are under active development. + +Stable API +---------- + +Everything not specifically labelled as experimental is stable. + +For stable APIs, all of the following are true: + +- The API is not expected to change. +- If anything does change, we show a deprecation warning before applying the breaking change, following the rule described below. + +API Evolution +************* + +Lightning's development is driven by research and best practices in a rapidly developing field of AI and machine learning. Change is inevitable, and when it happens, the Lightning team is committed to minimizing user friction and maximizing ease of transition from one version to the next. We take backwards compatibility and reproducibility very seriously. + +For API removal, renaming or other forms of backwards-incompatible changes, the procedure is: + +#. A deprecation process is initiated at a minor version ``X``, producing a deprecation warning at runtime and in the documentation. +#. The deprecated API remains unchanged during the deprecation phase for two minor versions. +#. The breaking change takes effect at a minor version ``X+2``. + +The ``X+2`` rule is a recommendation and not a strict requirement. Shorter or longer deprecation cycles may apply in some cases. In the past, DDP2 was removed without a deprecation process because the feature was broken and unusable beyond fixing, as discussed in `#12584 `_. Also, `#10410 `_ is an example where a longer deprecation cycle was applied. We deprecated the accelerator arguments, such as ``Trainer(gpus=...)``, in 1.7; however, because the APIs were so core that they would impact almost all use cases, we decided not to introduce the breaking change until 2.0. + +Python Support +************** + +PyTorch Lightning follows `NEP 29 `_, which PyTorch also follows (`#74203 `_). + +PyTorch Support +*************** + +PyTorch Lightning supports the latest four minor versions of PyTorch at the time of release. For example, PyTorch Lightning 1.7 supports PyTorch 1.9, 1.10, 1.11 and 1.12.
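The deprecation procedure described in the versioning policy above can be illustrated with a minimal sketch. The function and argument names below are hypothetical (this is not Lightning's internal implementation), and plain Python ``warnings`` stand in for Lightning's own logging utilities; the point is only that during the deprecation phase the old argument keeps working while emitting a warning, until its removal at ``X+2``:

.. code:: python

    import warnings


    def configure_devices(gpus=None, accelerator=None, devices=None):
        """Resolve device flags while keeping the deprecated ``gpus`` argument functional."""
        if gpus is not None:
            # Deprecation phase (version X): warn, but preserve the old behaviour unchanged.
            warnings.warn(
                "`gpus=...` is deprecated; use `accelerator='gpu', devices=...` instead."
                " Support is planned to be removed two minor versions later (the X+2 rule).",
                DeprecationWarning,
                stacklevel=2,
            )
            accelerator, devices = "gpu", gpus
        return accelerator, devices


    # Old call sites keep working during the deprecation phase, but now emit a warning:
    assert configure_devices(gpus=2) == ("gpu", 2)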
From 5ff78f0753c6ac030b344fd985cf21ec8dc3b47f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 15 Sep 2022 15:58:12 +0200 Subject: [PATCH 164/193] Use the setter in the children recursively (#14724) --- src/pytorch_lightning/CHANGELOG.md | 21 ++++++++----------- src/pytorch_lightning/core/module.py | 2 +- .../tests_pytorch/models/test_torchscript.py | 10 ++++++++- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 6c28bb3a92efa..5b433e265f018 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -72,10 +72,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - In Lightning Lite, state-dict access to the module wrapper now gets passed through to the original module reference ([#14629](https://github.com/Lightning-AI/lightning/pull/14629)) -- Improved the error messaging when passing `Trainer.method(model, x_dataloader=None)` with no module-method implementations available ([#14614](https://github.com/Lightning-AI/lightning/pull/14614)) - - - ### Deprecated - Deprecated `LightningDeepSpeedModule` ([#14000](https://github.com/Lightning-AI/lightning/pull/14000)) @@ -175,25 +171,26 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed -- Reset the dataloaders on OOM failure in batch size finder to use the last successful batch size ([#14372](https://github.com/Lightning-AI/lightning/pull/14372)) +- Break HPU Graphs into two parts (forward + backward as one and optimizer as another) for better performance ([#14656](https://github.com/Lightning-AI/lightning/pull/14656)) -- Fixed an issue to keep downscaling the batch size in case there hasn't been even a single successful optimal batch size with `mode="power"` ([#14372](https://github.com/Lightning-AI/lightning/pull/14372)) +- Fixed torchscript error with ensembles of LightningModules ([#14657](https://github.com/Lightning-AI/lightning/pull/14657), [#14724](https://github.com/Lightning-AI/lightning/pull/14724)) -- Fixed an issue where `self.log`-ing a tensor would create a user warning from PyTorch about cloning tensors ([#14599](https://github.com/Lightning-AI/lightning/pull/14599)) +## [1.7.6] - 2022-09-13 +### Changed -- Break HPU Graphs into two parts (forward + backward as one and optimizer as another) for better performance ([#14656](https://github.com/Lightning-AI/lightning/pull/14656)) +- Improved the error messaging when passing `Trainer.method(model, x_dataloader=None)` with no module-method implementations available ([#14614](https://github.com/Lightning-AI/lightning/pull/14614)) +### Fixed +- Reset the dataloaders on OOM failure in batch size finder to use the last successful batch size ([#14372](https://github.com/Lightning-AI/lightning/pull/14372)) +- Fixed an issue to keep downscaling the batch size in case there hasn't been even a single successful optimal batch size with `mode="power"` ([#14372](https://github.com/Lightning-AI/lightning/pull/14372)) +- Fixed an issue where `self.log`-ing a tensor would create a user warning from PyTorch about cloning tensors ([#14599](https://github.com/Lightning-AI/lightning/pull/14599)) - Fixed compatibility when `torch.distributed` is not available ([#14454](https://github.com/Lightning-AI/lightning/pull/14454)) -- Fixed torchscript error with ensembles of LightningModules ([#14657](https://github.com/Lightning-AI/lightning/pull/14657)) - - - ## [1.7.5] - 2022-09-06 ### 
Fixed diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py index 95aad6a2f8c94..54a8fd64cfc74 100644 --- a/src/pytorch_lightning/core/module.py +++ b/src/pytorch_lightning/core/module.py @@ -281,7 +281,7 @@ def _running_torchscript(self) -> bool: def _running_torchscript(self, value: bool) -> None: for v in self.children(): if isinstance(v, LightningModule): - v._running_torchscript_internal = value + v._running_torchscript = value self._running_torchscript_internal = value def _call_batch_hook(self, hook_name: str, *args: Any) -> Any: diff --git a/tests/tests_pytorch/models/test_torchscript.py b/tests/tests_pytorch/models/test_torchscript.py index a7a1006542b0a..857307a0f5977 100644 --- a/tests/tests_pytorch/models/test_torchscript.py +++ b/tests/tests_pytorch/models/test_torchscript.py @@ -174,7 +174,7 @@ def test_torchscript_with_no_input(tmpdir): def test_torchscript_script_recursively(): - class Child(LightningModule): + class GrandChild(LightningModule): def __init__(self): super().__init__() self.model = torch.nn.Linear(1, 1) @@ -182,6 +182,14 @@ def __init__(self): def forward(self, inputs): return self.model(inputs) + class Child(LightningModule): + def __init__(self): + super().__init__() + self.model = GrandChild() + + def forward(self, inputs): + return self.model(inputs) + class Parent(LightningModule): def __init__(self): super().__init__() From 38d89713a5d74ff51222b140c133407d6814d448 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 15 Sep 2022 16:14:51 +0200 Subject: [PATCH 165/193] Standalone Lite: Connector (#14692) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí Co-authored-by: Jirka Borovec Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> --- src/lightning_lite/connector.py | 602 ++++++++++++++++++++++++ tests/tests_lite/test_connector.py | 720 +++++++++++++++++++++++++++++ 2 files changed, 1322 insertions(+) create mode 100644 src/lightning_lite/connector.py create mode 100644 tests/tests_lite/test_connector.py diff --git a/src/lightning_lite/connector.py b/src/lightning_lite/connector.py new file mode 100644 index 0000000000000..4d512933d029a --- /dev/null +++ b/src/lightning_lite/connector.py @@ -0,0 +1,602 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from collections import Counter +from typing import Dict, List, Optional, Union + +import torch + +from lightning_lite.accelerators import ACCELERATOR_REGISTRY +from lightning_lite.accelerators.accelerator import Accelerator +from lightning_lite.accelerators.cuda import CUDAAccelerator +from lightning_lite.accelerators.mps import MPSAccelerator +from lightning_lite.accelerators.tpu import TPUAccelerator +from lightning_lite.plugins import ( + CheckpointIO, + DeepSpeedPrecision, + NativeMixedPrecision, + Precision, + TPUBf16Precision, + TPUPrecision, +) +from lightning_lite.plugins.environments import ( + ClusterEnvironment, + KubeflowEnvironment, + LightningEnvironment, + LSFEnvironment, + SLURMEnvironment, + TorchElasticEnvironment, +) +from lightning_lite.plugins.precision.double import DoublePrecision +from lightning_lite.strategies import ( + DDPShardedStrategy, + DDPSpawnShardedStrategy, + DDPSpawnStrategy, + DDPStrategy, + DeepSpeedStrategy, + SingleDeviceStrategy, + SingleTPUStrategy, + Strategy, + STRATEGY_REGISTRY, + XLAStrategy, +) +from lightning_lite.strategies.ddp_spawn import _DDP_FORK_ALIASES +from lightning_lite.utilities import _StrategyType, device_parser, rank_zero_deprecation, rank_zero_info, rank_zero_warn +from lightning_lite.utilities.imports import _HPU_AVAILABLE, _IPU_AVAILABLE, _IS_INTERACTIVE, _TPU_AVAILABLE + +_PLUGIN = Union[Strategy, Precision, ClusterEnvironment, CheckpointIO] +_PLUGIN_INPUT = Union[_PLUGIN, str] + + +class _Connector: + """The Connector parses several Lite arguments and instantiates the Strategy including other components such as + the Accelerator and Precision plugins. + + A. accelerator flag could be: + 1. accelerator class + 2. accelerator str + 3. accelerator auto + + B. strategy flag could be: + 1. strategy class + 2. strategy str registered with STRATEGY_REGISTRY + 3. strategy str in _strategy_type enum which listed in each strategy as + backend (registed these too, and _strategy_type could be deprecated) + + C. plugins flag could be: + 1. List of str, which could contain: + i. precision str (Not supported in the old accelerator_connector version) + ii. checkpoint_io str (Not supported in the old accelerator_connector version) + iii. cluster_environment str (Not supported in the old accelerator_connector version) + 2. List of class, which could contains: + i. precision class (should be removed, and precision flag should allow user pass classes) + ii. checkpoint_io class + iii. cluster_environment class + + + priorities which to take when: + A. Class > str + B. Strategy > Accelerator/precision/plugins + """ + + def __init__( + self, + accelerator: Optional[Union[str, Accelerator]] = None, + strategy: Optional[Union[str, Strategy]] = None, + devices: Optional[Union[List[int], str, int]] = None, + num_nodes: int = 1, + precision: Union[int, str] = 32, + plugins: Optional[Union[_PLUGIN_INPUT, List[_PLUGIN_INPUT]]] = None, + tpu_cores: Optional[Union[List[int], str, int]] = None, # deprecated + gpus: Optional[Union[List[int], str, int]] = None, # deprecated + ) -> None: + # 1. 
Parsing flags + # Get registered strategies, built-in accelerators and precision plugins + self._registered_strategies = STRATEGY_REGISTRY.available_strategies() + self._registered_accelerators = ACCELERATOR_REGISTRY.available_accelerators() + self._precision_types = ("16", "32", "64", "bf16", "mixed") + + # Raise an exception if there are conflicts between flags + # Set each valid flag to `self._x_flag` after validation + # For devices: Assign gpus, ipus, etc. to the accelerator flag and devices flag + self._strategy_flag: Optional[Union[Strategy, str]] = None + self._accelerator_flag: Optional[Union[Accelerator, str]] = None + self._precision_flag: Optional[Union[int, str]] = None + self._precision_plugin_flag: Optional[Precision] = None + self._cluster_environment_flag: Optional[Union[ClusterEnvironment, str]] = None + self._parallel_devices: List[Union[int, torch.device, str]] = [] + self.checkpoint_io: Optional[CheckpointIO] = None + + self._check_config_and_set_final_flags( + strategy=strategy, + accelerator=accelerator, + precision=precision, + plugins=plugins, + ) + self._check_device_config_and_set_final_flags( + devices=devices, num_nodes=num_nodes, gpus=gpus, tpu_cores=tpu_cores + ) + + # 2. Instantiate Accelerator + # handle `auto`, `None` and `gpu` + if self._accelerator_flag == "auto" or self._accelerator_flag is None: + self._accelerator_flag = self._choose_auto_accelerator() + elif self._accelerator_flag == "gpu": + self._accelerator_flag = self._choose_gpu_accelerator_backend() + + self._set_parallel_devices_and_init_accelerator() + + # 3. Instantiate ClusterEnvironment + self.cluster_environment: ClusterEnvironment = self._choose_and_init_cluster_environment() + + # 4. Instantiate Strategy - Part 1 + if self._strategy_flag is None: + self._strategy_flag = self._choose_strategy() + # In specific cases, ignore user selection and fall back to a different strategy + self._check_strategy_and_fallback() + self._init_strategy() + + # 5. Instantiate Precision Plugin + self.precision_plugin = self._check_and_init_precision() + + # 6. Instantiate Strategy - Part 2 + self._lazy_init_strategy() + + def _check_config_and_set_final_flags( + self, + strategy: Optional[Union[str, Strategy]], + accelerator: Optional[Union[str, Accelerator]], + precision: Union[int, str], + plugins: Optional[Union[_PLUGIN_INPUT, List[_PLUGIN_INPUT]]], + ) -> None: + """This method checks: + + 1. strategy: whether the strategy name is valid, and sets the internal flags if it is. + 2. accelerator: if the value of the accelerator argument is a type of accelerator (instance or string), + set self._accelerator_flag accordingly. + 3. precision: The final value of the precision flag may be determined either by the precision argument or + by a plugin instance. + 4. plugins: The list of plugins may contain a Precision plugin, CheckpointIO, ClusterEnvironment and others. + Additionally, other flags such as `precision` can populate the list with the + corresponding plugin instances. + """ + if plugins is not None: + plugins = [plugins] if not isinstance(plugins, list) else plugins + + if isinstance(strategy, str): + strategy = strategy.lower() + + if strategy is not None: + self._strategy_flag = strategy + + if strategy is not None and strategy not in self._registered_strategies and not isinstance(strategy, Strategy): + raise ValueError( + f"You selected an invalid strategy name: `strategy={strategy!r}`." + f" Available names are: {', '.join(self._registered_strategies)}." 
+ ) + + if ( + accelerator is not None + and accelerator not in self._registered_accelerators + and accelerator not in ("auto", "gpu") + and not isinstance(accelerator, Accelerator) + ): + raise ValueError( + f"You selected an invalid accelerator name: `accelerator={accelerator!r}`." + f" Available names are: {', '.join(self._registered_accelerators)}." + ) + + self._accelerator_flag = accelerator + + if precision is not None: + if str(precision) not in self._precision_types: + raise ValueError( + f"Precision {repr(precision)} is invalid. Allowed precision values: {self._precision_types}" + ) + self._precision_flag = precision + + if plugins: + plugins_flags_types: Dict[str, int] = Counter() + for plugin in plugins: + if isinstance(plugin, Precision): + self._precision_plugin_flag = plugin + plugins_flags_types[Precision.__name__] += 1 + elif isinstance(plugin, CheckpointIO): + self.checkpoint_io = plugin + plugins_flags_types[CheckpointIO.__name__] += 1 + elif isinstance(plugin, ClusterEnvironment): + self._cluster_environment_flag = plugin + plugins_flags_types[ClusterEnvironment.__name__] += 1 + else: + raise TypeError( + f"Found invalid type for plugin {plugin}. Expected one of: Precision, " + "CheckpointIO, ClusterEnviroment." + ) + + duplicated_plugin_key = [k for k, v in plugins_flags_types.items() if v > 1] + if duplicated_plugin_key: + raise ValueError( + f"Received multiple values for {', '.join(duplicated_plugin_key)} flags in `plugins`." + " Expected one value for each type at most." + ) + + # handle the case when the user passes in a strategy instance which has an accelerator, precision, + # checkpoint io or cluster env set up + # TODO: improve the error messages below + if self._strategy_flag and isinstance(self._strategy_flag, Strategy): + if self._strategy_flag._accelerator: + if self._accelerator_flag: + raise ValueError("accelerator set through both strategy class and accelerator flag, choose one") + else: + self._accelerator_flag = self._strategy_flag._accelerator + if self._strategy_flag._precision_plugin: + # [RFC] handle precision plugin set up conflict? 
+ if self._precision_plugin_flag: + raise ValueError("precision set through both strategy class and plugins, choose one") + else: + self._precision_plugin_flag = self._strategy_flag._precision_plugin + if self._strategy_flag._checkpoint_io: + if self.checkpoint_io: + raise ValueError("checkpoint_io set through both strategy class and plugins, choose one") + else: + self.checkpoint_io = self._strategy_flag._checkpoint_io + if getattr(self._strategy_flag, "cluster_environment", None): + if self._cluster_environment_flag: + raise ValueError("cluster_environment set through both strategy class and plugins, choose one") + else: + self._cluster_environment_flag = getattr(self._strategy_flag, "cluster_environment") + + if hasattr(self._strategy_flag, "parallel_devices"): + if self._strategy_flag.parallel_devices: + if self._strategy_flag.parallel_devices[0].type == "cpu": + if self._accelerator_flag and self._accelerator_flag not in ("auto", "cpu"): + raise ValueError( + f"CPU parallel_devices set through {self._strategy_flag.__class__.__name__} class," + f" but accelerator set to {self._accelerator_flag}, please choose one device type" + ) + self._accelerator_flag = "cpu" + if self._strategy_flag.parallel_devices[0].type == "cuda": + if self._accelerator_flag and self._accelerator_flag not in ("auto", "cuda", "gpu"): + raise ValueError( + f"GPU parallel_devices set through {self._strategy_flag.__class__.__name__} class," + f" but accelerator set to {self._accelerator_flag}, please choose one device type" + ) + self._accelerator_flag = "cuda" + self._parallel_devices = self._strategy_flag.parallel_devices + + def _check_device_config_and_set_final_flags( + self, + devices: Optional[Union[List[int], str, int]], + num_nodes: int, + gpus: Optional[Union[List[int], str, int]], + tpu_cores: Optional[Union[List[int], str, int]], + ) -> None: + self._num_nodes_flag = int(num_nodes) if num_nodes is not None else 1 + self._devices_flag = devices + + if self._devices_flag in ([], 0, "0"): + accelerator_name = ( + self._accelerator_flag.__class__.__qualname__ + if isinstance(self._accelerator_flag, Accelerator) + else self._accelerator_flag + ) + raise ValueError( + f"`Lite(devices={self._devices_flag!r})` value is not a valid input" + f" using {accelerator_name} accelerator." + ) + + # TODO: Delete this method when num_processes, gpus, ipus and tpu_cores gets removed + self._map_deprecated_devices_specific_info_to_accelerator_and_device_flag(devices, gpus, tpu_cores) + + if self._devices_flag == "auto" and self._accelerator_flag is None: + raise ValueError( + f"You passed `devices={devices}` but haven't specified" + " `accelerator=('auto'|'tpu'|'gpu'|'cpu'|'mps')` for the devices mapping." + ) + + def _map_deprecated_devices_specific_info_to_accelerator_and_device_flag( + self, + devices: Optional[Union[List[int], str, int]], + gpus: Optional[Union[List[int], str, int]], + tpu_cores: Optional[Union[List[int], str, int]], + ) -> None: + """Emit deprecation warnings for num_processes, gpus, ipus, tpu_cores and set the `devices_flag` and + `accelerator_flag`.""" + if gpus is not None: + rank_zero_deprecation( + f"Setting `Lite(gpus={gpus!r})` is deprecated in v1.7 and will be removed" + f" in v2.0. Please use `Lite(accelerator='gpu', devices={gpus!r})` instead." + ) + if tpu_cores is not None: + rank_zero_deprecation( + f"Setting `Lite(tpu_cores={tpu_cores!r})` is deprecated in v1.7 and will be removed" + f" in v2.0. Please use `Lite(accelerator='tpu', devices={tpu_cores!r})` instead." 
+ ) + self._gpus: Optional[Union[List[int], str, int]] = gpus + self._tpu_cores: Optional[Union[List[int], str, int]] = tpu_cores + deprecated_devices_specific_flag = gpus or tpu_cores + if deprecated_devices_specific_flag and deprecated_devices_specific_flag not in ([], 0, "0"): + if devices: + # TODO: improve error message + rank_zero_warn( + f"The flag `devices={devices}` will be ignored, " + f"instead the device specific number {deprecated_devices_specific_flag} will be used" + ) + + if [(gpus is not None), (tpu_cores is not None)].count(True) > 1: + # TODO: improve error message + rank_zero_warn("more than one device specific flag has been set") + self._devices_flag = deprecated_devices_specific_flag + + if self._accelerator_flag is None: + # set accelerator type based on num_processes, gpus, ipus, tpu_cores + if tpu_cores: + self._accelerator_flag = "tpu" + if gpus: + self._accelerator_flag = "cuda" + + def _choose_auto_accelerator(self) -> str: + """Choose the accelerator type (str) based on availability when ``accelerator='auto'``.""" + if self._accelerator_flag == "auto": + if _TPU_AVAILABLE: + return "tpu" + if _IPU_AVAILABLE: + return "ipu" + if _HPU_AVAILABLE: + return "hpu" + if MPSAccelerator.is_available(): + return "mps" + if CUDAAccelerator.is_available(): + return "cuda" + return "cpu" + + @staticmethod + def _choose_gpu_accelerator_backend() -> str: + if MPSAccelerator.is_available(): + return "mps" + if CUDAAccelerator.is_available(): + return "cuda" + + raise RuntimeError("No supported gpu backend found!") + + def _set_parallel_devices_and_init_accelerator(self) -> None: + if isinstance(self._accelerator_flag, Accelerator): + self.accelerator: Accelerator = self._accelerator_flag + else: + assert self._accelerator_flag is not None + self.accelerator = ACCELERATOR_REGISTRY.get(self._accelerator_flag) + + if not self.accelerator.is_available(): + available_accelerator = [ + acc_str for acc_str in self._registered_accelerators if ACCELERATOR_REGISTRY.get(acc_str).is_available() + ] + raise RuntimeError( + f"{self.accelerator.__class__.__qualname__} can not run on your system" + " since the accelerator is not available. The following accelerator(s)" + " is available and can be passed into `accelerator` argument of" + f" `Lite`: {available_accelerator}." 
+ ) + + self._set_devices_flag_if_auto_passed() + + self._gpus = self._devices_flag if not self._gpus else self._gpus + self._tpu_cores = self._devices_flag if not self._tpu_cores else self._tpu_cores + + self._devices_flag = self.accelerator.parse_devices(self._devices_flag) + if not self._parallel_devices: + self._parallel_devices = self.accelerator.get_parallel_devices(self._devices_flag) + + def _set_devices_flag_if_auto_passed(self) -> None: + if self._devices_flag == "auto" or self._devices_flag is None: + self._devices_flag = self.accelerator.auto_device_count() + + def _choose_and_init_cluster_environment(self) -> ClusterEnvironment: + if isinstance(self._cluster_environment_flag, ClusterEnvironment): + return self._cluster_environment_flag + if self._is_slurm_managing_tasks(): + rank_zero_info("Multiprocessing is handled by SLURM.") + return SLURMEnvironment() + for env_type in (TorchElasticEnvironment, KubeflowEnvironment, LSFEnvironment): + if env_type.detect(): + # Ignore type error because it is a false positive: https://github.com/python/mypy/issues/13044 + return env_type() # type: ignore[abstract] + return LightningEnvironment() + + def _is_slurm_managing_tasks(self) -> bool: + """used by choosing cluster enviroment.""" + # TODO(lite): Remove this, see: https://github.com/Lightning-AI/lightning/pull/14300 + if not SLURMEnvironment.detect() or SLURMEnvironment.job_name() == "bash": + return False + + total_requested_devices = len(self._parallel_devices) * self._num_nodes_flag + num_slurm_tasks = int(os.environ["SLURM_NTASKS"], 0) + return num_slurm_tasks == total_requested_devices + + def _choose_strategy(self) -> Union[Strategy, str]: + if self._accelerator_flag == "tpu": + if self._parallel_devices and len(self._parallel_devices) > 1: + return "tpu_spawn" + else: + # TODO: lazy initialized device, then here could be self._strategy_flag = "single_tpu_device" + return SingleTPUStrategy(device=self._parallel_devices[0]) # type: ignore + if self._num_nodes_flag > 1: + return "ddp" + if len(self._parallel_devices) <= 1: + # TODO: Change this once gpu accelerator was renamed to cuda accelerator + if isinstance(self._accelerator_flag, (CUDAAccelerator, MPSAccelerator)) or ( + isinstance(self._accelerator_flag, str) and self._accelerator_flag in ("cuda", "gpu", "mps") + ): + device = device_parser.determine_root_gpu_device(self._parallel_devices) + else: + device = "cpu" + # TODO: lazy initialized device, then here could be self._strategy_flag = "single_device" + return SingleDeviceStrategy(device=device) # type: ignore + if len(self._parallel_devices) > 1: + if _IS_INTERACTIVE: + return "ddp_fork" + return "ddp_spawn" + + return "ddp" + + def _check_strategy_and_fallback(self) -> None: + """Checks edge cases when the strategy selection was a string input, and we need to fall back to a + different choice depending on other parameters or the environment.""" + # current fallback and check logic only apply to user pass in str config and object config + # TODO this logic should apply to both str and object config + strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag + + if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and ( + TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks() + ): + strategy_flag = "ddp" + if strategy_flag == "dp" and self._accelerator_flag == "cpu": + rank_zero_warn(f"{strategy_flag!r} is not supported on CPUs, hence setting `strategy='ddp'`.") + 
strategy_flag = "ddp" + if strategy_flag in _DDP_FORK_ALIASES and "fork" not in torch.multiprocessing.get_all_start_methods(): + raise ValueError( + f"You selected `Lite(strategy='{strategy_flag}')` but process forking is not supported on this" + f" platform. We recommed `Lite(strategy='ddp_spawn')` instead." + ) + if strategy_flag: + self._strategy_flag = strategy_flag + + def _init_strategy(self) -> None: + """Instantiate the Strategy given depending on the setting of ``_strategy_flag``.""" + if isinstance(self._strategy_flag, str): + self.strategy = STRATEGY_REGISTRY.get(self._strategy_flag) + elif isinstance(self._strategy_flag, Strategy): + self.strategy = self._strategy_flag + else: + raise RuntimeError(f"{self.strategy} is not valid type: {self.strategy}") + + def _check_and_init_precision(self) -> Precision: + self._validate_precision_choice() + if isinstance(self._precision_plugin_flag, Precision): + return self._precision_plugin_flag + + if isinstance(self.accelerator, TPUAccelerator): + if self._precision_flag == 32: + return TPUPrecision() + elif self._precision_flag in (16, "bf16"): + if self._precision_flag == 16: + rank_zero_warn( + "You passed `Lite(accelerator='tpu', precision=16)` but AMP" + " is not supported with TPUs. Using `precision='bf16'` instead." + ) + return TPUBf16Precision() + if isinstance(self.strategy, DeepSpeedStrategy): + return DeepSpeedPrecision(self._precision_flag, amp_type="native", amp_level=None) # type: ignore + + if self._precision_flag == 32: + return Precision() + if self._precision_flag == 64: + return DoublePrecision() + + if self._precision_flag == 16 and self._accelerator_flag == "cpu": + rank_zero_warn( + "You passed `Lite(accelerator='cpu', precision=16)` but native AMP is not supported on CPU." + " Using `precision='bf16'` instead." + ) + self._precision_flag = "bf16" + + if self._precision_flag in (16, "bf16"): + rank_zero_info( + "Using 16-bit Automatic Mixed Precision (AMP)" + if self._precision_flag == 16 + else "Using bfloat16 Automatic Mixed Precision (AMP)" + ) + + device = "cpu" if self._accelerator_flag == "cpu" else "cuda" + return NativeMixedPrecision(self._precision_flag, device) + + raise RuntimeError("No precision set") + + def _validate_precision_choice(self) -> None: + """Validate the combination of choices for precision, and accelerator.""" + if isinstance(self.accelerator, TPUAccelerator): + if self._precision_flag == 64: + raise NotImplementedError( + "`Lite(accelerator='tpu', precision=64)` is not implemented." + " Please, open an issue in `https://github.com/Lightning-AI/lightning/issues`" + " requesting this feature." + ) + if self._precision_plugin_flag and not isinstance( + self._precision_plugin_flag, (TPUPrecision, TPUBf16Precision) + ): + raise ValueError( + f"The `TPUAccelerator` can only be used with a `TPUPrecision` plugin," + f" found: {self._precision_plugin_flag}." 
+ ) + + def _lazy_init_strategy(self) -> None: + """Lazily set missing attributes on the previously instantiated strategy.""" + self.strategy.accelerator = self.accelerator + if self.precision_plugin: + self.strategy.precision_plugin = self.precision_plugin + if self.checkpoint_io: + self.strategy.checkpoint_io = self.checkpoint_io + if hasattr(self.strategy, "cluster_environment"): + self.strategy.cluster_environment = self.cluster_environment + if hasattr(self.strategy, "parallel_devices"): + if self.strategy.parallel_devices: + self._parallel_devices = self.strategy.parallel_devices + else: + self.strategy.parallel_devices = self._parallel_devices + if hasattr(self.strategy, "num_nodes"): + self.strategy._num_nodes = self._num_nodes_flag + if hasattr(self.strategy, "set_world_ranks"): + self.strategy.set_world_ranks() + self.strategy._configure_launcher() + + from lightning_lite.utilities import _IS_INTERACTIVE + + if _IS_INTERACTIVE and self.strategy.launcher and not self.strategy.launcher.is_interactive_compatible: + raise RuntimeError( + f"`Lite(strategy={self._strategy_flag!r})` is not compatible with an interactive" + " environment. Run your code as a script, or choose one of the compatible strategies:" + f" Lite(strategy=None|{'|'.join(_StrategyType.interactive_compatible_types())})." + " In case you are spawning processes yourself, make sure to include the Lite" + " creation inside the worker function." + ) + + # TODO: should be moved to _check_strategy_and_fallback(). + # Current test check precision first, so keep this check here to meet error order + if isinstance(self.accelerator, TPUAccelerator) and not isinstance( + self.strategy, (SingleTPUStrategy, XLAStrategy) + ): + raise ValueError( + "The `TPUAccelerator` can only be used with a `SingleTPUStrategy` or `XLAStrategy`," + f" found {self.strategy.__class__.__name__}." + ) + + @property + def is_distributed(self) -> bool: + # TODO: deprecate this property + # Used for custom plugins. + # Custom plugins should implement is_distributed property. + if hasattr(self.strategy, "is_distributed") and not isinstance(self.accelerator, TPUAccelerator): + return self.strategy.is_distributed + distributed_strategy = ( + DDPStrategy, + DDPSpawnShardedStrategy, + DDPShardedStrategy, + DDPSpawnStrategy, + DeepSpeedStrategy, + XLAStrategy, + ) + is_distributed = isinstance(self.strategy, distributed_strategy) + if isinstance(self.accelerator, TPUAccelerator): + is_distributed |= self.strategy.is_distributed + return is_distributed diff --git a/tests/tests_lite/test_connector.py b/tests/tests_lite/test_connector.py new file mode 100644 index 0000000000000..8f9f9984ef53b --- /dev/null +++ b/tests/tests_lite/test_connector.py @@ -0,0 +1,720 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License + +import os +from typing import Any, Dict +from unittest import mock + +import pytest +import torch +import torch.distributed +from tests_lite.helpers.runif import RunIf + +import lightning_lite +from lightning_lite.accelerators.accelerator import Accelerator +from lightning_lite.accelerators.cpu import CPUAccelerator +from lightning_lite.accelerators.cuda import CUDAAccelerator +from lightning_lite.accelerators.mps import MPSAccelerator +from lightning_lite.connector import _Connector +from lightning_lite.plugins import DoublePrecision, Precision +from lightning_lite.plugins.environments import ( + KubeflowEnvironment, + LightningEnvironment, + SLURMEnvironment, + TorchElasticEnvironment, +) +from lightning_lite.plugins.io import TorchCheckpointIO +from lightning_lite.strategies import ( + DataParallelStrategy, + DDPShardedStrategy, + DDPSpawnShardedStrategy, + DDPSpawnStrategy, + DDPStrategy, + DeepSpeedStrategy, + SingleDeviceStrategy, +) +from lightning_lite.strategies.ddp_spawn import _DDP_FORK_ALIASES +from lightning_lite.utilities.exceptions import MisconfigurationException + + +def test_accelerator_choice_cpu(tmpdir): + connector = _Connector() + assert isinstance(connector.accelerator, CPUAccelerator) + assert isinstance(connector.strategy, SingleDeviceStrategy) + + +@RunIf(skip_windows=True, standalone=True) +def test_strategy_choice_ddp_on_cpu(tmpdir): + """Test that selecting DDPStrategy on CPU works.""" + _test_strategy_choice_ddp_and_cpu(ddp_strategy_class=DDPStrategy) + + +@RunIf(skip_windows=True) +def test_strategy_choice_ddp_spawn_on_cpu(tmpdir): + """Test that selecting DDPSpawnStrategy on CPU works.""" + _test_strategy_choice_ddp_and_cpu(ddp_strategy_class=DDPSpawnStrategy) + + +def _test_strategy_choice_ddp_and_cpu(ddp_strategy_class): + connector = _Connector( + strategy=ddp_strategy_class(find_unused_parameters=True), + accelerator="cpu", + devices=2, + ) + assert isinstance(connector.strategy, ddp_strategy_class) + assert isinstance(connector.accelerator, CPUAccelerator) + assert connector.strategy.num_processes == 2 + assert connector.strategy.parallel_devices == [torch.device("cpu")] * 2 + + +@mock.patch.dict( + os.environ, + { + "SLURM_NTASKS": "2", + "SLURM_JOB_NAME": "SOME_NAME", + "SLURM_NODEID": "0", + "LOCAL_RANK": "0", + "SLURM_PROCID": "0", + "SLURM_LOCALID": "0", + }, +) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=0) +def test_custom_cluster_environment_in_slurm_environment(_): + """Test that we choose the custom cluster even when SLURM or TE flags are around.""" + + class CustomCluster(LightningEnvironment): + @property + def main_address(self): + return "asdf" + + @property + def creates_processes_externally(self) -> bool: + return True + + connector = _Connector( + plugins=[CustomCluster()], + accelerator="cpu", + strategy="ddp", + devices=2, + ) + assert isinstance(connector.accelerator, CPUAccelerator) + assert isinstance(connector.strategy, DDPStrategy) + assert isinstance(connector.strategy.cluster_environment, CustomCluster) + + +@mock.patch.dict( + os.environ, + { + "SLURM_NTASKS": "2", + "SLURM_JOB_NAME": "SOME_NAME", + "SLURM_NODEID": "0", + "LOCAL_RANK": "0", + "SLURM_PROCID": "0", + "SLURM_LOCALID": "0", + }, +) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=0) +def test_custom_accelerator(*_): + class Accel(Accelerator): + def setup_device(self, device: 
torch.device) -> None: + pass + + def get_device_stats(self, device: torch.device) -> Dict[str, Any]: + pass + + def teardown(self) -> None: + pass + + @staticmethod + def parse_devices(devices): + return devices + + @staticmethod + def get_parallel_devices(devices): + return [torch.device("cpu")] * devices + + @staticmethod + def auto_device_count() -> int: + return 1 + + @staticmethod + def is_available() -> bool: + return True + + @staticmethod + def name() -> str: + return "custom_acc_name" + + class Prec(Precision): + pass + + class Strat(SingleDeviceStrategy): + pass + + strategy = Strat(device=torch.device("cpu"), accelerator=Accel(), precision_plugin=Prec()) + connector = _Connector(strategy=strategy, devices=2) + assert isinstance(connector.accelerator, Accel) + assert isinstance(connector.strategy, Strat) + assert isinstance(connector.precision_plugin, Prec) + assert connector.strategy is strategy + + class Strat(DDPStrategy): + pass + + strategy = Strat(accelerator=Accel(), precision_plugin=Prec()) + connector = _Connector(strategy=strategy, devices=2) + assert isinstance(connector.accelerator, Accel) + assert isinstance(connector.strategy, Strat) + assert isinstance(connector.precision_plugin, Prec) + assert connector.strategy is strategy + + +@mock.patch.dict( + os.environ, + { + "SLURM_NTASKS": "2", + "SLURM_JOB_NAME": "SOME_NAME", + "SLURM_NODEID": "0", + "LOCAL_RANK": "0", + "SLURM_PROCID": "0", + "SLURM_LOCALID": "0", + }, +) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=0) +def test_dist_backend_accelerator_mapping(*_): + connector = _Connector(strategy="ddp_spawn", accelerator="cpu", devices=2) + assert isinstance(connector.accelerator, CPUAccelerator) + assert isinstance(connector.strategy, DDPStrategy) + assert connector.strategy.local_rank == 0 + + +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) +@mock.patch("lightning_lite.utilities.device_parser._get_all_available_mps_gpus", return_value=[0, 1]) +def test_ipython_incompatible_backend_error(_, __, monkeypatch): + monkeypatch.setattr(lightning_lite.utilities, "_IS_INTERACTIVE", True) + with pytest.raises(RuntimeError, match=r"strategy='ddp'\)`.*is not compatible"): + _Connector(strategy="ddp", accelerator="gpu", devices=2) + + with pytest.raises(RuntimeError, match=r"strategy='ddp_spawn'\)`.*is not compatible"): + _Connector(strategy="ddp_spawn", accelerator="gpu", devices=2) + + with pytest.raises(RuntimeError, match=r"strategy='ddp_sharded_spawn'\)`.*is not compatible"): + _Connector(strategy="ddp_sharded_spawn", accelerator="gpu", devices=2) + + with pytest.raises(RuntimeError, match=r"strategy='ddp'\)`.*is not compatible"): + # Edge case: _Connector maps dp to ddp if accelerator != gpu + _Connector(strategy="dp") + + +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) +def test_ipython_compatible_dp_strategy_gpu(_, monkeypatch): + monkeypatch.setattr(lightning_lite.utilities, "_IS_INTERACTIVE", True) + connector = _Connector(strategy="dp", accelerator="gpu") + assert connector.strategy.launcher is None + + +@RunIf(skip_windows=True) +@mock.patch("lightning_lite.accelerators.tpu.TPUAccelerator.is_available", return_value=True) +def test_ipython_compatible_strategy_tpu(_, monkeypatch): + monkeypatch.setattr(lightning_lite.utilities, "_IS_INTERACTIVE", True) + connector = _Connector(accelerator="tpu") + assert connector.strategy.launcher.is_interactive_compatible + + +@RunIf(skip_windows=True) +def 
test_ipython_compatible_strategy_ddp_fork(monkeypatch): + monkeypatch.setattr(lightning_lite.utilities, "_IS_INTERACTIVE", True) + connector = _Connector(strategy="ddp_fork", accelerator="cpu") + assert connector.strategy.launcher.is_interactive_compatible + + +@pytest.mark.parametrize( + ["strategy", "strategy_class"], + [ + ("ddp", DDPStrategy), + ("ddp_spawn", DDPSpawnStrategy), + ("ddp_sharded", DDPShardedStrategy), + ("ddp_sharded_spawn", DDPSpawnShardedStrategy), + pytest.param("deepspeed", DeepSpeedStrategy, marks=RunIf(deepspeed=True)), + ], +) +@pytest.mark.parametrize("devices", [1, 2]) +@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) +@mock.patch("lightning_lite.utilities.device_parser._get_all_available_mps_gpus", return_value=[0, 1]) +def test_accelerator_choice_multi_node_gpu(_, __, ___, strategy, strategy_class, devices): + connector = _Connector(num_nodes=2, accelerator="gpu", strategy=strategy, devices=devices) + assert isinstance(connector.strategy, strategy_class) + + +@mock.patch("lightning_lite.accelerators.cuda.device_parser.num_cuda_devices", return_value=0) +def test_accelerator_cpu(*_): + connector = _Connector(accelerator="cpu") + assert isinstance(connector.accelerator, CPUAccelerator) + + with pytest.raises( + RuntimeError, + match="CUDAAccelerator can not run on your system since the accelerator is not available", + ): + with pytest.deprecated_call(match=r"is deprecated in v1.7 and will be removed"): + _Connector(gpus=1) + + with pytest.raises( + RuntimeError, + match="CUDAAccelerator can not run on your system since the accelerator is not available.", + ): + _Connector(accelerator="cuda") + + with pytest.deprecated_call(match=r"is deprecated in v1.7 and will be removed"): + _Connector(accelerator="cpu", gpus=1) + + +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) +@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) +@pytest.mark.parametrize("device_count", (["0"], [0, "1"], ["GPU"], [["0", "1"], [0, 1]], [False])) +def test_accelererator_invalid_type_devices(_, __, device_count): + with pytest.raises( + MisconfigurationException, match=r"must be an int, a string, a sequence of ints or None, but you" + ): + _ = _Connector(accelerator="gpu", devices=device_count) + + +@RunIf(min_cuda_gpus=1) +def test_accelerator_gpu(): + connector = _Connector(accelerator="gpu", devices=1) + assert isinstance(connector.accelerator, CUDAAccelerator) + + connector = _Connector(accelerator="gpu") + assert isinstance(connector.accelerator, CUDAAccelerator) + + connector = _Connector(accelerator="auto", devices=1) + assert isinstance(connector.accelerator, CUDAAccelerator) + + +@pytest.mark.parametrize(["devices", "strategy_class"], [(1, SingleDeviceStrategy), (5, DDPSpawnStrategy)]) +def test_accelerator_cpu_with_devices(devices, strategy_class): + connector = _Connector(accelerator="cpu", devices=devices) + assert connector._parallel_devices == [torch.device("cpu")] * devices + assert isinstance(connector.strategy, strategy_class) + assert isinstance(connector.accelerator, CPUAccelerator) + + +@RunIf(min_cuda_gpus=2) +@pytest.mark.parametrize( + ["devices", "strategy_class"], [(1, SingleDeviceStrategy), ([1], SingleDeviceStrategy), (2, DDPSpawnStrategy)] +) +def test_accelerator_gpu_with_devices(devices, strategy_class): + connector = _Connector(accelerator="gpu", 
devices=devices) + assert len(connector._parallel_devices) == len(devices) if isinstance(devices, list) else devices + assert isinstance(connector.strategy, strategy_class) + assert isinstance(connector.accelerator, CUDAAccelerator) + + +@RunIf(min_cuda_gpus=1) +def test_accelerator_auto_with_devices_gpu(): + connector = _Connector(accelerator="auto", devices=1) + assert isinstance(connector.accelerator, CUDAAccelerator) + assert connector._parallel_devices == [torch.device("cuda", 0)] + + +def test_set_devices_if_none_cpu(): + connector = _Connector(accelerator="cpu", devices=3) + assert connector._parallel_devices == [torch.device("cpu")] * 3 + + +def test_unsupported_strategy_types_on_cpu_and_fallback(): + with pytest.warns(UserWarning, match="is not supported on CPUs, hence setting `strategy='ddp"): + connector = _Connector(strategy="dp", devices=2) + assert isinstance(connector.strategy, DDPStrategy) + + +def test_invalid_accelerator_choice(): + with pytest.raises(ValueError, match="You selected an invalid accelerator name: `accelerator='cocofruit'`"): + _Connector(accelerator="cocofruit") + + +def test_invalid_strategy_choice(): + with pytest.raises(ValueError, match="You selected an invalid strategy name: `strategy='cocofruit'`"): + _Connector(strategy="cocofruit") + + +@pytest.mark.parametrize( + ["strategy", "strategy_class"], + [ + ("ddp_spawn", DDPSpawnStrategy), + ("ddp_spawn_find_unused_parameters_false", DDPSpawnStrategy), + ("ddp", DDPStrategy), + ("ddp_find_unused_parameters_false", DDPStrategy), + ], +) +def test_strategy_choice_cpu_str(strategy, strategy_class): + connector = _Connector(strategy=strategy, accelerator="cpu", devices=2) + assert isinstance(connector.strategy, strategy_class) + + +@pytest.mark.parametrize("strategy_class", [DDPSpawnStrategy, DDPStrategy]) +def test_strategy_choice_cpu_instance(strategy_class): + connector = _Connector(strategy=strategy_class(), accelerator="cpu", devices=2) + assert isinstance(connector.strategy, strategy_class) + + +@RunIf(min_cuda_gpus=2) +@pytest.mark.parametrize( + ["strategy", "strategy_class"], + [ + ("ddp_spawn", DDPSpawnStrategy), + ("ddp_spawn_find_unused_parameters_false", DDPSpawnStrategy), + ("ddp", DDPStrategy), + ("ddp_find_unused_parameters_false", DDPStrategy), + ("dp", DataParallelStrategy), + ("ddp_sharded", DDPShardedStrategy), + ("ddp_sharded_spawn", DDPSpawnShardedStrategy), + pytest.param("deepspeed", DeepSpeedStrategy, marks=RunIf(deepspeed=True)), + ], +) +def test_strategy_choice_gpu_str(strategy, strategy_class): + connector = _Connector(strategy=strategy, accelerator="gpu", devices=2) + assert isinstance(connector.strategy, strategy_class) + + +@RunIf(min_cuda_gpus=2) +@pytest.mark.parametrize("strategy_class", [DDPSpawnStrategy, DDPStrategy]) +def test_strategy_choice_gpu_instance(strategy_class): + connector = _Connector(strategy=strategy_class(), accelerator="gpu", devices=2) + assert isinstance(connector.strategy, strategy_class) + + +@RunIf(min_cuda_gpus=2) +@pytest.mark.parametrize("strategy_class", [DDPSpawnStrategy, DDPStrategy]) +def test_device_type_when_strategy_instance_gpu_passed(strategy_class): + connector = _Connector(strategy=strategy_class(), accelerator="gpu", devices=2) + assert isinstance(connector.strategy, strategy_class) + assert isinstance(connector.accelerator, CUDAAccelerator) + + +@pytest.mark.parametrize("precision", [1, 12, "invalid"]) +def test_validate_precision_type(precision): + with pytest.raises(ValueError, match=f"Precision {repr(precision)} is invalid"): + 
_Connector(precision=precision) + + +def test_strategy_choice_ddp_spawn_cpu(): + connector = _Connector(strategy="ddp_spawn", accelerator="cpu", devices=2) + assert isinstance(connector.accelerator, CPUAccelerator) + assert isinstance(connector.strategy, DDPSpawnStrategy) + assert isinstance(connector.strategy.cluster_environment, LightningEnvironment) + assert connector.strategy.launcher._start_method == "spawn" + + +@RunIf(skip_windows=True) +@mock.patch("lightning_lite.connector._IS_INTERACTIVE", True) +def test_strategy_choice_ddp_fork_in_interactive(): + """Test that when accelerator and strategy are unspecified, the connector chooses DDP Fork in interactive + environments by default.""" + connector = _Connector(devices=2) + assert isinstance(connector.accelerator, CPUAccelerator) + assert isinstance(connector.strategy, DDPSpawnStrategy) + assert isinstance(connector.strategy.cluster_environment, LightningEnvironment) + assert connector.strategy.launcher._start_method == "fork" + + +@RunIf(skip_windows=True) +def test_strategy_choice_ddp_fork_cpu(): + connector = _Connector(strategy="ddp_fork", accelerator="cpu", devices=2) + assert isinstance(connector.accelerator, CPUAccelerator) + assert isinstance(connector.strategy, DDPSpawnStrategy) + assert isinstance(connector.strategy.cluster_environment, LightningEnvironment) + assert connector.strategy.launcher._start_method == "fork" + + +@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) +@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("lightning_lite.accelerators.mps.MPSAccelerator.is_available", return_value=False) +def test_strategy_choice_ddp(*_): + connector = _Connector(strategy="ddp", accelerator="gpu", devices=1) + assert isinstance(connector.accelerator, CUDAAccelerator) + assert isinstance(connector.strategy, DDPStrategy) + assert isinstance(connector.strategy.cluster_environment, LightningEnvironment) + + +@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) +@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("lightning_lite.accelerators.mps.MPSAccelerator.is_available", return_value=False) +def test_strategy_choice_ddp_spawn(*_): + connector = _Connector(strategy="ddp_spawn", accelerator="gpu", devices=1) + assert isinstance(connector.accelerator, CUDAAccelerator) + assert isinstance(connector.strategy, DDPSpawnStrategy) + assert isinstance(connector.strategy.cluster_environment, LightningEnvironment) + + +@RunIf(min_cuda_gpus=2) +@mock.patch.dict( + os.environ, + { + "CUDA_VISIBLE_DEVICES": "0,1", + "SLURM_NTASKS": "2", + "SLURM_JOB_NAME": "SOME_NAME", + "SLURM_NODEID": "0", + "SLURM_PROCID": "1", + "SLURM_LOCALID": "1", + }, +) +@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True) +@pytest.mark.parametrize("strategy", ["ddp", DDPStrategy()]) +def test_strategy_choice_ddp_slurm(_, strategy): + connector = _Connector(strategy=strategy, accelerator="gpu", devices=2) + assert connector._is_slurm_managing_tasks() + assert isinstance(connector.accelerator, CUDAAccelerator) + assert isinstance(connector.strategy, DDPStrategy) + assert isinstance(connector.strategy.cluster_environment, SLURMEnvironment) + assert connector.strategy.cluster_environment.local_rank() == 1 + assert 
connector.strategy.local_rank == 1 + + +@mock.patch.dict( + os.environ, + { + "CUDA_VISIBLE_DEVICES": "0,1", + "WORLD_SIZE": "2", + "LOCAL_WORLD_SIZE": "2", + "RANK": "1", + "LOCAL_RANK": "1", + "GROUP_RANK": "0", + "TORCHELASTIC_RUN_ID": "1", + }, +) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) +@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("lightning_lite.accelerators.mps.MPSAccelerator.is_available", return_value=False) +def test_strategy_choice_ddp_te(*_): + connector = _Connector(strategy="ddp", accelerator="gpu", devices=2) + assert isinstance(connector.accelerator, CUDAAccelerator) + assert isinstance(connector.strategy, DDPStrategy) + assert isinstance(connector.strategy.cluster_environment, TorchElasticEnvironment) + assert connector.strategy.cluster_environment.local_rank() == 1 + assert connector.strategy.local_rank == 1 + + +@mock.patch.dict( + os.environ, + { + "WORLD_SIZE": "2", + "LOCAL_WORLD_SIZE": "2", + "RANK": "1", + "LOCAL_RANK": "1", + "GROUP_RANK": "0", + "TORCHELASTIC_RUN_ID": "1", + }, +) +def test_strategy_choice_ddp_cpu_te(): + connector = _Connector(strategy="ddp_spawn", accelerator="cpu", devices=2) + assert isinstance(connector.accelerator, CPUAccelerator) + assert isinstance(connector.strategy, DDPStrategy) + assert isinstance(connector.strategy.cluster_environment, TorchElasticEnvironment) + assert connector.strategy.cluster_environment.local_rank() == 1 + assert connector.strategy.local_rank == 1 + + +@mock.patch.dict( + os.environ, + { + "CUDA_VISIBLE_DEVICES": "0", + "KUBERNETES_PORT": "tcp://127.0.0.1:443", + "MASTER_ADDR": "1.2.3.4", + "MASTER_PORT": "500", + "WORLD_SIZE": "20", + "RANK": "1", + }, +) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1) +@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("lightning_lite.accelerators.mps.MPSAccelerator.is_available", return_value=False) +def test_strategy_choice_ddp_kubeflow(*_): + connector = _Connector(strategy="ddp", accelerator="gpu", devices=1) + assert isinstance(connector.accelerator, CUDAAccelerator) + assert isinstance(connector.strategy, DDPStrategy) + assert isinstance(connector.strategy.cluster_environment, KubeflowEnvironment) + assert connector.strategy.cluster_environment.local_rank() == 0 + assert connector.strategy.local_rank == 0 + + +@mock.patch.dict( + os.environ, + { + "KUBERNETES_PORT": "tcp://127.0.0.1:443", + "MASTER_ADDR": "1.2.3.4", + "MASTER_PORT": "500", + "WORLD_SIZE": "20", + "RANK": "1", + }, +) +def test_strategy_choice_ddp_cpu_kubeflow(): + connector = _Connector(strategy="ddp_spawn", accelerator="cpu", devices=2) + assert isinstance(connector.accelerator, CPUAccelerator) + assert isinstance(connector.strategy, DDPStrategy) + assert isinstance(connector.strategy.cluster_environment, KubeflowEnvironment) + assert connector.strategy.cluster_environment.local_rank() == 0 + assert connector.strategy.local_rank == 0 + + +@mock.patch.dict( + os.environ, + { + "SLURM_NTASKS": "2", + "SLURM_JOB_NAME": "SOME_NAME", + "SLURM_NODEID": "0", + "LOCAL_RANK": "0", + "SLURM_PROCID": "0", + "SLURM_LOCALID": "0", + }, +) +@pytest.mark.parametrize("strategy", ["ddp", DDPStrategy()]) +def test_strategy_choice_ddp_cpu_slurm(strategy): + connector = _Connector(strategy=strategy, accelerator="cpu", devices=2) + assert isinstance(connector.accelerator, CPUAccelerator) + assert isinstance(connector.strategy, 
DDPStrategy) + assert isinstance(connector.strategy.cluster_environment, SLURMEnvironment) + assert connector.strategy.local_rank == 0 + + +@mock.patch("lightning_lite.accelerators.tpu.TPUAccelerator.is_available", return_value=True) +@mock.patch.dict(os.environ, {}, clear=True) +def test_unsupported_tpu_choice(*_): + + with pytest.raises(NotImplementedError, match=r"accelerator='tpu', precision=64\)` is not implemented"): + _Connector(accelerator="tpu", precision=64) + + # if user didn't set strategy, _Connector will choose the TPUSingleStrategy or TPUSpawnStrategy + with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `SingleTPUStrategy`"): + with pytest.warns(UserWarning, match=r"accelerator='tpu', precision=16\)` but native AMP is not supported"): + _Connector(accelerator="tpu", precision=16, strategy="ddp") + + +@mock.patch("lightning_lite.accelerators.cuda.CUDAAccelerator.is_available", return_value=False) +@mock.patch("lightning_lite.accelerators.tpu.TPUAccelerator.is_available", return_value=False) +@mock.patch("lightning_lite.accelerators.mps.MPSAccelerator.is_available", return_value=False) +def test_devices_auto_choice_cpu(*_): + connector = _Connector(accelerator="auto", devices="auto") + assert isinstance(connector.accelerator, CPUAccelerator) + assert isinstance(connector.strategy, SingleDeviceStrategy) + assert connector.strategy.root_device == torch.device("cpu") + + +@RunIf(mps=False) +@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) +def test_devices_auto_choice_gpu(*_): + connector = _Connector(accelerator="auto", devices="auto") + assert isinstance(connector.accelerator, CUDAAccelerator) + assert isinstance(connector.strategy, DDPSpawnStrategy) + assert len(connector._parallel_devices) == 2 + + +@RunIf(mps=True) +def test_devices_auto_choice_mps(): + connector = _Connector(accelerator="auto", devices="auto") + assert isinstance(connector.accelerator, MPSAccelerator) + assert isinstance(connector.strategy, SingleDeviceStrategy) + assert connector.strategy.root_device == torch.device("mps", 0) + assert connector._parallel_devices == [torch.device("mps", 0)] + + +@pytest.mark.parametrize( + ["parallel_devices", "accelerator"], + [([torch.device("cpu")], "cuda"), ([torch.device("cuda", i) for i in range(8)], "tpu")], +) +def test_parallel_devices_in_strategy_confilict_with_accelerator(parallel_devices, accelerator): + with pytest.raises(ValueError, match=r"parallel_devices set through"): + _Connector(strategy=DDPStrategy(parallel_devices=parallel_devices), accelerator=accelerator) + + +@pytest.mark.parametrize( + ["plugins", "expected"], + [ + ([LightningEnvironment(), SLURMEnvironment()], "ClusterEnvironment"), + ([TorchCheckpointIO(), TorchCheckpointIO()], "CheckpointIO"), + ( + [Precision(), DoublePrecision(), LightningEnvironment(), SLURMEnvironment()], + "Precision, ClusterEnvironment", + ), + ], +) +def test_plugin_only_one_instance_for_one_type(plugins, expected): + with pytest.raises(ValueError, match=f"Received multiple values for {expected}"): + _Connector(plugins=plugins) + + +@pytest.mark.parametrize("accelerator", ("cpu", "cuda", "mps", "tpu")) +@pytest.mark.parametrize("devices", ("0", 0, [])) +def test_passing_zero_and_empty_list_to_devices_flag(accelerator, devices): + with pytest.raises(ValueError, match="value is not a valid input using"): + _Connector(accelerator=accelerator, devices=devices) + + 
+@pytest.mark.parametrize( + "expected_accelerator_flag,expected_accelerator_class", + [ + pytest.param("cuda", CUDAAccelerator, marks=RunIf(min_cuda_gpus=1)), + pytest.param("mps", MPSAccelerator, marks=RunIf(mps=True)), + ], +) +def test_gpu_accelerator_backend_choice(expected_accelerator_flag, expected_accelerator_class): + connector = _Connector(accelerator="gpu") + assert connector._accelerator_flag == expected_accelerator_flag + assert isinstance(connector.accelerator, expected_accelerator_class) + + +@mock.patch("lightning_lite.accelerators.mps.MPSAccelerator.is_available", return_value=False) +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1) +def test_gpu_accelerator_backend_choice_cuda(*_): + connector = _Connector(accelerator="gpu") + assert connector._accelerator_flag == "cuda" + assert isinstance(connector.accelerator, CUDAAccelerator) + + +@mock.patch("lightning_lite.accelerators.mps.MPSAccelerator.is_available", return_value=True) +@mock.patch("lightning_lite.utilities.device_parser._get_all_available_mps_gpus", return_value=[0]) +@mock.patch("torch.device", return_value="mps") # necessary because torch doesn't allow creation of mps devices +def test_gpu_accelerator_backend_choice_mps(*_): + connector = _Connector(accelerator="gpu") + assert connector._accelerator_flag == "mps" + assert isinstance(connector.accelerator, MPSAccelerator) + + +@mock.patch("lightning_lite.accelerators.mps.MPSAccelerator.is_available", return_value=False) +@mock.patch("lightning_lite.accelerators.cuda.CUDAAccelerator.is_available", return_value=False) +def test_gpu_accelerator_no_gpu_backend_found_error(*_): + with pytest.raises(RuntimeError, match="No supported gpu backend found!"): + _Connector(accelerator="gpu") + + +@pytest.mark.parametrize("strategy", _DDP_FORK_ALIASES) +@mock.patch( + "lightning_lite.connector.torch.multiprocessing.get_all_start_methods", + return_value=[], +) +def test_ddp_fork_on_unsupported_platform(_, strategy): + with pytest.raises(ValueError, match="process forking is not supported on this platform"): + _Connector(strategy=strategy) + + +@RunIf(skip_windows=True) +@pytest.mark.parametrize("strategy", _DDP_FORK_ALIASES) +@mock.patch.dict(os.environ, {"PL_DISABLE_FORK": "1"}, clear=True) +def test_strategy_choice_ddp_spawn_in_interactive_when_fork_disabled(strategy): + """Test there is an error when forking is disabled via the environment variable and the user requests fork.""" + with pytest.raises(ValueError, match="Forking is disabled in this environment"): + _Connector(devices=2, strategy=strategy) From 81c53fdaecaa39b97fc06ee91a53f347685f134d Mon Sep 17 00:00:00 2001 From: Ethan Harris Date: Thu, 15 Sep 2022 15:58:07 +0100 Subject: [PATCH 166/193] Invalid cache before listing drive when collecting component names (#13971) Co-authored-by: thomas chaton Co-authored-by: Jirka Borovec --- src/lightning_app/storage/drive.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/lightning_app/storage/drive.py b/src/lightning_app/storage/drive.py index f72ad38b6e130..d413e93bb9837 100644 --- a/src/lightning_app/storage/drive.py +++ b/src/lightning_app/storage/drive.py @@ -260,6 +260,9 @@ def __deepcopy__(self, memo): def _collect_component_names(self) -> List[str]: sep = "/" if self.fs.exists(self.drive_root): + # Invalidate cache before running ls in case new directories have been added + # TODO: Re-evaluate this - may lead to performance issues + self.fs.invalidate_cache() return [str(p.split(sep)[-1]) for p in 
self.fs.ls(self.drive_root)] return [] From 4f88014cc5d19e395d93afcde1f7c53fa0b958ef Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Thu, 15 Sep 2022 18:57:43 +0200 Subject: [PATCH 167/193] Docs: fix link to title (#14730) * lower --- .github/BECOMING_A_CORE_CONTRIBUTOR.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/BECOMING_A_CORE_CONTRIBUTOR.md b/.github/BECOMING_A_CORE_CONTRIBUTOR.md index fd40e29e1ebf1..85098b06acad2 100644 --- a/.github/BECOMING_A_CORE_CONTRIBUTOR.md +++ b/.github/BECOMING_A_CORE_CONTRIBUTOR.md @@ -38,7 +38,7 @@ Here, we describe general expectations from core contributors: ### Pull Requests (PRs) -- Pull requests are the evolutionary mechanism of Lightning, so quality is extremely important. Make sure contributors adhere to the guidelines described in the [contributing section](CONTRIBUTING.md#Pull-Request). +- Pull requests are the evolutionary mechanism of Lightning, so quality is extremely important. Make sure contributors adhere to the guidelines described in the [contributing section](CONTRIBUTING.md#pull-request). - Some PRs are from people who want to get involved and try to add something unnecessary. We do want their help though! So don’t approve the PR, but direct them to a Github issue that they might be interested in helping with instead! From 89adc4bb408005dd9cb4180c26acd1592b396049 Mon Sep 17 00:00:00 2001 From: Laverne Henderson Date: Thu, 15 Sep 2022 10:09:33 -0700 Subject: [PATCH 168/193] New bug form (#14193) Created a YAML form to replace the MD form for issues. * Minor update to form * Removed MD issue form * More updates * Renamed the file Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Jirka Borovec --- .github/ISSUE_TEMPLATE/1_bug_report.yaml | 98 ++++++++++++++++++++++++ .github/ISSUE_TEMPLATE/bug_report.md | 72 ----------------- 2 files changed, 98 insertions(+), 72 deletions(-) create mode 100644 .github/ISSUE_TEMPLATE/1_bug_report.yaml delete mode 100644 .github/ISSUE_TEMPLATE/bug_report.md diff --git a/.github/ISSUE_TEMPLATE/1_bug_report.yaml b/.github/ISSUE_TEMPLATE/1_bug_report.yaml new file mode 100644 index 0000000000000..315f965e46091 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/1_bug_report.yaml @@ -0,0 +1,98 @@ +name: Report a bug +description: Any errors that are not UI related. +labels: ["needs triage", "bug"] +body: + - type: markdown + attributes: + value: > + WAIT! + + Before you go any further. Is this really a **🐛 bug**? + + If it's a question about how Lightning works, have a look at our [Lightning documentation](https://lightning.ai/lightning-docs/) + or ask a question on our [Community Slack](https://pytorch-lightning.slack.com). + + If it really is a **🐛 bug**, please fill out this form as completely as you can. + + - type: checkboxes + id: checks + attributes: + label: First check + description: Please confirm and check all the following options. + options: + - label: I'm sure this is a bug. + required: true + - label: I've added a descriptive title to this bug. + required: true + - label: I've provided clear instructions on how to reproduce the bug. + required: true + - label: I've added a code sample. + required: true + - label: I've provided any other important info that is required. + required: true + + - type: textarea + attributes: + label: Bug description + description: A clear and concise description of the 🐛bug🐛. 
+ validations: + required: true + + - type: textarea + attributes: + label: How to reproduce the bug + description: > + Provide steps and example code here. + placeholder: "# Insert code here" + render: python + validations: + required: true + + - type: textarea + attributes: + label: Error messages and logs + description: > + Provide any error messages and/or logs + placeholder: "# Copy the complete error messages and logs" + value: | + ``` + + # Error messages and logs here please + + ``` + validations: + required: false + + - type: textarea + attributes: + label: Important info + description: > + Provide all the following info + value: | + ``` + + #- Lightning Component (e.g. Trainer, LightningModule, LightningApp, LightningWork, LightningFlow): + #- PyTorch Lightning Version (e.g., 1.5.0): + #- Lightning App Version (e.g., 0.5.2): + #- PyTorch Version (e.g., 1.10): + #- Python version (e.g., 3.9): + #- OS (e.g., Linux): + #- CUDA/cuDNN version: + #- GPU models and configuration: + #- How you installed Lightning(`conda`, `pip`, source): + #- Running environment of LightningApp (e.g. local, cloud): + + ``` + validations: + required: true + + - type: textarea + attributes: + label: More info + description: Add any other info about the issue here. + validations: + required: false + + - type: markdown + attributes: + value: "**Happy engineering!**" diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md deleted file mode 100644 index de4eacde1f39e..0000000000000 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ /dev/null @@ -1,72 +0,0 @@ ---- -name: Bug report -about: Create a bug report to help us improve -title: '' -labels: needs triage -assignees: '' ---- - -## 🐛 Bug - - - -### To Reproduce - - - -### Expected behavior - - - -### Environment - - - -- Lightning Component (e.g. Trainer, LightningModule, LightningApp, LightningWork, LightningFlow): -- PyTorch Lightning Version (e.g., 1.5.0): -- Lightning App Version (e.g., 0.5.2): -- PyTorch Version (e.g., 1.10): -- Python version (e.g., 3.9): -- OS (e.g., Linux): -- CUDA/cuDNN version: -- GPU models and configuration: -- How you installed PyTorch (`conda`, `pip`, source): -- If compiling from source, the output of `torch.__config__.show()`: -- Running environment of LightningApp (e.g. 
local, cloud): -- Any other relevant information: - -### Additional context - - From 09f50b4295b1373e8973651e18e71ba668887782 Mon Sep 17 00:00:00 2001 From: Yurij Mikhalevich Date: Thu, 15 Sep 2022 21:35:16 +0300 Subject: [PATCH 169/193] Fix Google Tag Manager for the Lightning App docs (#14731) - updates the Lightning App docs theme to the one without Pytorch Lightning docs Google Tag Manager hardcoded - sets the GTM id in the conf.py for Lightning App docs --- docs/source-app/conf.py | 1 + requirements/app/docs.txt | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source-app/conf.py b/docs/source-app/conf.py index de2c737bf4fd2..62bd18a900417 100644 --- a/docs/source-app/conf.py +++ b/docs/source-app/conf.py @@ -158,6 +158,7 @@ html_theme_options = { "pytorch_project": lightning_app.__homepage__, + "analytics_id": "G-D3Q2ESCTZR", "canonical_url": lightning_app.__homepage__, "collapse_navigation": False, "display_version": True, diff --git a/requirements/app/docs.txt b/requirements/app/docs.txt index c189d6034ab28..a255ce32cb688 100644 --- a/requirements/app/docs.txt +++ b/requirements/app/docs.txt @@ -5,4 +5,4 @@ ipython_genutils pytorch-lightning sphinx-autobuild -https://storage.googleapis.com/grid-packages/lightning-ai-sphinx-theme/build-31.3.zip +https://storage.googleapis.com/grid-packages/lightning-ai-sphinx-theme/build-32.zip From 9ea4ab6b191e86fd6d72e6852eb4bdd5978aa19b Mon Sep 17 00:00:00 2001 From: Laverne Henderson Date: Thu, 15 Sep 2022 12:30:32 -0700 Subject: [PATCH 170/193] Update installation (#14732) * Update installation Updates to use python -m pip install -U lightning and adds troubleshooting note * Apply suggestions from code review Co-authored-by: Jirka Borovec --- docs/source-app/index.rst | 2 +- docs/source-app/installation.rst | 8 ++++++++ docs/source-app/installation_mac.rst | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/docs/source-app/index.rst b/docs/source-app/index.rst index 55b1ac79d06c4..8bbe84702ccc4 100644 --- a/docs/source-app/index.rst +++ b/docs/source-app/index.rst @@ -27,7 +27,7 @@ Install Lightning .. code-block:: bash - pip install lightning + python -m pip install -U lightning Or read the :ref:`advanced install ` guide. diff --git a/docs/source-app/installation.rst b/docs/source-app/installation.rst index 340fd86da802a..0faa50216e9af 100644 --- a/docs/source-app/installation.rst +++ b/docs/source-app/installation.rst @@ -28,3 +28,11 @@ Install with pip .. code:: bash python -m pip install -U lightning + +.. note:: + + If you encounter issues during installation use the following to help troubleshoot: + + .. 
code:: bash + + pip list | grep lightning diff --git a/docs/source-app/installation_mac.rst b/docs/source-app/installation_mac.rst index 180a3a88936ff..d6274be24fdf5 100644 --- a/docs/source-app/installation_mac.rst +++ b/docs/source-app/installation_mac.rst @@ -19,4 +19,4 @@ Install the ``lightning`` package export GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1 export GRPC_PYTHON_BUILD_SYSTEM_ZLIB=1 - pip install lightning + python -m pip install -U lightning From 57ae32783ab1967549d9f2cabdc01d12cb545f94 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 16 Sep 2022 04:32:04 +0200 Subject: [PATCH 171/193] update codeowners (#14718) --- .github/CODEOWNERS | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 086f9e6b09e49..a6ffc9b82b498 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -30,30 +30,29 @@ # Packages /src/pytorch_lightning @borda @awaelchli @carmocca @justusschock @rohitgr7 @otaj /src/pytorch_lightning/accelerators @williamfalcon @tchaton @SeanNaren @awaelchli @justusschock @kaushikb11 -/src/pytorch_lightning/callbacks @williamfalcon @tchaton @carmocca @borda @kaushikb11 +/src/pytorch_lightning/callbacks @williamfalcon @tchaton @carmocca @kaushikb11 /src/pytorch_lightning/core @tchaton @borda @carmocca @justusschock @kaushikb11 /src/pytorch_lightning/distributed @williamfalcon @tchaton @awaelchli @kaushikb11 /src/pytorch_lightning/lite @tchaton @awaelchli @carmocca /src/pytorch_lightning/loggers @tchaton @awaelchli @borda -/src/pytorch_lightning/loggers/wandb.py @borisdayma @borda +/src/pytorch_lightning/loggers/wandb.py @borisdayma @otaj @rohitgr7 /src/pytorch_lightning/loggers/neptune.py @shnela @HubertJaworski @pkasprzyk @pitercl @Raalsky @aniezurawski @kamil-kaczmarek /src/pytorch_lightning/loops @tchaton @awaelchli @justusschock @carmocca -/src/pytorch_lightning/overrides @tchaton @borda /src/pytorch_lightning/plugins @tchaton @awaelchli @justusschock /src/pytorch_lightning/profilers @williamfalcon @tchaton @borda @carmocca -/src/pytorch_lightning/profilers/pytorch.py @nbcsm @guotuofeng +/src/pytorch_lightning/profilers/pytorch.py @nbcsm @guotuofeng @carmocca /src/pytorch_lightning/strategies @tchaton @SeanNaren @awaelchli @justusschock @kaushikb11 /src/pytorch_lightning/trainer @williamfalcon @borda @tchaton @carmocca @awaelchli @justusschock @kaushikb11 -/src/pytorch_lightning/trainer/connectors @tchaton @carmocca @borda +/src/pytorch_lightning/trainer/connectors @tchaton @carmocca @awaelchli /src/pytorch_lightning/tuner @SkafteNicki @borda @awaelchli /src/pytorch_lightning/utilities @borda @tchaton @carmocca -/src/lightning_app @tchaton @manskx -/src/lightning_app/cli/pl-app-template @tchaton @awaelchli @Borda +/src/lightning_app @tchaton @manskx @lantiga +/src/lightning_app/cli/pl-app-template @tchaton @awaelchli @lantiga /src/lightning_app/core @tchaton @awaelchli @manskx /src/lightning_app/core/queues.py @tchaton @hhsecond @manskx -/src/lightning_app/runners/cloud.py @tchaton @hhsecond -/src/lightning_app/testing @tchaton @manskx +/src/lightning_app/runners/cloud.py @tchaton @hhsecond @lantiga +/src/lightning_app/testing @tchaton @manskx @lantiga /src/lightning_app/__about__.py @nohalon @edenlightning @lantiga /src/lightning_lite @awaelchli @carmocca @justusschock @@ -68,18 +67,18 @@ # Lite tests /tests/tests_lite @awaelchli @carmocca @justusschock -# Specifics -/src/pytorch_lightning/trainer/connectors/logger_connector @tchaton @carmocca 
-/src/pytorch_lightning/trainer/progress.py @tchaton @awaelchli @carmocca # API /src/pytorch_lightning/callbacks/callback.py @williamfalcon @awaelchli @ananthsub @carmocca /src/pytorch_lightning/core/datamodule.py @williamFalcon @awaelchli @ananthsub @carmocca -/src/pytorch_lightning/trainer/trainer.py @williamfalcon @tchaton @awaelchli -/src/pytorch_lightning/core/hooks.py @williamfalcon @tchaton @awaelchli @ananthsub @carmocca -/src/pytorch_lightning/core/module.py @williamfalcon @tchaton @awaelchli +/src/pytorch_lightning/trainer/trainer.py @williamfalcon @tchaton @awaelchli @carmocca +/src/pytorch_lightning/core/hooks.py @williamfalcon @tchaton @awaelchli @carmocca +/src/pytorch_lightning/core/module.py @williamfalcon @tchaton @awaelchli @carmocca /.github/CODEOWNERS @williamfalcon /SECURITY.md @williamfalcon /README.md @williamfalcon @edenlightning @borda /setup.py @williamfalcon @borda @carmocca /src/pytorch_lightning/__about__.py @williamfalcon @borda @carmocca +/src/lightning_app/__about__.py @williamfalcon @borda @manskx +/src/lightning_lite/__about__.py @williamfalcon @borda @awaelchli +/src/*/__setup__.py @borda @manskx From fbb7177e5f86948773ab05f423b19727183f208d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 16 Sep 2022 12:02:54 +0200 Subject: [PATCH 172/193] Combine the pip install commands in conda workflow (#14744) --- .github/workflows/ci-pytorch-test-conda.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-conda.yml b/.github/workflows/ci-pytorch-test-conda.yml index e3b4582e05b1d..c05772ed34871 100644 --- a/.github/workflows/ci-pytorch-test-conda.yml +++ b/.github/workflows/ci-pytorch-test-conda.yml @@ -67,8 +67,7 @@ jobs: # adjust versions according installed Torch version python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt - pip install -r requirements/pytorch/devel.txt --find-links https://download.pytorch.org/whl/torch_stable.html - pip install -r requirements/pytorch/strategies.txt + pip install -r requirements/pytorch/devel.txt -r requirements/pytorch/strategies.txt --find-links https://download.pytorch.org/whl/torch_stable.html # set a per-test timeout of 2.5 minutes to fail sooner; this aids with hanging tests pip install pytest-timeout pip list From 619e76f22db9ac590d1972a43da34475f00e4e07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 16 Sep 2022 13:00:09 +0200 Subject: [PATCH 173/193] Remove silent behavior when `num_slurm_tasks` does not correspond to number of processes in Trainer (#14300) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * simplify logic * remove hpc * update * add changelog * more tests * update test Co-authored-by: Carlos Mocholí Co-authored-by: Jirka Co-authored-by: Jirka Borovec --- .../plugins/environments/slurm_environment.py | 10 +++- src/pytorch_lightning/CHANGELOG.md | 2 + .../connectors/accelerator_connector.py | 25 ++++------ .../environments/test_slurm_environment.py | 3 ++ .../connectors/test_accelerator_connector.py | 47 ++++++++++--------- 5 files changed, 46 insertions(+), 41 deletions(-) diff --git a/src/lightning_lite/plugins/environments/slurm_environment.py b/src/lightning_lite/plugins/environments/slurm_environment.py index a69eea6a471f3..7b7b3e5fa60d3 100644 --- a/src/lightning_lite/plugins/environments/slurm_environment.py +++ 
b/src/lightning_lite/plugins/environments/slurm_environment.py @@ -88,8 +88,14 @@ def main_port(self) -> int: @staticmethod def detect() -> bool: - """Returns ``True`` if the current process was launched on a SLURM cluster.""" - return "SLURM_NTASKS" in os.environ + """Returns ``True`` if the current process was launched on a SLURM cluster. + + It is possible to use the SLURM scheduler to request resources and then launch processes manually using a + different environment. For this, the user can set the job name in SLURM to 'bash' (``SLURM_JOB_NAME=bash``). + This will then avoid the detection of ``SLURMEnvironment`` and another environment can be detected + automatically. + """ + return "SLURM_NTASKS" in os.environ and SLURMEnvironment.job_name() != "bash" @staticmethod def job_name() -> Optional[str]: diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 5b433e265f018..6b350f0c9007a 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -71,6 +71,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - In Lightning Lite, state-dict access to the module wrapper now gets passed through to the original module reference ([#14629](https://github.com/Lightning-AI/lightning/pull/14629)) +- Removed fall-back to `LightningEnvironment` when number of SLURM tasks does not correspond to number of processes in Trainer ([#14300](https://github.com/Lightning-AI/lightning/pull/14300)) + ### Deprecated diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index 8e73d13458238..2231e2d7f7212 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -558,24 +558,17 @@ def _set_devices_flag_if_auto_select_gpus_passed(self) -> None: def _choose_and_init_cluster_environment(self) -> ClusterEnvironment: if isinstance(self._cluster_environment_flag, ClusterEnvironment): return self._cluster_environment_flag - if self._is_slurm_managing_tasks(): - rank_zero_info("Multiprocessing is handled by SLURM.") - return SLURMEnvironment() - for env_type in (BaguaEnvironment, TorchElasticEnvironment, KubeflowEnvironment, LSFEnvironment): + for env_type in ( + SLURMEnvironment, + BaguaEnvironment, + TorchElasticEnvironment, + KubeflowEnvironment, + LSFEnvironment, + ): if env_type.detect(): - # Ignore type error because it is a false positive: https://github.com/python/mypy/issues/13044 - return env_type() # type: ignore[abstract] + return env_type() return LightningEnvironment() - def _is_slurm_managing_tasks(self) -> bool: - """used by choosing cluster enviroment.""" - if not SLURMEnvironment.detect() or SLURMEnvironment.job_name() == "bash": - return False - - total_requested_devices = len(self._parallel_devices) * self._num_nodes_flag - num_slurm_tasks = int(os.environ["SLURM_NTASKS"], 0) - return num_slurm_tasks == total_requested_devices - def _choose_strategy(self) -> Union[Strategy, str]: if self._accelerator_flag == "ipu": return IPUStrategy.strategy_name @@ -619,7 +612,7 @@ def _check_strategy_and_fallback(self) -> None: strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and ( - TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks() + TorchElasticEnvironment.detect() or 
KubeflowEnvironment.detect() or SLURMEnvironment.detect() ): strategy_flag = "ddp" if strategy_flag == "dp" and self._accelerator_flag == "cpu": diff --git a/tests/tests_lite/plugins/environments/test_slurm_environment.py b/tests/tests_lite/plugins/environments/test_slurm_environment.py index 805aa8acf80ef..441f996127c41 100644 --- a/tests/tests_lite/plugins/environments/test_slurm_environment.py +++ b/tests/tests_lite/plugins/environments/test_slurm_environment.py @@ -97,3 +97,6 @@ def test_detect(): with mock.patch.dict(os.environ, {"SLURM_NTASKS": "2"}): assert SLURMEnvironment.detect() + + with mock.patch.dict(os.environ, {"SLURM_JOB_NAME": "bash"}): + assert not SLURMEnvironment.detect() diff --git a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py index 562e7c4df2e4c..3c0968fe68c0f 100644 --- a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py @@ -138,7 +138,7 @@ def creates_processes_externally(self) -> bool: ) @mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=0) @mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True) -def test_custom_accelerator(device_count_mock, setup_distributed_mock): +def test_custom_accelerator(*_): class Accel(Accelerator): def setup_device(self, device: torch.device) -> None: pass @@ -478,28 +478,29 @@ def test_strategy_choice_ddp_spawn(cuda_available_mock, device_count_mock): assert isinstance(trainer.strategy.cluster_environment, LightningEnvironment) -@RunIf(min_cuda_gpus=2) -@mock.patch.dict( - os.environ, - { - "CUDA_VISIBLE_DEVICES": "0,1", - "SLURM_NTASKS": "2", - "SLURM_JOB_NAME": "SOME_NAME", - "SLURM_NODEID": "0", - "SLURM_PROCID": "1", - "SLURM_LOCALID": "1", - }, -) -@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True) -@pytest.mark.parametrize("strategy", ["ddp", DDPStrategy()]) -def test_strategy_choice_ddp_slurm(setup_distributed_mock, strategy): - trainer = Trainer(fast_dev_run=True, strategy=strategy, accelerator="gpu", devices=2) - assert trainer._accelerator_connector._is_slurm_managing_tasks() - assert isinstance(trainer.accelerator, CUDAAccelerator) - assert isinstance(trainer.strategy, DDPStrategy) - assert isinstance(trainer.strategy.cluster_environment, SLURMEnvironment) - assert trainer.strategy.cluster_environment.local_rank() == 1 - assert trainer.strategy.local_rank == 1 +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) +@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) +@pytest.mark.parametrize("job_name,expected_env", [("some_name", SLURMEnvironment), ("bash", LightningEnvironment)]) +@pytest.mark.parametrize("strategy", ["ddp", DDPStrategy]) +def test_strategy_choice_ddp_slurm(_, __, strategy, job_name, expected_env): + if not isinstance(strategy, str): + strategy = strategy() + + with mock.patch.dict( + os.environ, + { + "CUDA_VISIBLE_DEVICES": "0,1", + "SLURM_NTASKS": "2", + "SLURM_JOB_NAME": job_name, + "SLURM_NODEID": "0", + "SLURM_PROCID": "1", + "SLURM_LOCALID": "1", + }, + ): + trainer = Trainer(fast_dev_run=True, strategy=strategy, accelerator="cuda", devices=2) + assert isinstance(trainer.accelerator, CUDAAccelerator) + assert isinstance(trainer.strategy, DDPStrategy) + assert isinstance(trainer.strategy.cluster_environment, expected_env) @mock.patch.dict( 
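For reference, a minimal sketch (not part of the patch itself) of the opt-out behaviour that the SLURM detection change above enables — illustrative environment values, and it assumes `lightning_lite` is importable from the source tree touched by this diff:

```python
import os

from lightning_lite.plugins.environments.slurm_environment import SLURMEnvironment

# Resources were requested through SLURM, but the user launches processes manually
# from an interactive session: naming the job "bash" opts out of SLURM detection,
# so the connector falls back to LightningEnvironment instead of SLURMEnvironment.
os.environ["SLURM_NTASKS"] = "2"        # illustrative value
os.environ["SLURM_JOB_NAME"] = "bash"   # the opt-out documented in detect()
assert not SLURMEnvironment.detect()
```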
From 0b2ebbeb4b5ea3f576040efe0ddb9dd0e216f06c Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 16 Sep 2022 14:04:13 +0200 Subject: [PATCH 174/193] CI: update HPU pool (#14733) --- .azure/hpu-tests.yml | 2 +- .github/checkgroup.yml | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index fbe30f7539632..3aea24a148314 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -37,7 +37,7 @@ jobs: timeoutInMinutes: "10" # how much time to give 'run always even if cancelled tasks' before stopping them cancelTimeoutInMinutes: "2" - pool: habana-gaudi-hpus + pool: intel-hpus container: image: "vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:latest" options: "--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host --shm-size=4g --name cd-container -v /usr/bin/docker:/tmp/docker:ro" diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 099b05f3a2c24..5cba389ffd055 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -49,8 +49,7 @@ subprojects: - "make-html (pytorch)" - "mypy" - "pytorch-lightning (GPUs)" - # TODO: the HPU server is down - #- "pytorch-lightning (HPUs)" + - "pytorch-lightning (HPUs)" - "pytorch-lightning (IPUs)" - "pl-slow (macOS-11, 3.7, 1.11)" - "pl-slow (ubuntu-20.04, 3.7, 1.11)" From c2159c48a56b7aa0da884af7201761bb5fe487f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 16 Sep 2022 14:41:02 +0200 Subject: [PATCH 175/193] Fix CC-bot for non-forks (#14710) --- .github/workflows/probot-auto-cc.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/probot-auto-cc.yml b/.github/workflows/probot-auto-cc.yml index 388cd6ee51624..585befc937cc0 100644 --- a/.github/workflows/probot-auto-cc.yml +++ b/.github/workflows/probot-auto-cc.yml @@ -3,7 +3,10 @@ name: Probot on: issues: types: [labeled] - pull_request_target: + # should use `pull_request_target` but it's blocked by + # https://github.com/probot/probot/issues/1635 + # so this job will not run on forks until the above is fixed + pull_request: types: [labeled, ready_for_review] jobs: From ced94874b28a0d5206b1a845b9833d7e6b32ea86 Mon Sep 17 00:00:00 2001 From: otaj <6065855+otaj@users.noreply.github.com> Date: Fri, 16 Sep 2022 15:26:04 +0200 Subject: [PATCH 176/193] Use valid pypi versions for install in assistant (#14750) Co-authored-by: Kushashwa Ravi Shrimali --- .actions/assistant.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/.actions/assistant.py b/.actions/assistant.py index 85a1b5879e109..38c3d8cebd763 100644 --- a/.actions/assistant.py +++ b/.actions/assistant.py @@ -1,7 +1,6 @@ import datetime import glob import json -import logging import os import re import shutil @@ -17,7 +16,6 @@ import fire import pkg_resources -from packaging.version import parse as version_parse REQUIREMENT_FILES = { "pytorch": ( @@ -124,10 +122,8 @@ def download_package(package: str, folder: str = ".", version: Optional[str] = N url = f"https://pypi.org/pypi/{PACKAGE_MAPPING[package]}/json" data = json.load(urlopen(Request(url))) if not version: - versions = list(data["releases"].keys()) - versions = sorted(versions, key=lambda x: version_parse(x)) - logging.debug(f"Available versions: {versions}") - version = versions[-1] + pypi_vers = pypi_versions(PACKAGE_MAPPING[package]) + version = pypi_vers[-1] releases = list(filter(lambda r: r["packagetype"] == 
"sdist", data["releases"][version])) assert releases, f"Missing 'sdist' for this package/version aka {package}/{version}" release = releases[0] From 5bef75648e27097e28c80981b1aa108a5e7ba840 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 16 Sep 2022 17:27:36 +0200 Subject: [PATCH 177/193] Remove deprecated `torch_distributed_backend` logic (#14693) * Remove deprecated torch_distributed_backend logic * changelog * mention deprecated * imports Co-authored-by: Jirka Borovec Co-authored-by: Jirka --- src/lightning_lite/strategies/deepspeed.py | 5 +- src/lightning_lite/utilities/distributed.py | 12 ---- src/pytorch_lightning/CHANGELOG.md | 4 ++ src/pytorch_lightning/strategies/ddp.py | 12 +--- src/pytorch_lightning/strategies/ddp_spawn.py | 12 +--- src/pytorch_lightning/strategies/deepspeed.py | 13 +--- .../strategies/fully_sharded_native.py | 11 +-- src/pytorch_lightning/strategies/parallel.py | 19 +---- .../deprecated_api/test_remove_1-8.py | 46 ------------ tests/tests_pytorch/strategies/test_ddp.py | 70 ++++++++----------- 10 files changed, 43 insertions(+), 161 deletions(-) diff --git a/src/lightning_lite/strategies/deepspeed.py b/src/lightning_lite/strategies/deepspeed.py index 5241d30f97976..985989752f771 100644 --- a/src/lightning_lite/strategies/deepspeed.py +++ b/src/lightning_lite/strategies/deepspeed.py @@ -33,7 +33,7 @@ from lightning_lite.plugins.precision.utils import _fp_to_half from lightning_lite.strategies.ddp import DDPStrategy from lightning_lite.utilities.apply_func import apply_to_collection -from lightning_lite.utilities.distributed import get_default_process_group_backend_for_device, log +from lightning_lite.utilities.distributed import log from lightning_lite.utilities.enums import AMPType, PrecisionType from lightning_lite.utilities.rank_zero import rank_zero_info from lightning_lite.utilities.seed import reset_seed @@ -450,9 +450,6 @@ def _init_deepspeed_distributed(self) -> None: self._process_group_backend = self._get_process_group_backend() deepspeed.init_distributed(self._process_group_backend, distributed_port=self.cluster_environment.main_port) - def _get_process_group_backend(self) -> str: - return self._process_group_backend or get_default_process_group_backend_for_device(self.root_device) - def _set_node_environment_variables(self) -> None: assert self.cluster_environment is not None os.environ["MASTER_ADDR"] = self.cluster_environment.main_address diff --git a/src/lightning_lite/utilities/distributed.py b/src/lightning_lite/utilities/distributed.py index 26fa3e1e230d0..ed7a05dc4fcfc 100644 --- a/src/lightning_lite/utilities/distributed.py +++ b/src/lightning_lite/utilities/distributed.py @@ -3,7 +3,6 @@ from typing import Any, List, Optional, Tuple, Union import torch -from lightning_utilities.core.rank_zero import rank_zero_deprecation from torch import Tensor from torch.nn import functional as F @@ -251,14 +250,3 @@ def tpu_distributed() -> bool: def get_default_process_group_backend_for_device(device: torch.device) -> str: return "nccl" if device.type == "cuda" else "gloo" - - -def _get_process_group_backend_from_env() -> Optional[str]: - torch_backend = os.getenv("PL_TORCH_DISTRIBUTED_BACKEND") - if torch_backend is not None: - rank_zero_deprecation( - "Environment variable `PL_TORCH_DISTRIBUTED_BACKEND`" - " was deprecated in v1.6 and will be removed in v1.8." - " Specify `process_group_backend` directly on the strategy constructor." 
- ) - return torch_backend diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 6b350f0c9007a..e1f2cbce12a49 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -171,6 +171,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Removed the deprecated `BaseProfiler` and `AbstractProfiler` classes ([#14404](https://github.com/Lightning-AI/lightning/pull/14404)) +- Removed the deprecated way to set the distributed backend via the environment variable `PL_TORCH_DISTRIBUTED_BACKEND`, in favor of setting the `process_group_backend` in the strategy constructor ([#14693](https://github.com/Lightning-AI/lightning/pull/14693)) + + + ### Fixed - Break HPU Graphs into two parts (forward + backward as one and optimizer as another) for better performance ([#14656](https://github.com/Lightning-AI/lightning/pull/14656)) diff --git a/src/pytorch_lightning/strategies/ddp.py b/src/pytorch_lightning/strategies/ddp.py index d197aa7979a0a..58b5593444c95 100644 --- a/src/pytorch_lightning/strategies/ddp.py +++ b/src/pytorch_lightning/strategies/ddp.py @@ -31,11 +31,7 @@ import pytorch_lightning as pl from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO -from lightning_lite.utilities.distributed import ( - _get_process_group_backend_from_env, - distributed_available, - get_default_process_group_backend_for_device, -) +from lightning_lite.utilities.distributed import distributed_available, get_default_process_group_backend_for_device from lightning_lite.utilities.distributed import group as _group from lightning_lite.utilities.distributed import init_dist_connection, ReduceOp, sync_ddp_if_available from lightning_lite.utilities.optimizer import optimizers_to_device @@ -213,11 +209,7 @@ def setup_distributed(self) -> None: init_dist_connection(self.cluster_environment, self._process_group_backend, timeout=self._timeout) def _get_process_group_backend(self) -> str: - return ( - self._process_group_backend - or _get_process_group_backend_from_env() - or get_default_process_group_backend_for_device(self.root_device) - ) + return self._process_group_backend or get_default_process_group_backend_for_device(self.root_device) def set_world_ranks(self) -> None: if self.cluster_environment is None: diff --git a/src/pytorch_lightning/strategies/ddp_spawn.py b/src/pytorch_lightning/strategies/ddp_spawn.py index 092f90009bbfd..9b790c30b0011 100644 --- a/src/pytorch_lightning/strategies/ddp_spawn.py +++ b/src/pytorch_lightning/strategies/ddp_spawn.py @@ -26,11 +26,7 @@ import pytorch_lightning as pl from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO -from lightning_lite.utilities.distributed import ( - _get_process_group_backend_from_env, - distributed_available, - get_default_process_group_backend_for_device, -) +from lightning_lite.utilities.distributed import distributed_available, get_default_process_group_backend_for_device from lightning_lite.utilities.distributed import group as _group from lightning_lite.utilities.distributed import init_dist_connection, ReduceOp, sync_ddp_if_available from lightning_lite.utilities.optimizer import optimizers_to_device @@ -187,11 +183,7 @@ def _worker_setup(self, process_idx: int) -> None: ) def _get_process_group_backend(self) -> str: - return ( - 
self._process_group_backend - or _get_process_group_backend_from_env() - or get_default_process_group_backend_for_device(self.root_device) - ) + return self._process_group_backend or get_default_process_group_backend_for_device(self.root_device) def pre_configure_ddp(self) -> None: # if unset, default `find_unused_parameters` `True` diff --git a/src/pytorch_lightning/strategies/deepspeed.py b/src/pytorch_lightning/strategies/deepspeed.py index 4116448ab95fc..c7a5600952b94 100644 --- a/src/pytorch_lightning/strategies/deepspeed.py +++ b/src/pytorch_lightning/strategies/deepspeed.py @@ -32,11 +32,6 @@ import pytorch_lightning as pl from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment from lightning_lite.plugins.precision.utils import _fp_to_half -from lightning_lite.utilities.distributed import ( - _get_process_group_backend_from_env, - get_default_process_group_backend_for_device, - log, -) from lightning_lite.utilities.enums import AMPType, PrecisionType from lightning_lite.utilities.optimizer import optimizers_to_device from lightning_lite.utilities.seed import reset_seed @@ -53,6 +48,7 @@ from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_info, rank_zero_warn from pytorch_lightning.utilities.types import LRSchedulerConfig, STEP_OUTPUT +log = logging.getLogger(__name__) warning_cache = WarningCache() _DEEPSPEED_AVAILABLE = RequirementCache("deepspeed") @@ -395,13 +391,6 @@ def _init_deepspeed_distributed(self) -> None: self._process_group_backend = self._get_process_group_backend() deepspeed.init_distributed(self._process_group_backend, distributed_port=self.cluster_environment.main_port) - def _get_process_group_backend(self) -> str: - return ( - self._process_group_backend - or _get_process_group_backend_from_env() - or get_default_process_group_backend_for_device(self.root_device) - ) - def _set_node_environment_variables(self) -> None: assert self.cluster_environment is not None os.environ["MASTER_ADDR"] = self.cluster_environment.main_address diff --git a/src/pytorch_lightning/strategies/fully_sharded_native.py b/src/pytorch_lightning/strategies/fully_sharded_native.py index 09b4113adc419..999a1c50ed284 100644 --- a/src/pytorch_lightning/strategies/fully_sharded_native.py +++ b/src/pytorch_lightning/strategies/fully_sharded_native.py @@ -21,10 +21,7 @@ import pytorch_lightning as pl from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO -from lightning_lite.utilities.distributed import ( - _get_process_group_backend_from_env, - get_default_process_group_backend_for_device, -) +from lightning_lite.utilities.distributed import get_default_process_group_backend_for_device from lightning_lite.utilities.distributed import group as _group from lightning_lite.utilities.distributed import init_dist_connection, ReduceOp, sync_ddp_if_available from lightning_lite.utilities.optimizer import optimizers_to_device @@ -188,11 +185,7 @@ def setup_environment(self) -> None: super().setup_environment() def _get_process_group_backend(self) -> str: - return ( - self._process_group_backend - or _get_process_group_backend_from_env() - or get_default_process_group_backend_for_device(self.root_device) - ) + return self._process_group_backend or get_default_process_group_backend_for_device(self.root_device) def set_world_ranks(self) -> None: if self.cluster_environment is None: diff --git a/src/pytorch_lightning/strategies/parallel.py 
b/src/pytorch_lightning/strategies/parallel.py index 3d9f6a5dd3bdd..cd737b7a54815 100644 --- a/src/pytorch_lightning/strategies/parallel.py +++ b/src/pytorch_lightning/strategies/parallel.py @@ -21,16 +21,10 @@ import pytorch_lightning as pl from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO -from lightning_lite.utilities.distributed import ( - _get_process_group_backend_from_env, - all_gather_ddp_if_available, - get_default_process_group_backend_for_device, - ReduceOp, -) +from lightning_lite.utilities.distributed import all_gather_ddp_if_available, ReduceOp from pytorch_lightning.plugins import LayerSync from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.strategy import Strategy -from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation class ParallelStrategy(Strategy, ABC): @@ -89,17 +83,6 @@ def distributed_sampler_kwargs(self) -> Dict[str, Any]: ) return distributed_sampler_kwargs - @property - def torch_distributed_backend(self) -> str: - """Deprecated property.""" - rank_zero_deprecation( - "ParallelStrategy.torch_distributed_backend was deprecated in v1.6 and will be removed in v1.8." - ) - pg_backend = _get_process_group_backend_from_env() - if pg_backend: - return pg_backend - return get_default_process_group_backend_for_device(self.root_device) - def reconciliate_processes(self, trace: str) -> None: """Function to re-conciliate processes on failure.""" diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py index 2d51ed6c5c0d4..eb87ee75ef786 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py @@ -12,14 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. """Test deprecated functionality which will be removed in v1.8.0.""" -import os import time from unittest import mock from unittest.mock import Mock import numpy as np import pytest -import torch import pytorch_lightning from lightning_lite.utilities import device_parser @@ -29,7 +27,6 @@ from pytorch_lightning.loggers import CSVLogger, Logger from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin from pytorch_lightning.profilers import AdvancedProfiler, SimpleProfiler -from pytorch_lightning.strategies import ParallelStrategy from pytorch_lightning.strategies.ipu import LightningIPUModule from pytorch_lightning.trainer.configuration_validator import _check_datamodule_checkpoint_hooks from pytorch_lightning.trainer.states import RunningStage @@ -509,49 +506,6 @@ def test_v1_8_0_lightning_module_use_amp(): model.use_amp = False -@mock.patch.dict(os.environ, {"PL_TORCH_DISTRIBUTED_BACKEND": "foo"}) -def test_v1_8_0_torch_distributed_backend_env(): - from lightning_lite.utilities.distributed import _get_process_group_backend_from_env - - with pytest.deprecated_call( - match="Environment variable `PL_TORCH_DISTRIBUTED_BACKEND`" - " was deprecated in v1.6 and will be removed in v1.8." 
- ): - _get_process_group_backend_from_env() - - -def test_parallel_strategy_torch_distributed_backend(): - class CustomParallel(ParallelStrategy): - @property - def root_device(self) -> torch.device: - return torch.device("cpu") - - def model_to_device(self): - pass - - @property - def is_global_zero(self): - return True - - def broadcast(self, obj): - return obj - - def reduce(self, tensor): - return tensor - - def barrier(self): - return - - def all_gather(self, tensor): - return tensor - - strategy = CustomParallel() - with pytest.deprecated_call( - match="ParallelStrategy.torch_distributed_backend was deprecated" " in v1.6 and will be removed in v1.8." - ): - strategy.torch_distributed_backend - - def test_trainer_config_device_ids(): trainer = Trainer(devices=2) with pytest.deprecated_call( diff --git a/tests/tests_pytorch/strategies/test_ddp.py b/tests/tests_pytorch/strategies/test_ddp.py index 19317bfe300a6..a69ede050a8e7 100644 --- a/tests/tests_pytorch/strategies/test_ddp.py +++ b/tests/tests_pytorch/strategies/test_ddp.py @@ -13,7 +13,6 @@ # limitations under the License. import os from unittest import mock -from unittest.mock import patch import pytest import torch @@ -59,26 +58,22 @@ def test_multi_gpu_model_ddp_fit_test(tmpdir): @RunIf(skip_windows=True) -@pytest.mark.skipif(torch.cuda.is_available(), reason="test doesn't requires GPU machine") +@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1", "WORLD_SIZE": "2"}, clear=True) @mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) -def test_torch_distributed_backend_env_variables(tmpdir): +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) +def test_torch_distributed_backend_invalid(_, __, tmpdir): """This test set `undefined` as torch backend and should raise an `Backend.UNDEFINED` ValueError.""" - _environ = {"PL_TORCH_DISTRIBUTED_BACKEND": "undefined", "CUDA_VISIBLE_DEVICES": "0,1", "WORLD_SIZE": "2"} - with patch.dict(os.environ, _environ), patch( - "lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2 - ): - with pytest.deprecated_call(match="Environment variable `PL_TORCH_DISTRIBUTED_BACKEND` was deprecated in v1.6"): - with pytest.raises(ValueError, match="Invalid backend: 'undefined'"): - model = BoringModel() - trainer = Trainer( - default_root_dir=tmpdir, - fast_dev_run=True, - strategy="ddp", - accelerator="gpu", - devices=2, - logger=False, - ) - trainer.fit(model) + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + fast_dev_run=True, + strategy=DDPStrategy(process_group_backend="undefined"), + accelerator="cuda", + devices=2, + logger=False, + ) + with pytest.raises(ValueError, match="Invalid backend: 'undefined'"): + trainer.fit(model) @RunIf(skip_windows=True) @@ -86,7 +81,6 @@ def test_torch_distributed_backend_env_variables(tmpdir): @mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) @mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1) @mock.patch("pytorch_lightning.accelerators.gpu.CUDAAccelerator.is_available", return_value=True) -@mock.patch.dict(os.environ, {"PL_TORCH_DISTRIBUTED_BACKEND": "gloo"}, clear=True) def test_ddp_torch_dist_is_available_in_setup( mock_gpu_is_available, mock_device_count, mock_cuda_available, mock_set_device, tmpdir ): @@ -98,10 +92,15 @@ def setup(self, stage: str) -> None: raise SystemExit() model = TestModel() - trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, 
strategy="ddp", accelerator="gpu", devices=1) - with pytest.deprecated_call(match="Environment variable `PL_TORCH_DISTRIBUTED_BACKEND` was deprecated in v1.6"): - with pytest.raises(SystemExit): - trainer.fit(model) + trainer = Trainer( + default_root_dir=tmpdir, + fast_dev_run=True, + strategy=DDPStrategy(process_group_backend="gloo"), + accelerator="gpu", + devices=1, + ) + with pytest.raises(SystemExit): + trainer.fit(model) @RunIf(min_cuda_gpus=2, min_torch="1.8.1", standalone=True) @@ -143,17 +142,15 @@ def on_train_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") @pytest.mark.parametrize( - ["process_group_backend", "env_var", "device_str", "expected_process_group_backend"], + ["process_group_backend", "device_str", "expected_process_group_backend"], [ - pytest.param("foo", None, "cpu", "foo"), - pytest.param("foo", "BAR", "cpu", "foo"), - pytest.param("foo", "BAR", "cuda:0", "foo"), - pytest.param(None, "BAR", "cuda:0", "BAR"), - pytest.param(None, None, "cuda:0", "nccl"), - pytest.param(None, None, "cpu", "gloo"), + pytest.param("foo", "cpu", "foo"), + pytest.param("foo", "cuda:0", "foo"), + pytest.param(None, "cuda:0", "nccl"), + pytest.param(None, "cpu", "gloo"), ], ) -def test_ddp_process_group_backend(process_group_backend, env_var, device_str, expected_process_group_backend): +def test_ddp_process_group_backend(process_group_backend, device_str, expected_process_group_backend): """Test settings for process group backend.""" class MockDDPStrategy(DDPStrategy): @@ -166,14 +163,7 @@ def root_device(self): return self._root_device strategy = MockDDPStrategy(process_group_backend=process_group_backend, root_device=torch.device(device_str)) - if not process_group_backend and env_var: - with mock.patch.dict(os.environ, {"PL_TORCH_DISTRIBUTED_BACKEND": env_var}): - with pytest.deprecated_call( - match="Environment variable `PL_TORCH_DISTRIBUTED_BACKEND` was deprecated in v1.6" - ): - assert strategy._get_process_group_backend() == expected_process_group_backend - else: - assert strategy._get_process_group_backend() == expected_process_group_backend + assert strategy._get_process_group_backend() == expected_process_group_backend @pytest.mark.parametrize( From 8c01c89d7480fe9f09152a0dd5134a53843a6fee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 16 Sep 2022 18:26:15 +0200 Subject: [PATCH 178/193] Remove deprecated `NeptuneLogger` code (#14727) --- src/pytorch_lightning/CHANGELOG.md | 3 + src/pytorch_lightning/loggers/neptune.py | 178 ++------------------ tests/tests_pytorch/loggers/test_neptune.py | 73 -------- 3 files changed, 20 insertions(+), 234 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index e1f2cbce12a49..4ca202c34edd6 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -147,6 +147,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Removed deprecated support for old torchtext versions ([#14375](https://github.com/Lightning-AI/lightning/pull/14375)) +- Removed deprecated support for the old `neptune-client` API in the `NeptuneLogger` ([#14727](https://github.com/Lightning-AI/lightning/pull/14727)) + + - Removed the deprecated `weights_save_path` Trainer argumnent and `Trainer.weights_save_path` property ([#14424](https://github.com/Lightning-AI/lightning/pull/14424)) diff --git a/src/pytorch_lightning/loggers/neptune.py b/src/pytorch_lightning/loggers/neptune.py index 0c1ab35cf58ee..a5df22d8be230 100644 --- a/src/pytorch_lightning/loggers/neptune.py +++ b/src/pytorch_lightning/loggers/neptune.py @@ -21,7 +21,6 @@ import logging import os -import warnings from argparse import Namespace from typing import Any, Callable, Dict, Generator, List, Mapping, Optional, Sequence, Set, Union from weakref import ReferenceType @@ -37,56 +36,18 @@ from pytorch_lightning.utilities.rank_zero import rank_zero_only _NEPTUNE_AVAILABLE = RequirementCache("neptune-client") -_NEPTUNE_GREATER_EQUAL_0_9 = RequirementCache("neptune-client>=0.9.0") - - -if _NEPTUNE_AVAILABLE and _NEPTUNE_GREATER_EQUAL_0_9: - try: - from neptune import new as neptune - from neptune.new.exceptions import NeptuneLegacyProjectException, NeptuneOfflineModeFetchException - from neptune.new.run import Run - from neptune.new.types import File as NeptuneFile - except ModuleNotFoundError: - import neptune - from neptune.exceptions import NeptuneLegacyProjectException, NeptuneOfflineModeFetchException - from neptune.run import Run - from neptune.types import File as NeptuneFile +if _NEPTUNE_AVAILABLE: + from neptune import new as neptune + from neptune.new.exceptions import NeptuneOfflineModeFetchException + from neptune.new.run import Run else: # needed for test mocks, and function signatures - neptune, Run, NeptuneFile = None, None, None + neptune, Run = None, None log = logging.getLogger(__name__) _INTEGRATION_VERSION_KEY = "source_code/integrations/pytorch-lightning" -# kwargs used in previous NeptuneLogger version, now deprecated -_LEGACY_NEPTUNE_INIT_KWARGS = [ - "project_name", - "offline_mode", - "experiment_name", - "experiment_id", - "params", - "properties", - "upload_source_files", - "abort_callback", - "logger", - "upload_stdout", - "upload_stderr", - "send_hardware_metrics", - "run_monitoring_thread", - "handle_uncaught_exceptions", - "git_info", - "hostname", - "notebook_id", - "notebook_path", -] - -# kwargs used in legacy NeptuneLogger from neptune-pytorch-lightning package -_LEGACY_NEPTUNE_LOGGER_KWARGS = [ - "base_namespace", - "close_after_fit", -] - class NeptuneLogger(Logger): r""" @@ -248,9 +209,7 @@ def any_lightning_module_function_or_hook(self): Raises: ModuleNotFoundError: - If required Neptune package in version >=0.9 is not installed on the device. - TypeError: - If configured project has not been migrated to new structure yet. + If required Neptune package is not installed. ValueError: If argument passed to the logger's constructor is incorrect. """ @@ -272,14 +231,13 @@ def __init__( agg_default_func: Optional[Callable[[Sequence[float]], float]] = None, **neptune_run_kwargs: Any, ): - # verify if user passed proper init arguments - self._verify_input_arguments(api_key, project, name, run, neptune_run_kwargs) if neptune is None: raise ModuleNotFoundError( "You want to use the `Neptune` logger which is not installed yet, install it with" " `pip install neptune-client`." 
) - + # verify if user passed proper init arguments + self._verify_input_arguments(api_key, project, name, run, neptune_run_kwargs) super().__init__(agg_key_funcs=agg_key_funcs, agg_default_func=agg_default_func) self._log_model_checkpoints = log_model_checkpoints self._prefix = prefix @@ -347,43 +305,9 @@ def _verify_input_arguments( run: Optional["Run"], neptune_run_kwargs: dict, ) -> None: - legacy_kwargs_msg = ( - "Following kwargs are deprecated: {legacy_kwargs}.\n" - "If you are looking for the Neptune logger using legacy Python API," - " it's still available as part of neptune-contrib package:\n" - " - https://docs-legacy.neptune.ai/integrations/pytorch_lightning.html\n" - "The NeptuneLogger was re-written to use the neptune.new Python API\n" - " - https://neptune.ai/blog/neptune-new\n" - " - https://docs.neptune.ai/integrations-and-supported-tools/model-training/pytorch-lightning\n" - "You should use arguments accepted by either NeptuneLogger.init() or neptune.init()" - ) - - # check if user used legacy kwargs expected in `NeptuneLegacyLogger` - used_legacy_kwargs = [ - legacy_kwarg for legacy_kwarg in neptune_run_kwargs if legacy_kwarg in _LEGACY_NEPTUNE_INIT_KWARGS - ] - if used_legacy_kwargs: - raise ValueError(legacy_kwargs_msg.format(legacy_kwargs=used_legacy_kwargs)) - - # check if user used legacy kwargs expected in `NeptuneLogger` from neptune-pytorch-lightning package - used_legacy_neptune_kwargs = [ - legacy_kwarg for legacy_kwarg in neptune_run_kwargs if legacy_kwarg in _LEGACY_NEPTUNE_LOGGER_KWARGS - ] - if used_legacy_neptune_kwargs: - raise ValueError(legacy_kwargs_msg.format(legacy_kwargs=used_legacy_neptune_kwargs)) - - # check if user passed new client `Run` object + # check if user passed the client `Run` object if run is not None and not isinstance(run, Run): - raise ValueError( - "Run parameter expected to be of type `neptune.new.Run`.\n" - "If you are looking for the Neptune logger using legacy Python API," - " it's still available as part of neptune-contrib package:\n" - " - https://docs-legacy.neptune.ai/integrations/pytorch_lightning.html\n" - "The NeptuneLogger was re-written to use the neptune.new Python API\n" - " - https://neptune.ai/blog/neptune-new\n" - " - https://docs.neptune.ai/integrations-and-supported-tools/model-training/pytorch-lightning\n" - ) - + raise ValueError("Run parameter expected to be of type `neptune.new.Run`.") # check if user passed redundant neptune.init arguments when passed run any_neptune_init_arg_passed = any(arg is not None for arg in [api_key, project, name]) or neptune_run_kwargs if run is not None and any_neptune_init_arg_passed: @@ -435,21 +359,13 @@ def training_step(self, batch, batch_idx): @property # type: ignore[misc] @rank_zero_experiment def run(self) -> Run: - try: - if not self._run_instance: - self._run_instance = neptune.init(**self._neptune_init_args) - self._retrieve_run_data() - # make sure that we've log integration version for newly created - self._run_instance[_INTEGRATION_VERSION_KEY] = pl.__version__ - - return self._run_instance - except NeptuneLegacyProjectException as e: - raise TypeError( - f"Project {self._project_name} has not been migrated to the new structure." 
- " You can still integrate it with the Neptune logger using legacy Python API" - " available as part of neptune-contrib package:" - " https://docs-legacy.neptune.ai/integrations/pytorch_lightning.html\n" - ) from e + if not self._run_instance: + self._run_instance = neptune.init(**self._neptune_init_args) + self._retrieve_run_data() + # make sure that we've log integration version for newly created + self._run_instance[_INTEGRATION_VERSION_KEY] = pl.__version__ + + return self._run_instance @rank_zero_only def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: # skipcq: PYL-W0221 @@ -628,63 +544,3 @@ def version(self) -> Optional[str]: It's Neptune Run's short_id """ return self._run_short_id - - @staticmethod - def _signal_deprecated_api_usage(f_name: str, sample_code: str, raise_exception: bool = False) -> None: - msg_suffix = ( - f"If you are looking for the Neptune logger using legacy Python API," - f" it's still available as part of neptune-contrib package:\n" - f" - https://docs-legacy.neptune.ai/integrations/pytorch_lightning.html\n" - f"The NeptuneLogger was re-written to use the neptune.new Python API\n" - f" - https://neptune.ai/blog/neptune-new\n" - f" - https://docs.neptune.ai/integrations-and-supported-tools/model-training/pytorch-lightning\n" - f"Instead of `logger.{f_name}` you can use:\n" - f"\t{sample_code}" - ) - - if not raise_exception: - warnings.warn( - "The function you've used is deprecated in v1.5.0 and will be removed in v1.7.0. " + msg_suffix - ) - else: - raise ValueError("The function you've used is deprecated.\n" + msg_suffix) - - @rank_zero_only - def log_metric(self, metric_name: str, metric_value: Union[Tensor, float, str], step: Optional[int] = None) -> None: - key = f"{self._prefix}/{metric_name}" - self._signal_deprecated_api_usage("log_metric", f"logger.run['{key}'].log(42)") - if isinstance(metric_value, Tensor): - metric_value = metric_value.cpu().detach() - - self.run[key].log(metric_value, step=step) - - @rank_zero_only - def log_text(self, log_name: str, text: str, step: Optional[int] = None) -> None: - key = f"{self._prefix}/{log_name}" - self._signal_deprecated_api_usage("log_text", f"logger.run['{key}].log('text')") - self.run[key].log(str(text), step=step) - - @rank_zero_only - def log_image(self, log_name: str, image: Union[str, Any], step: Optional[int] = None) -> None: - key = f"{self._prefix}/{log_name}" - self._signal_deprecated_api_usage("log_image", f"logger.run['{key}'].log(File('path_to_image'))") - if isinstance(image, str): - # if `img` is path to file, convert it to file object - image = NeptuneFile(image) - self.run[key].log(image, step=step) - - @rank_zero_only - def log_artifact(self, artifact: str, destination: Optional[str] = None) -> None: - key = f"{self._prefix}/{self.ARTIFACTS_KEY}/{artifact}" - self._signal_deprecated_api_usage("log_artifact", f"logger.run['{key}].log('path_to_file')") - self.run[key].log(destination) - - def set_property(self, *args: Any, **kwargs: Any) -> None: - self._signal_deprecated_api_usage( - "log_artifact", f"logger.run['{self._prefix}/{self.PARAMETERS_KEY}/key'].log(value)", raise_exception=True - ) - - def append_tags(self, *args: Any, **kwargs: Any) -> None: - self._signal_deprecated_api_usage( - "append_tags", "logger.run['sys/tags'].add(['foo', 'bar'])", raise_exception=True - ) diff --git a/tests/tests_pytorch/loggers/test_neptune.py b/tests/tests_pytorch/loggers/test_neptune.py index de3017a33a472..dd7fa60fd97c9 100644 --- 
a/tests/tests_pytorch/loggers/test_neptune.py +++ b/tests/tests_pytorch/loggers/test_neptune.py @@ -320,79 +320,6 @@ def test_save_dir(self, neptune): self.assertEqual(logger.save_dir, os.path.join(os.getcwd(), ".neptune")) -class TestNeptuneLoggerDeprecatedUsages(unittest.TestCase): - @staticmethod - def _assert_legacy_usage(callback, *args, **kwargs): - with pytest.raises(ValueError): - callback(*args, **kwargs) - - def test_legacy_kwargs(self): - legacy_neptune_kwargs = [ - # NeptuneLegacyLogger kwargs - "project_name", - "offline_mode", - "experiment_name", - "experiment_id", - "params", - "properties", - "upload_source_files", - "abort_callback", - "logger", - "upload_stdout", - "upload_stderr", - "send_hardware_metrics", - "run_monitoring_thread", - "handle_uncaught_exceptions", - "git_info", - "hostname", - "notebook_id", - "notebook_path", - # NeptuneLogger from neptune-pytorch-lightning package kwargs - "base_namespace", - "close_after_fit", - ] - for legacy_kwarg in legacy_neptune_kwargs: - self._assert_legacy_usage(NeptuneLogger, **{legacy_kwarg: None}) - - @patch("pytorch_lightning.loggers.neptune.warnings") - @patch("pytorch_lightning.loggers.neptune.NeptuneFile") - @patch("pytorch_lightning.loggers.neptune.neptune") - def test_legacy_functions(self, neptune, neptune_file_mock, warnings_mock): - logger = NeptuneLogger(api_key="test", project="project") - - # test deprecated functions which will be shut down in pytorch-lightning 1.7.0 - attr_mock = logger.run.__getitem__ - attr_mock.reset_mock() - fake_image = {} - - logger.log_metric("metric", 42) - logger.log_text("text", "some string") - logger.log_image("image_obj", fake_image) - logger.log_image("image_str", "img/path") - logger.log_artifact("artifact", "some/path") - - assert attr_mock.call_count == 5 - assert warnings_mock.warn.call_count == 5 - attr_mock.assert_has_calls( - [ - call("training/metric"), - call().log(42, step=None), - call("training/text"), - call().log("some string", step=None), - call("training/image_obj"), - call().log(fake_image, step=None), - call("training/image_str"), - call().log(neptune_file_mock(), step=None), - call("training/artifacts/artifact"), - call().log("some/path"), - ] - ) - - # test Exception raising functions functions - self._assert_legacy_usage(logger.set_property) - self._assert_legacy_usage(logger.append_tags) - - class TestNeptuneLoggerUtils(unittest.TestCase): def test__get_full_model_name(self): # given: From a05c2a833b13f704911111c85e01579fea2d7d81 Mon Sep 17 00:00:00 2001 From: Kushashwa Ravi Shrimali Date: Fri, 16 Sep 2022 22:44:53 +0530 Subject: [PATCH 179/193] Fix boring app test: `debug=True` when running on the cloud (#14751) debug=True for boring_app (dynamic app also has debug=True) Co-authored-by: thomas chaton --- tests/tests_app_examples/test_boring_app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tests_app_examples/test_boring_app.py b/tests/tests_app_examples/test_boring_app.py index aa4c568b4f2a4..e1dfc33d32ef2 100644 --- a/tests/tests_app_examples/test_boring_app.py +++ b/tests/tests_app_examples/test_boring_app.py @@ -13,7 +13,7 @@ def test_boring_app_example_cloud() -> None: with run_app_in_cloud( os.path.join(_PROJECT_ROOT, "examples/app_boring/"), app_name="app_dynamic.py", - debug=False, + debug=True, ) as ( _, view_page, From 47f0d336f14c9d9d3e6ff57b23dceb6271ab183c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 16 Sep 2022 19:25:27 +0200 Subject: [PATCH 180/193] Standalone Lite: Update 
LightningLite (#14726) --- src/lightning_lite/__init__.py | 9 +- src/lightning_lite/connector.py | 3 +- src/lightning_lite/lite.py | 3 - src/lightning_lite/strategies/ddp_spawn.py | 7 +- src/lightning_lite/utilities/distributed.py | 58 ++++++++- src/pytorch_lightning/lite/lite.py | 117 +++++------------- src/pytorch_lightning/lite/wrappers.py | 19 ++- .../overrides/distributed.py | 58 +-------- .../trainer/connectors/data_connector.py | 3 +- tests/tests_lite/test_lite.py | 12 -- tests/tests_pytorch/lite/test_lite.py | 42 +++---- tests/tests_pytorch/lite/test_parity.py | 6 +- tests/tests_pytorch/lite/test_wrappers.py | 20 ++- .../trainer/connectors/test_data_connector.py | 2 +- 14 files changed, 144 insertions(+), 215 deletions(-) delete mode 100644 src/lightning_lite/lite.py delete mode 100644 tests/tests_lite/test_lite.py diff --git a/src/lightning_lite/__init__.py b/src/lightning_lite/__init__.py index 6c16dcbf6c393..dccaeae932c70 100644 --- a/src/lightning_lite/__init__.py +++ b/src/lightning_lite/__init__.py @@ -12,10 +12,15 @@ _logger.addHandler(logging.StreamHandler()) _logger.propagate = False -from lightning_lite.lite import LightningLite # noqa: E402 +# TODO(lite): Re-enable this import +# from lightning_lite.lite import LightningLite from lightning_lite.utilities.seed import seed_everything # noqa: E402 -__all__ = ["LightningLite", "seed_everything"] +__all__ = [ + # TODO(lite): Re-enable this import + # "LightningLite", + "seed_everything", +] # for compatibility with namespace packages __import__("pkg_resources").declare_namespace(__name__) diff --git a/src/lightning_lite/connector.py b/src/lightning_lite/connector.py index 4d512933d029a..80a932eb529cf 100644 --- a/src/lightning_lite/connector.py +++ b/src/lightning_lite/connector.py @@ -184,7 +184,8 @@ def _check_config_and_set_final_flags( if strategy is not None and strategy not in self._registered_strategies and not isinstance(strategy, Strategy): raise ValueError( f"You selected an invalid strategy name: `strategy={strategy!r}`." - f" Available names are: {', '.join(self._registered_strategies)}." + " Example choices: ddp, ddp_spawn, deepspeed, dp, ..." 
+ " Find a complete list of options in our documentation at https://lightning.ai" ) if ( diff --git a/src/lightning_lite/lite.py b/src/lightning_lite/lite.py deleted file mode 100644 index 65fee1bf09834..0000000000000 --- a/src/lightning_lite/lite.py +++ /dev/null @@ -1,3 +0,0 @@ -class LightningLite: - # Placeholder for real implementation - pass diff --git a/src/lightning_lite/strategies/ddp_spawn.py b/src/lightning_lite/strategies/ddp_spawn.py index 3e8b48b2a6b43..def19d4ac0f24 100644 --- a/src/lightning_lite/strategies/ddp_spawn.py +++ b/src/lightning_lite/strategies/ddp_spawn.py @@ -114,10 +114,6 @@ def setup_module(self, module: Module) -> Module: return DistributedDataParallel(module=module, device_ids=self._determine_ddp_device_ids(), **self._ddp_kwargs) def module_to_device(self, module: Module) -> None: - if self.root_device.type == "cuda": - # TODO(lite): This should be handled outside module_to_device, by a call to accelerator.setup_device() - # set the device on the spawned subprocesses - torch.cuda.set_device(self.root_device) module.to(self.root_device) def reduce( @@ -200,8 +196,7 @@ def _setup_distributed(self) -> None: def _get_process_group_backend(self) -> str: return self._process_group_backend or get_default_process_group_backend_for_device(self.root_device) - def _set_world_ranks(self, process_idx: int = 0) -> None: - self._local_rank = process_idx + def _set_world_ranks(self) -> None: if self.cluster_environment is None: return self.cluster_environment.set_global_rank(self.node_rank * self.num_processes + self.local_rank) diff --git a/src/lightning_lite/utilities/distributed.py b/src/lightning_lite/utilities/distributed.py index ed7a05dc4fcfc..43c70800a529d 100644 --- a/src/lightning_lite/utilities/distributed.py +++ b/src/lightning_lite/utilities/distributed.py @@ -1,10 +1,11 @@ import logging import os -from typing import Any, List, Optional, Tuple, Union +from typing import Any, Iterable, Iterator, List, Optional, Sized, Tuple, Union import torch from torch import Tensor from torch.nn import functional as F +from torch.utils.data import Dataset, DistributedSampler, Sampler from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment from lightning_lite.utilities.imports import _HPU_AVAILABLE, _TPU_AVAILABLE @@ -250,3 +251,58 @@ def tpu_distributed() -> bool: def get_default_process_group_backend_for_device(device: torch.device) -> str: return "nccl" if device.type == "cuda" else "gloo" + + +# TODO(lite): The error messages refer to 'replace_sampler_ddp' in PL but Lite has it named 'replace_sampler' +class _DatasetSamplerWrapper(Dataset): + """Dataset to create indexes from `Sampler` or `Iterable`""" + + def __init__(self, sampler: Union[Sampler, Iterable]) -> None: + if not isinstance(sampler, Sized): + raise TypeError( + "You seem to have configured a sampler in your DataLoader which" + " does not provide `__len__` method. The sampler was about to be" + " replaced by `DistributedSamplerWrapper` since `replace_sampler_ddp`" + " is True and you are using distributed training. Either provide `__len__`" + " method in your sampler, remove it from DataLoader or set `replace_sampler_ddp=False`" + " if you want to handle distributed sampling yourself." + ) + if len(sampler) == float("inf"): + raise TypeError( + "You seem to have configured a sampler in your DataLoader which" + " does not provide finite `__len__` method. 
The sampler was about to be" + " replaced by `DistributedSamplerWrapper` since `replace_sampler_ddp`" + " is True and you are using distributed training. Either provide `__len__`" + " method in your sampler which returns a finite number, remove it from DataLoader" + " or set `replace_sampler_ddp=False` if you want to handle distributed sampling yourself." + ) + self._sampler = sampler + # defer materializing an iterator until it is necessary + self._sampler_list: Optional[List[Any]] = None + + def __getitem__(self, index: int) -> Any: + if self._sampler_list is None: + self._sampler_list = list(self._sampler) + return self._sampler_list[index] + + def __len__(self) -> int: + return len(self._sampler) + + def reset(self) -> None: + """Reset the sampler list in order to get new sampling.""" + self._sampler_list = list(self._sampler) + + +class DistributedSamplerWrapper(DistributedSampler): + """Wrapper over ``Sampler`` for distributed training. + + Allows you to use any sampler in distributed mode. It will be automatically used by Lightning in distributed mode if + sampler replacement is enabled. + """ + + def __init__(self, sampler: Union[Sampler, Iterable], *args: Any, **kwargs: Any) -> None: + super().__init__(_DatasetSamplerWrapper(sampler), *args, **kwargs) + + def __iter__(self) -> Iterator: + self.dataset.reset() + return (self.dataset[index] for index in super().__iter__()) diff --git a/src/pytorch_lightning/lite/lite.py b/src/pytorch_lightning/lite/lite.py index 331495e04ce06..7a361231352f6 100644 --- a/src/pytorch_lightning/lite/lite.py +++ b/src/pytorch_lightning/lite/lite.py @@ -25,7 +25,12 @@ from torch.optim import Optimizer from torch.utils.data import BatchSampler, DataLoader, DistributedSampler -from lightning_lite.utilities import _AcceleratorType, _StrategyType, move_data_to_device +from lightning_lite.accelerators.accelerator import Accelerator +from lightning_lite.connector import _Connector, _PLUGIN_INPUT +from lightning_lite.plugins import Precision +from lightning_lite.strategies import DeepSpeedStrategy, Strategy, XLAStrategy +from lightning_lite.strategies.strategy import TBroadcast +from lightning_lite.utilities import move_data_to_device from lightning_lite.utilities.apply_func import convert_to_tensors from lightning_lite.utilities.data import ( _auto_add_worker_init_fn, @@ -33,15 +38,9 @@ _update_dataloader, has_iterable_dataset, ) +from lightning_lite.utilities.distributed import DistributedSamplerWrapper from lightning_lite.utilities.seed import seed_everything -from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.lite.wrappers import _LiteDataLoader, _LiteModule, _LiteOptimizer -from pytorch_lightning.overrides.distributed import DistributedSamplerWrapper -from pytorch_lightning.plugins import PLUGIN_INPUT -from pytorch_lightning.strategies import DeepSpeedStrategy, Strategy, TPUSpawnStrategy -from pytorch_lightning.strategies.strategy import TBroadcast -from pytorch_lightning.trainer.connectors.accelerator_connector import AcceleratorConnector -from pytorch_lightning.utilities.exceptions import MisconfigurationException class LightningLite(ABC): @@ -76,34 +75,23 @@ def __init__( devices: Optional[Union[List[int], str, int]] = None, num_nodes: int = 1, precision: Union[int, str] = 32, - plugins: Optional[Union[PLUGIN_INPUT, List[PLUGIN_INPUT]]] = None, + plugins: Optional[Union[_PLUGIN_INPUT, List[_PLUGIN_INPUT]]] = None, gpus: Optional[Union[List[int], str, int]] = None, tpu_cores: Optional[Union[List[int], str, int]] = 
None, ) -> None: - self._check_accelerator_support(accelerator) - self._check_strategy_support(strategy) - self._accelerator_connector = AcceleratorConnector( - num_processes=None, - devices=devices, - tpu_cores=tpu_cores, - ipus=None, + self._connector = _Connector( accelerator=accelerator, strategy=strategy, - gpus=gpus, + devices=devices, num_nodes=num_nodes, - sync_batchnorm=False, # TODO: add support? - benchmark=False, - replace_sampler_ddp=True, - deterministic=False, precision=precision, - amp_type="native", - amp_level=None, plugins=plugins, - auto_select_gpus=False, + tpu_cores=tpu_cores, + gpus=gpus, ) - self._strategy = self._accelerator_connector.strategy - self._accelerator = self._strategy.accelerator - self._precision_plugin = self._strategy.precision_plugin + self._strategy: Strategy = self._connector.strategy + self._accelerator: Accelerator = self._connector.accelerator + self._precision_plugin: Precision = self._strategy.precision_plugin self._models_setup: int = 0 # wrap the run method so we can inject setup logic or spawn processes for the user @@ -173,7 +161,7 @@ def setup( model = self._move_model_to_device(model=model, optimizers=list(optimizers)) # Let accelerator/plugin wrap and connect the models and optimizers - model, optimizers = self._strategy._setup_model_and_optimizers(model, list(optimizers)) + model, optimizers = self._strategy.setup_module_and_optimizers(model, list(optimizers)) model = _LiteModule(model, self._precision_plugin, original_module=original_model) optimizers = [_LiteOptimizer(optimizer=optimizer, strategy=self._strategy) for optimizer in optimizers] self._models_setup += 1 @@ -234,7 +222,7 @@ def _setup_dataloader( _auto_add_worker_init_fn(dataloader, self.global_rank) dataloader = self._strategy.process_dataloader(dataloader) - device = self.device if move_to_device and not isinstance(self._strategy, TPUSpawnStrategy) else None + device = self.device if move_to_device and not isinstance(self._strategy, XLAStrategy) else None lite_dataloader = _LiteDataLoader(dataloader=dataloader, device=device) lite_dataloader = cast(DataLoader, lite_dataloader) return lite_dataloader @@ -256,20 +244,18 @@ def backward(self, tensor: Tensor, *args: Any, model: Optional[_LiteModule] = No if isinstance(self._strategy, DeepSpeedStrategy): if model is None: if self._models_setup == 0: - raise MisconfigurationException( - "No models were setup for backward. Did you forget to call `self.setup()`?" - ) + raise RuntimeError("No models were set up for backward. Did you forget to call `self.setup()`?") if self._models_setup > 1: - raise MisconfigurationException( + raise ValueError( "When using multiple models + deepspeed, please provide the model used to perform" " the optimization: `self.backward(loss, model=model)`" ) module = self._strategy.model else: # requires to attach the current `DeepSpeedEngine` for the `_LiteOptimizer.step` call. - self._strategy.model = module + self._strategy._deepspeed_engine = module - self._precision_plugin._run_backward(tensor, module, *args, **kwargs) + self._precision_plugin.backward(tensor, module, *args, **kwargs) @contextmanager def autocast(self) -> Generator[None, None, None]: @@ -305,11 +291,9 @@ def to_device(self, obj: Union[nn.Module, Tensor, Any]) -> Union[nn.Module, Tens A reference to the object that was moved to the new device. 
""" if isinstance(obj, nn.Module): - if self.device.type == "cuda": - # need to call this manually here again in case we spawned with DDPSpawnStrategy - # TODO: refactor to let accelerator handle this cleanly (see Accelerator.setup_device) - torch.cuda.set_device(self.device) - return obj.to(self.device) + self._accelerator.setup_device(self.device) + self._strategy.module_to_device(obj) + return obj return move_data_to_device(obj, device=self.device) def print(self, *args: Any, **kwargs: Any) -> None: @@ -404,13 +388,13 @@ def _run_impl(self, run_method: Callable, *args: Any, **kwargs: Any) -> Any: def _run_with_strategy_setup(self, run_method: Callable, *args: Any, **kwargs: Any) -> Any: self._strategy.setup_environment() - with self._strategy.model_sharded_context(), _replace_dunder_methods( + with self._strategy.module_sharded_context(), _replace_dunder_methods( DataLoader, "dataset" ), _replace_dunder_methods(BatchSampler): return run_method(*args, **kwargs) def _move_model_to_device(self, model: nn.Module, optimizers: List[Optimizer]) -> nn.Module: - if isinstance(self._strategy, TPUSpawnStrategy): + if isinstance(self._strategy, XLAStrategy): # When the user creates the optimizer, they reference the parameters on the CPU. # However, when running with TPU the parameters get copied and the reference in the optimizer # remains invalid. We need to update the references to point to the parameter tensors on the device. @@ -429,7 +413,7 @@ def _move_model_to_device(self, model: nn.Module, optimizers: List[Optimizer]) - def _requires_distributed_sampler(self, dataloader: DataLoader) -> bool: return ( - self._accelerator_connector.is_distributed + self._connector.is_distributed and not isinstance(dataloader.sampler, DistributedSampler) and not has_iterable_dataset(dataloader) ) @@ -439,57 +423,18 @@ def _get_distributed_sampler(dataloader: DataLoader, **kwargs: Any) -> Distribut kwargs.setdefault("seed", int(os.getenv("PL_GLOBAL_SEED", 0))) return DistributedSamplerWrapper(dataloader.sampler, **kwargs) - def _check_accelerator_support(self, accelerator: Optional[Union[str, Accelerator]]) -> None: - supported = [t.value.lower() for t in self._supported_device_types()] + ["gpu", "auto"] - valid = accelerator is None or isinstance(accelerator, Accelerator) or accelerator in supported - if not valid: - raise MisconfigurationException( - f"`accelerator={repr(accelerator)}` is not a valid choice." - f" Choose one of {supported} or pass in a `Accelerator` instance." - ) - - def _check_strategy_support(self, strategy: Optional[Union[str, Strategy]]) -> None: - supported = [t.lower() for t in self._supported_strategy_types()] - valid = strategy is None or isinstance(strategy, Strategy) or strategy in supported - if not valid: - raise MisconfigurationException( - f"`strategy={repr(strategy)}` is not a valid choice." - f" Choose one of {supported} or pass in a `Strategy` instance." 
- ) - - @staticmethod - def _supported_device_types() -> Sequence[_AcceleratorType]: - return ( - _AcceleratorType.CPU, - _AcceleratorType.CUDA, - _AcceleratorType.TPU, - _AcceleratorType.MPS, - ) - - @staticmethod - def _supported_strategy_types() -> Sequence[_StrategyType]: - return ( - _StrategyType.DP, - _StrategyType.DDP, - _StrategyType.DDP_SPAWN, - _StrategyType.DDP_FORK, - _StrategyType.DEEPSPEED, - _StrategyType.DDP_SHARDED, - _StrategyType.DDP_SHARDED_SPAWN, - ) - @staticmethod def _validate_setup(model: nn.Module, optimizers: Sequence[Optimizer]) -> None: if isinstance(model, _LiteModule): - raise MisconfigurationException("A model should be passed only once to the `setup` method.") + raise ValueError("A model should be passed only once to the `setup` method.") if any(isinstance(opt, _LiteOptimizer) for opt in optimizers): - raise MisconfigurationException("An optimizer should be passed only once to the `setup` method.") + raise ValueError("An optimizer should be passed only once to the `setup` method.") @staticmethod def _validate_setup_dataloaders(dataloaders: Sequence[DataLoader]) -> None: if any(isinstance(dl, _LiteDataLoader) for dl in dataloaders): - raise MisconfigurationException("A dataloader should be passed only once to the `setup_dataloaders` method") + raise ValueError("A dataloader should be passed only once to the `setup_dataloaders` method") if any(not isinstance(dl, DataLoader) for dl in dataloaders): - raise MisconfigurationException("Only PyTorch DataLoader are currently supported in `setup_dataloaders`.") + raise TypeError("Only PyTorch DataLoader are currently supported in `setup_dataloaders`.") diff --git a/src/pytorch_lightning/lite/wrappers.py b/src/pytorch_lightning/lite/wrappers.py index 0c3924694cc06..aed21b3aa5192 100644 --- a/src/pytorch_lightning/lite/wrappers.py +++ b/src/pytorch_lightning/lite/wrappers.py @@ -21,18 +21,14 @@ from torch.optim import Optimizer from torch.utils.data import DataLoader +from lightning_lite.plugins import Precision +from lightning_lite.strategies import Strategy from lightning_lite.utilities.apply_func import move_data_to_device from lightning_lite.utilities.device_dtype_mixin import _DeviceDtypeModuleMixin -from pytorch_lightning.plugins import PrecisionPlugin -from pytorch_lightning.strategies import Strategy T_destination = TypeVar("T_destination", bound=Dict[str, Any]) -def _do_nothing_closure() -> None: - return None - - class _LiteOptimizer: def __init__(self, optimizer: Optimizer, strategy: Strategy) -> None: """LiteOptimizer is a thin wrapper around the :class:`~torch.optim.Optimizer` that delegates the optimizer @@ -56,21 +52,20 @@ def optimizer(self) -> Optimizer: return self._optimizer def state_dict(self) -> Dict[str, Tensor]: - return self._strategy.optimizer_state(self.optimizer) + return self._strategy.get_optimizer_state(self.optimizer) def step(self, closure: Optional[Callable] = None) -> Any: - closure = closure or _do_nothing_closure + kwargs = dict(closure=closure) if closure is not None else {} return self._strategy.optimizer_step( self.optimizer, - opt_idx=0, - closure=closure, - model=self._strategy.model, + model=getattr(self._strategy, "model", None), + **kwargs, ) class _LiteModule(_DeviceDtypeModuleMixin): def __init__( - self, forward_module: nn.Module, precision_plugin: PrecisionPlugin, original_module: Optional[nn.Module] = None + self, forward_module: nn.Module, precision_plugin: Precision, original_module: Optional[nn.Module] = None ) -> None: """The LiteModule is a thin wrapper around 
the :class:`torch.nn.Module` and handles precision / autocast automatically for the forward pass. diff --git a/src/pytorch_lightning/overrides/distributed.py b/src/pytorch_lightning/overrides/distributed.py index 3ecac8c1eea04..5a38742972925 100644 --- a/src/pytorch_lightning/overrides/distributed.py +++ b/src/pytorch_lightning/overrides/distributed.py @@ -17,11 +17,11 @@ import torch from torch import Tensor from torch.nn.parallel import DistributedDataParallel -from torch.utils.data import BatchSampler, Dataset, DistributedSampler, Sampler +from torch.utils.data import BatchSampler, DistributedSampler, Sampler import pytorch_lightning as pl +from lightning_lite.utilities.distributed import _DatasetSamplerWrapper from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase -from pytorch_lightning.utilities.exceptions import MisconfigurationException class LightningDistributedModule(_LightningModuleWrapperBase): @@ -109,60 +109,6 @@ def __iter__(self) -> Iterator[List[int]]: return iter(indices) -class _DatasetSamplerWrapper(Dataset): - """Dataset to create indexes from `Sampler` or `Iterable`""" - - def __init__(self, sampler: Union[Sampler, Iterable]) -> None: - if not isinstance(sampler, Sized): - raise MisconfigurationException( - "You seem to have configured a sampler in your DataLoader which" - " does not provide `__len__` method. The sampler was about to be" - " replaced by `DistributedSamplerWrapper` since `replace_sampler_ddp`" - " is True and you are using distributed training. Either provide `__len__`" - " method in your sampler, remove it from DataLoader or set `replace_sampler_ddp=False`" - " if you want to handle distributed sampling yourself." - ) - if len(sampler) == float("inf"): - raise MisconfigurationException( - "You seem to have configured a sampler in your DataLoader which" - " does not provide finite `__len__` method. The sampler was about to be" - " replaced by `DistributedSamplerWrapper` since `replace_sampler_ddp`" - " is True and you are using distributed training. Either provide `__len__`" - " method in your sampler which returns a finite number, remove it from DataLoader" - " or set `replace_sampler_ddp=False` if you want to handle distributed sampling yourself." - ) - self._sampler = sampler - # defer materializing an iterator until it is necessary - self._sampler_list: Optional[List[Any]] = None - - def __getitem__(self, index: int) -> Any: - if self._sampler_list is None: - self._sampler_list = list(self._sampler) - return self._sampler_list[index] - - def __len__(self) -> int: - return len(self._sampler) - - def reset(self) -> None: - """Reset the sampler list in order to get new sampling.""" - self._sampler_list = list(self._sampler) - - -class DistributedSamplerWrapper(DistributedSampler): - """Wrapper over ``Sampler`` for distributed training. - - Allows you to use any sampler in distributed mode. 
It will be automatically used by PyTorch Lightning in distributed - mode if `replace_sampler_ddp=True` - """ - - def __init__(self, sampler: Union[Sampler, Iterable], *args: Any, **kwargs: Any) -> None: - super().__init__(_DatasetSamplerWrapper(sampler), *args, **kwargs) - - def __iter__(self) -> Iterator: - self.dataset.reset() - return (self.dataset[index] for index in super().__iter__()) - - class UnrepeatedDistributedSamplerWrapper(UnrepeatedDistributedSampler): """Equivalent class to ``DistributedSamplerWrapper`` but for the ``UnrepeatedDistributedSampler``.""" diff --git a/src/pytorch_lightning/trainer/connectors/data_connector.py b/src/pytorch_lightning/trainer/connectors/data_connector.py index b2a6dbe0c8a5a..7543172de9450 100644 --- a/src/pytorch_lightning/trainer/connectors/data_connector.py +++ b/src/pytorch_lightning/trainer/connectors/data_connector.py @@ -24,8 +24,9 @@ import pytorch_lightning as pl from lightning_lite.utilities.data import _auto_add_worker_init_fn, _replace_dunder_methods, has_iterable_dataset +from lightning_lite.utilities.distributed import DistributedSamplerWrapper from pytorch_lightning.accelerators.ipu import IPUAccelerator -from pytorch_lightning.overrides.distributed import DistributedSamplerWrapper, UnrepeatedDistributedSamplerWrapper +from pytorch_lightning.overrides.distributed import UnrepeatedDistributedSamplerWrapper from pytorch_lightning.strategies import DDPSpawnStrategy from pytorch_lightning.trainer.states import RunningStage, TrainerFn from pytorch_lightning.trainer.supporters import CombinedLoader, CycleIterator diff --git a/tests/tests_lite/test_lite.py b/tests/tests_lite/test_lite.py deleted file mode 100644 index a7df3089cb5ac..0000000000000 --- a/tests/tests_lite/test_lite.py +++ /dev/null @@ -1,12 +0,0 @@ -from tests_lite.helpers.runif import RunIf - -from lightning_lite.lite import LightningLite # noqa: F401 - - -def test_placeholder(tmpdir): - assert True - - -@RunIf(min_cuda_gpus=2, standalone=True) -def test_placeholder_standalone(tmpdir): - assert True diff --git a/tests/tests_pytorch/lite/test_lite.py b/tests/tests_pytorch/lite/test_lite.py index e7b5c61a67727..8b8c999580e25 100644 --- a/tests/tests_pytorch/lite/test_lite.py +++ b/tests/tests_pytorch/lite/test_lite.py @@ -23,13 +23,13 @@ from torch import nn from torch.utils.data import DataLoader, DistributedSampler, Sampler +from lightning_lite.plugins import Precision +from lightning_lite.strategies import DeepSpeedStrategy, Strategy from lightning_lite.utilities import _StrategyType +from lightning_lite.utilities.exceptions import MisconfigurationException from lightning_lite.utilities.seed import pl_worker_init_function from pytorch_lightning.lite import LightningLite from pytorch_lightning.lite.wrappers import _LiteDataLoader, _LiteModule, _LiteOptimizer -from pytorch_lightning.plugins import PrecisionPlugin -from pytorch_lightning.strategies import DeepSpeedStrategy, Strategy -from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests_pytorch.helpers.runif import RunIf @@ -48,18 +48,6 @@ def forward(self, x): return torch.nn.functional.mse_loss(x, torch.ones_like(x)) -def test_unsupported_accelerator(): - accelerator = "coconut" - with pytest.raises(MisconfigurationException, match=f"`accelerator={repr(accelerator)}` is not a valid choice"): - EmptyLite(accelerator=accelerator) - - -def test_unsupported_strategy(): - strategy = "coconut" - with pytest.raises(MisconfigurationException, match=f"`strategy={repr(strategy)}` is not a valid 
choice"): - EmptyLite(strategy=strategy) - - def test_run_input_output(): """Test that the dynamically patched run() method receives the input arguments and returns the result.""" @@ -80,7 +68,7 @@ def run(self, *args, **kwargs): assert lite.run_kwargs == {"three": 3} -@mock.patch("pytorch_lightning.strategies.ddp.DistributedDataParallel") +@mock.patch("lightning_lite.strategies.ddp.DistributedDataParallel") def test_setup_model(ddp_mock): """Test that the setup method lets the strategy wrap the model, but keeps a reference to the original model.""" lite = EmptyLite(accelerator="cpu", strategy="ddp", devices=2) @@ -128,11 +116,11 @@ def test_setup_twice_fails(): optimizer = torch.optim.Adam(model.parameters()) lite_model, lite_optimizer = lite.setup(model, optimizer) - with pytest.raises(MisconfigurationException, match="A model should be passed only once to the"): + with pytest.raises(ValueError, match="A model should be passed only once to the"): lite.setup(lite_model, optimizer) lite_model, lite_optimizer = lite.setup(model, optimizer) - with pytest.raises(MisconfigurationException, match="An optimizer should be passed only once to the"): + with pytest.raises(ValueError, match="An optimizer should be passed only once to the"): lite.setup(model, lite_optimizer) @@ -153,7 +141,7 @@ def test_setup_tracks_num_models(): def test_setup_dataloaders_unsupported_type(): """Test that the setup_dataloaders method fails when provided with non-DataLoader objects.""" lite = EmptyLite() - with pytest.raises(MisconfigurationException, match="Only PyTorch DataLoader are currently supported"): + with pytest.raises(TypeError, match="Only PyTorch DataLoader are currently supported"): lite.setup_dataloaders(range(2)) # type: ignore @@ -217,7 +205,7 @@ def test_setup_dataloaders_twice_fails(): dataloader = DataLoader(range(2)) lite_dataloader = lite.setup_dataloaders(dataloader) - with pytest.raises(MisconfigurationException, match="A dataloader should be passed only once to the"): + with pytest.raises(ValueError, match="A dataloader should be passed only once to the"): lite.setup_dataloaders(lite_dataloader) @@ -282,8 +270,8 @@ def test_setup_dataloaders_replace_custom_sampler(strategy): # explicitly asking to replace when a custom sampler is already configured raises an exception lite = EmptyLite(accelerator="cpu", strategy=strategy, devices=2) - if lite._accelerator_connector.is_distributed: - with pytest.raises(MisconfigurationException, match="You seem to have configured a sampler in your DataLoader"): + if lite._connector.is_distributed: + with pytest.raises(TypeError, match="You seem to have configured a sampler in your DataLoader"): lite.setup_dataloaders(dataloader, replace_sampler=True) # setting `replace_sampler=False` leaves the sampler untouched @@ -307,7 +295,7 @@ def test_setup_dataloaders_replace_custom_sampler(strategy): def test_setup_dataloaders_replace_standard_sampler(shuffle, strategy): """Test that Lite replaces the default samplers with DistributedSampler automatically.""" lite = EmptyLite(accelerator="cpu", strategy=strategy, devices=2) - is_distributed = lite._accelerator_connector.is_distributed + is_distributed = lite._connector.is_distributed lite_dataloader = lite.setup_dataloaders(DataLoader(range(3), shuffle=shuffle)) assert not is_distributed or isinstance(lite_dataloader.sampler, DistributedSampler) @@ -366,10 +354,10 @@ def test_rank_properties(): def test_backward(): """Test that backward() calls into the precision plugin.""" lite = EmptyLite() - lite._precision_plugin = 
Mock(spec=PrecisionPlugin) + lite._precision_plugin = Mock(spec=Precision) loss = Mock() lite.backward(loss, "arg", keyword="kwarg") - lite._precision_plugin._run_backward.assert_called_with(loss, None, "arg", keyword="kwarg") + lite._precision_plugin.backward.assert_called_with(loss, None, "arg", keyword="kwarg") @RunIf(deepspeed=True) @@ -383,14 +371,14 @@ def test_backward_model_input_required(): optimizer0 = torch.optim.Adam(model0.parameters()) optimizer1 = torch.optim.Adam(model1.parameters()) - lite._strategy._setup_model_and_optimizer = lambda *args: args + lite._strategy.setup_module_and_optimizers = lambda *args: args lite.setup(model0, optimizer0) lite.setup(model1, optimizer1) loss = model0(torch.randn(1, 1)).sum() - with pytest.raises(MisconfigurationException, match="please provide the model used to perform"): + with pytest.raises(ValueError, match="please provide the model used to perform"): lite.backward(loss) diff --git a/tests/tests_pytorch/lite/test_parity.py b/tests/tests_pytorch/lite/test_parity.py index eaada992da497..ffb95855154cb 100644 --- a/tests/tests_pytorch/lite/test_parity.py +++ b/tests/tests_pytorch/lite/test_parity.py @@ -133,10 +133,12 @@ def test_boring_lite_model_single_device(precision, strategy, devices, accelerat state_dict = apply_to_collection(state_dict, torch.Tensor, lite.to_device) for w_pure, w_lite in zip(state_dict.values(), lite_state_dict.values()): - assert not torch.equal(w_pure, w_lite) + # TODO: This should be torch.equal, but MPS does not yet support this operation (torch 1.12) + assert not torch.allclose(w_pure, w_lite) for w_pure, w_lite in zip(pure_state_dict.values(), lite_state_dict.values()): - assert torch.equal(w_pure, w_lite) + # TODO: This should be torch.equal, but MPS does not yet support this operation (torch 1.12) + assert torch.allclose(w_pure, w_lite) def run(rank, model, train_dataloader, num_epochs, precision, accelerator, tmpdir): diff --git a/tests/tests_pytorch/lite/test_wrappers.py b/tests/tests_pytorch/lite/test_wrappers.py index acc05cfdcda8f..c4fc83bf99145 100644 --- a/tests/tests_pytorch/lite/test_wrappers.py +++ b/tests/tests_pytorch/lite/test_wrappers.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from unittest.mock import ANY, Mock +from unittest.mock import Mock import pytest import torch @@ -222,10 +222,12 @@ def test_lite_dataloader_device_placement(src_device_str, dest_device_str): iterator = iter(lite_dataloader) batch0 = next(iterator) - assert torch.equal(batch0, torch.tensor([0, 1], device=dest_device)) + # TODO: This should be torch.equal, but not supported on MPS at this time (torch 1.12) + assert torch.allclose(batch0, torch.tensor([0, 1], device=dest_device)) batch1 = next(iterator) - assert torch.equal(batch1["data"], torch.tensor([2, 3], device=dest_device)) + # TODO: This should be torch.equal, but not supported on MPS at this time (torch 1.12) + assert torch.allclose(batch1["data"], torch.tensor([2, 3], device=dest_device)) def test_lite_optimizer_wraps(): @@ -243,7 +245,7 @@ def test_lite_optimizer_state_dict(): strategy = Mock() lite_optimizer = _LiteOptimizer(optimizer=optimizer, strategy=strategy) lite_optimizer.state_dict() - strategy.optimizer_state.assert_called_with(optimizer) + strategy.get_optimizer_state.assert_called_with(optimizer) def test_lite_optimizer_steps(): @@ -255,4 +257,12 @@ def test_lite_optimizer_steps(): step_output = lite_optimizer.step() assert step_output == 123 strategy.optimizer_step.assert_called_once() - strategy.optimizer_step.assert_called_with(optimizer, opt_idx=0, closure=ANY, model=strategy.model) + strategy.optimizer_step.assert_called_with(optimizer, model=strategy.model) + + strategy.optimizer_step.reset_mock() + + # with closure as input + closure = Mock() + lite_optimizer.step(closure=closure) + strategy.optimizer_step.assert_called_once() + strategy.optimizer_step.assert_called_with(optimizer, model=strategy.model, closure=closure) diff --git a/tests/tests_pytorch/trainer/connectors/test_data_connector.py b/tests/tests_pytorch/trainer/connectors/test_data_connector.py index 847922c05294a..ea5b825283680 100644 --- a/tests/tests_pytorch/trainer/connectors/test_data_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_data_connector.py @@ -21,10 +21,10 @@ from torch import Tensor from torch.utils.data import BatchSampler, DataLoader, DistributedSampler, Sampler, SequentialSampler +from lightning_lite.utilities.distributed import DistributedSamplerWrapper from lightning_lite.utilities.warnings import PossibleUserWarning from pytorch_lightning import Trainer from pytorch_lightning.demos.boring_classes import BoringDataModule, BoringModel, RandomDataset -from pytorch_lightning.overrides.distributed import DistributedSamplerWrapper from pytorch_lightning.strategies import DDPSpawnStrategy from pytorch_lightning.trainer.connectors.data_connector import _DataHookSelector, _DataLoaderSource, warning_cache from pytorch_lightning.trainer.states import RunningStage, TrainerFn From d9be959bc22b9c5fa4c5225370223eca75e27718 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 16 Sep 2022 20:11:00 +0200 Subject: [PATCH 181/193] fix: keep pre versions (#14752) --- .actions/assistant.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/.actions/assistant.py b/.actions/assistant.py index 38c3d8cebd763..96a7602707b5f 100644 --- a/.actions/assistant.py +++ b/.actions/assistant.py @@ -4,7 +4,7 @@ import os import re import shutil -from distutils.version import LooseVersion, StrictVersion +from distutils.version import LooseVersion from importlib.util import module_from_spec, spec_from_file_location from itertools import chain from pathlib import Path @@ -16,6 +16,7 @@ import fire import 
pkg_resources +from packaging.version import parse as version_parse REQUIREMENT_FILES = { "pytorch": ( @@ -30,15 +31,20 @@ PACKAGE_MAPPING = {"app": "lightning-app", "pytorch": "pytorch-lightning"} -def pypi_versions(package_name: str) -> List[str]: - """Return a list of released versions of a provided pypi name.""" +def pypi_versions(package_name: str, drop_pre: bool = True) -> List[str]: + """Return a list of released versions of a provided pypi name. + + >>> _ = pypi_versions("lightning_app", drop_pre=False) + """ # https://stackoverflow.com/a/27239645/4521646 url = f"https://pypi.org/pypi/{package_name}/json" data = json.load(urlopen(Request(url))) versions = list(data["releases"].keys()) # todo: drop this line after cleaning Pypi history from invalid versions - versions = list(filter(lambda v: v.count(".") == 2 and "rc" not in v, versions)) - versions.sort(key=StrictVersion) + versions = list(filter(lambda v: v.count(".") == 2, versions)) + if drop_pre: + versions = list(filter(lambda v: all(c not in v for c in ["rc", "dev"]), versions)) + versions.sort(key=version_parse) return versions @@ -122,7 +128,7 @@ def download_package(package: str, folder: str = ".", version: Optional[str] = N url = f"https://pypi.org/pypi/{PACKAGE_MAPPING[package]}/json" data = json.load(urlopen(Request(url))) if not version: - pypi_vers = pypi_versions(PACKAGE_MAPPING[package]) + pypi_vers = pypi_versions(PACKAGE_MAPPING[package], drop_pre=False) version = pypi_vers[-1] releases = list(filter(lambda r: r["packagetype"] == "sdist", data["releases"][version])) assert releases, f"Missing 'sdist' for this package/version aka {package}/{version}" From e113d3814b62740d787cf729a278c183d3b9e3bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 16 Sep 2022 20:12:46 +0200 Subject: [PATCH 182/193] Explicitly set which Probot job to run (#14756) --- .github/workflows/probot-auto-cc.yml | 2 ++ .github/workflows/probot-check-group.yml | 1 + 2 files changed, 3 insertions(+) diff --git a/.github/workflows/probot-auto-cc.yml b/.github/workflows/probot-auto-cc.yml index 585befc937cc0..6de393f41cfd8 100644 --- a/.github/workflows/probot-auto-cc.yml +++ b/.github/workflows/probot-auto-cc.yml @@ -18,3 +18,5 @@ jobs: - uses: carmocca/probot@v2 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + job: auto-cc diff --git a/.github/workflows/probot-check-group.yml b/.github/workflows/probot-check-group.yml index 28d4b3994db02..855c82d38e3fd 100644 --- a/.github/workflows/probot-check-group.yml +++ b/.github/workflows/probot-check-group.yml @@ -19,4 +19,5 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: + job: check-group interval: 180 # seconds From 5aaab22fb3f7d02b79003e25f1fe2b64dcf130ee Mon Sep 17 00:00:00 2001 From: Kushashwa Ravi Shrimali Date: Sat, 17 Sep 2022 00:34:21 +0530 Subject: [PATCH 183/193] Bump Lightning Cloud to 0.5.7 (#14757) * Bump Lightning Cloud to 0.5.7 :tada: * Fix link in changelog Co-authored-by: Sherin Thomas --- requirements/app/base.txt | 2 +- src/lightning_app/CHANGELOG.md | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/requirements/app/base.txt b/requirements/app/base.txt index 3e5ec44be652d..1829cd7d0bf3e 100644 --- a/requirements/app/base.txt +++ b/requirements/app/base.txt @@ -1,4 +1,4 @@ -lightning-cloud==0.5.6 +lightning-cloud==0.5.7 packaging deepdiff>=5.7.0, <=5.8.1 starsessions>=1.2.1, <2.0 # strict diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 5b05863b15fbf..5190f7c4f22ae 100644 --- 
a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -24,6 +24,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed +- Making threadpool non default from LightningCloud client ([#14757](https://github.com/Lightning-AI/lightning/pull/14757)) + - Resolved a bug where the state change detection using DeepDiff won't worked with Path, Drive objects ([#14465](https://github.com/Lightning-AI/lightning/pull/14465)) - Resolved a bug where the wrong client was passed to collect cloud logs ([#14684](https://github.com/Lightning-AI/lightning/pull/14684)) From 35c65b028714f58978d50ef1a5cb2fe4e5be8fa0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 16 Sep 2022 21:21:36 +0200 Subject: [PATCH 184/193] Fix test suite when running on MPS-enabled hardware (#14708) --- _notebooks | 2 +- tests/tests_lite/accelerators/test_mps.py | 13 +++- .../deprecated_api/test_remove_1-8.py | 1 + tests/tests_pytorch/lite/test_wrappers.py | 10 ++-- tests/tests_pytorch/models/test_gpu.py | 8 ++- .../tests_pytorch/plugins/test_amp_plugins.py | 1 + .../plugins/test_cluster_integration.py | 3 +- tests/tests_pytorch/strategies/test_ddp.py | 4 +- .../connectors/test_accelerator_connector.py | 21 +++---- .../properties/test_auto_gpu_select.py | 1 + .../test_estimated_stepping_batches.py | 1 + .../tests_pytorch/trainer/test_supporters.py | 5 +- tests/tests_pytorch/trainer/test_trainer.py | 59 ++++++++++--------- 13 files changed, 71 insertions(+), 58 deletions(-) diff --git a/_notebooks b/_notebooks index 8a36a41548f34..6d5634b794218 160000 --- a/_notebooks +++ b/_notebooks @@ -1 +1 @@ -Subproject commit 8a36a41548f34c44ac455d515a72994487e85813 +Subproject commit 6d5634b7942180e6ba4a30bfbd74926d1c22f1eb diff --git a/tests/tests_lite/accelerators/test_mps.py b/tests/tests_lite/accelerators/test_mps.py index f6148ff7ff3f8..9b67ee5ffdbf3 100644 --- a/tests/tests_lite/accelerators/test_mps.py +++ b/tests/tests_lite/accelerators/test_mps.py @@ -16,6 +16,7 @@ from tests_lite.helpers.runif import RunIf from lightning_lite.accelerators.mps import MPSAccelerator +from lightning_lite.utilities.exceptions import MisconfigurationException _MAYBE_MPS = "mps" if MPSAccelerator.is_available() else "cpu" # torch.device(mps) only works on torch>=1.12 @@ -39,11 +40,17 @@ def test_init_device_with_wrong_device_type(): "devices,expected", [ (1, [torch.device(_MAYBE_MPS, 0)]), - (2, [torch.device(_MAYBE_MPS, 0), torch.device(_MAYBE_MPS, 1)]), ([0], [torch.device(_MAYBE_MPS, 0)]), - # TODO(lite): This case passes with the implementation from PL, but looks like a bug - ([0, 2], [torch.device(_MAYBE_MPS, 0), torch.device(_MAYBE_MPS, 1)]), + ("1", [torch.device(_MAYBE_MPS, 0)]), + ("0,", [torch.device(_MAYBE_MPS, 0)]), ], ) def test_get_parallel_devices(devices, expected): assert MPSAccelerator.get_parallel_devices(devices) == expected + + +@RunIf(mps=True) +@pytest.mark.parametrize("devices", [2, [0, 2], "2", "0,2"]) +def test_get_parallel_devices_invalid_request(devices): + with pytest.raises(MisconfigurationException, match="But your machine only has"): + MPSAccelerator.get_parallel_devices(devices) diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py index eb87ee75ef786..b6d21896715dc 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py @@ -689,6 +689,7 @@ def on_save_checkpoint(self, trainer, pl_module, checkpoint): def 
test_trainer_gpus(monkeypatch, trainer_kwargs): monkeypatch.setattr(device_parser, "is_cuda_available", lambda: True) monkeypatch.setattr(device_parser, "num_cuda_devices", lambda: 4) + monkeypatch.setattr(device_parser, "_get_all_available_mps_gpus", lambda: list(range(4))) trainer = Trainer(**trainer_kwargs) with pytest.deprecated_call( match=( diff --git a/tests/tests_pytorch/lite/test_wrappers.py b/tests/tests_pytorch/lite/test_wrappers.py index c4fc83bf99145..0589ac64d9078 100644 --- a/tests/tests_pytorch/lite/test_wrappers.py +++ b/tests/tests_pytorch/lite/test_wrappers.py @@ -204,7 +204,7 @@ def test_lite_dataloader_iterator(): ("cpu", "cpu"), pytest.param("cpu", "cuda:0", marks=RunIf(min_cuda_gpus=1)), pytest.param("cuda:0", "cpu", marks=RunIf(min_cuda_gpus=1)), - pytest.param("cpu", "mps", marks=RunIf(mps=True)), + # pytest.param("cpu", "mps", marks=RunIf(mps=True)), # TODO: Add once torch.equal is supported pytest.param("mps", "cpu", marks=RunIf(mps=True)), ], ) @@ -222,12 +222,12 @@ def test_lite_dataloader_device_placement(src_device_str, dest_device_str): iterator = iter(lite_dataloader) batch0 = next(iterator) - # TODO: This should be torch.equal, but not supported on MPS at this time (torch 1.12) - assert torch.allclose(batch0, torch.tensor([0, 1], device=dest_device)) + # TODO: torch.equal is not supported on MPS at this time (torch 1.12) + assert torch.equal(batch0, torch.tensor([0, 1], device=dest_device)) batch1 = next(iterator) - # TODO: This should be torch.equal, but not supported on MPS at this time (torch 1.12) - assert torch.allclose(batch1["data"], torch.tensor([2, 3], device=dest_device)) + # TODO: torch.equal is not supported on MPS at this time (torch 1.12) + assert torch.equal(batch1["data"], torch.tensor([2, 3], device=dest_device)) def test_lite_optimizer_wraps(): diff --git a/tests/tests_pytorch/models/test_gpu.py b/tests/tests_pytorch/models/test_gpu.py index b01377f1be3f5..fb71145e26c4f 100644 --- a/tests/tests_pytorch/models/test_gpu.py +++ b/tests/tests_pytorch/models/test_gpu.py @@ -91,7 +91,6 @@ def device_count(): monkeypatch.setattr(device_parser, "num_cuda_devices", device_count) -# Asking for a gpu when non are available will result in a MisconfigurationException @pytest.mark.parametrize( ["devices", "expected_root_gpu", "strategy"], [ @@ -104,8 +103,11 @@ def device_count(): ("-1", None, "ddp"), ], ) -def test_root_gpu_property_0_raising(mocked_device_count_0, devices, expected_root_gpu, strategy): - with pytest.raises(MisconfigurationException): +@mock.patch("lightning_lite.accelerators.mps.MPSAccelerator.is_available", return_value=False) +@mock.patch("lightning_lite.accelerators.cuda.CUDAAccelerator.is_available", return_value=False) +def test_root_gpu_property_0_raising(_, __, devices, expected_root_gpu, strategy): + """Test that asking for a GPU when none are available will result in a MisconfigurationException.""" + with pytest.raises(MisconfigurationException, match="No supported gpu backend found!"): Trainer(accelerator="gpu", devices=devices, strategy=strategy) diff --git a/tests/tests_pytorch/plugins/test_amp_plugins.py b/tests/tests_pytorch/plugins/test_amp_plugins.py index a7efe0ec75fdd..087941855a7f3 100644 --- a/tests/tests_pytorch/plugins/test_amp_plugins.py +++ b/tests/tests_pytorch/plugins/test_amp_plugins.py @@ -39,6 +39,7 @@ class MyApexPlugin(ApexMixedPrecisionPlugin): pass +@RunIf(mps=False) @mock.patch.dict( os.environ, { diff --git a/tests/tests_pytorch/plugins/test_cluster_integration.py 
b/tests/tests_pytorch/plugins/test_cluster_integration.py index 4427551e4a206..1cef8c0dc3ec3 100644 --- a/tests/tests_pytorch/plugins/test_cluster_integration.py +++ b/tests/tests_pytorch/plugins/test_cluster_integration.py @@ -87,7 +87,8 @@ def test_ranks_available_manual_strategy_selection(mock_gpu_acc_available, strat ) @mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) @mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=4) -def test_ranks_available_automatic_strategy_selection(mock0, mock1, trainer_kwargs): +@mock.patch("lightning_lite.utilities.device_parser._get_all_available_mps_gpus", return_value=list(range(4))) +def test_ranks_available_automatic_strategy_selection(_, __, ___, trainer_kwargs): """Test that the rank information is readily available after Trainer initialization.""" num_nodes = 2 trainer_kwargs.update(num_nodes=num_nodes) diff --git a/tests/tests_pytorch/strategies/test_ddp.py b/tests/tests_pytorch/strategies/test_ddp.py index a69ede050a8e7..6b85e1564aab3 100644 --- a/tests/tests_pytorch/strategies/test_ddp.py +++ b/tests/tests_pytorch/strategies/test_ddp.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import os from unittest import mock import pytest @@ -58,9 +57,8 @@ def test_multi_gpu_model_ddp_fit_test(tmpdir): @RunIf(skip_windows=True) -@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1", "WORLD_SIZE": "2"}, clear=True) -@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) @mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) +@mock.patch("lightning_lite.utilities.device_parser._get_all_available_mps_gpus", return_value=list(range(2))) def test_torch_distributed_backend_invalid(_, __, tmpdir): """This test set `undefined` as torch backend and should raise an `Backend.UNDEFINED` ValueError.""" model = BoringModel() diff --git a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py index 3c0968fe68c0f..626b5bbcf95bf 100644 --- a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py @@ -214,7 +214,8 @@ def test_dist_backend_accelerator_mapping(*_): @mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) -def test_ipython_incompatible_backend_error(_, monkeypatch): +@mock.patch("lightning_lite.utilities.device_parser._get_all_available_mps_gpus", return_value=[0, 1]) +def test_ipython_incompatible_backend_error(_, __, monkeypatch): monkeypatch.setattr(pytorch_lightning.utilities, "_IS_INTERACTIVE", True) with pytest.raises(MisconfigurationException, match=r"strategy='ddp'\)`.*is not compatible"): Trainer(strategy="ddp", accelerator="gpu", devices=2) @@ -252,6 +253,7 @@ def test_ipython_compatible_strategy_ddp_fork(monkeypatch): assert trainer.strategy.launcher.is_interactive_compatible +@RunIf(mps=False) @pytest.mark.parametrize( ["strategy", "strategy_class"], [ @@ -462,7 +464,7 @@ def test_strategy_choice_ddp_fork_cpu(): @mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) @mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) def test_strategy_choice_ddp(*_): - trainer = 
Trainer(fast_dev_run=True, strategy="ddp", accelerator="gpu", devices=1) + trainer = Trainer(fast_dev_run=True, strategy="ddp", accelerator="cuda", devices=1) assert isinstance(trainer.accelerator, CUDAAccelerator) assert isinstance(trainer.strategy, DDPStrategy) assert isinstance(trainer.strategy.cluster_environment, LightningEnvironment) @@ -471,8 +473,8 @@ def test_strategy_choice_ddp(*_): @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}) @mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) @mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) -def test_strategy_choice_ddp_spawn(cuda_available_mock, device_count_mock): - trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", accelerator="gpu", devices=1) +def test_strategy_choice_ddp_spawn(*_): + trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", accelerator="cuda", devices=1) assert isinstance(trainer.accelerator, CUDAAccelerator) assert isinstance(trainer.strategy, DDPSpawnStrategy) assert isinstance(trainer.strategy.cluster_environment, LightningEnvironment) @@ -515,13 +517,10 @@ def test_strategy_choice_ddp_slurm(_, __, strategy, job_name, expected_env): "TORCHELASTIC_RUN_ID": "1", }, ) -@mock.patch("torch.cuda.set_device") @mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) @mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) -@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True) -@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) def test_strategy_choice_ddp_te(*_): - trainer = Trainer(fast_dev_run=True, strategy="ddp", accelerator="gpu", devices=2) + trainer = Trainer(fast_dev_run=True, strategy="ddp", accelerator="cuda", devices=2) assert isinstance(trainer.accelerator, CUDAAccelerator) assert isinstance(trainer.strategy, DDPStrategy) assert isinstance(trainer.strategy.cluster_environment, TorchElasticEnvironment) @@ -562,12 +561,10 @@ def test_strategy_choice_ddp_cpu_te(*_): "RANK": "1", }, ) -@mock.patch("torch.cuda.set_device") @mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1) @mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) -@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True) def test_strategy_choice_ddp_kubeflow(*_): - trainer = Trainer(fast_dev_run=True, strategy="ddp", accelerator="gpu", devices=1) + trainer = Trainer(fast_dev_run=True, strategy="ddp", accelerator="cuda", devices=1) assert isinstance(trainer.accelerator, CUDAAccelerator) assert isinstance(trainer.strategy, DDPStrategy) assert isinstance(trainer.strategy.cluster_environment, KubeflowEnvironment) @@ -780,10 +777,10 @@ def test_gpu_accelerator_backend_choice(expected_accelerator_flag, expected_acce assert isinstance(trainer.accelerator, expected_accelerator_class) +@RunIf(mps=False) @mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1) def test_gpu_accelerator_backend_choice_cuda(_): trainer = Trainer(accelerator="gpu") - assert trainer._accelerator_connector._accelerator_flag == "cuda" assert isinstance(trainer.accelerator, CUDAAccelerator) diff --git a/tests/tests_pytorch/trainer/properties/test_auto_gpu_select.py b/tests/tests_pytorch/trainer/properties/test_auto_gpu_select.py index 05ee9d2ab3170..37f54bb84b44f 100644 --- 
a/tests/tests_pytorch/trainer/properties/test_auto_gpu_select.py +++ b/tests/tests_pytorch/trainer/properties/test_auto_gpu_select.py @@ -48,6 +48,7 @@ def test_pick_multiple_gpus_more_than_available(*_): pick_multiple_gpus(3) +@RunIf(mps=False) @mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) @mock.patch("pytorch_lightning.trainer.connectors.accelerator_connector.pick_multiple_gpus", return_value=[1]) def test_auto_select_gpus(*_): diff --git a/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py b/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py index 0cd31008ea8ee..9089a4f76f7a6 100644 --- a/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py +++ b/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py @@ -113,6 +113,7 @@ def test_num_stepping_batches_accumulate_gradients(accumulate_grad_batches, expe assert trainer.estimated_stepping_batches == expected_steps +@RunIf(mps=False) @pytest.mark.parametrize( ["trainer_kwargs", "estimated_steps"], [ diff --git a/tests/tests_pytorch/trainer/test_supporters.py b/tests/tests_pytorch/trainer/test_supporters.py index d9beabda43dd9..9970eb20da374 100644 --- a/tests/tests_pytorch/trainer/test_supporters.py +++ b/tests/tests_pytorch/trainer/test_supporters.py @@ -316,11 +316,10 @@ def test_nested_calc_num_data(input_data, compute_func, expected_length): @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}) @mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) @mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("lightning_lite.utilities.device_parser._get_all_available_mps_gpus", return_value=[0, 1]) @pytest.mark.parametrize("use_fault_tolerant", [False, True]) @pytest.mark.parametrize("replace_sampler_ddp", [False, True]) -def test_combined_data_loader_validation_test( - cuda_available_mock, device_count_mock, use_fault_tolerant, replace_sampler_ddp, tmpdir -): +def test_combined_data_loader_validation_test(_, __, ___, use_fault_tolerant, replace_sampler_ddp, tmpdir): """This test makes sure distributed sampler has been properly injected in dataloaders when using CombinedLoader.""" diff --git a/tests/tests_pytorch/trainer/test_trainer.py b/tests/tests_pytorch/trainer/test_trainer.py index da6aedebbedf6..0d3764cdcc5d2 100644 --- a/tests/tests_pytorch/trainer/test_trainer.py +++ b/tests/tests_pytorch/trainer/test_trainer.py @@ -39,7 +39,7 @@ from lightning_lite.utilities.cloud_io import load as pl_load from lightning_lite.utilities.seed import seed_everything from pytorch_lightning import Callback, LightningDataModule, LightningModule, Trainer -from pytorch_lightning.accelerators import CPUAccelerator, CUDAAccelerator +from pytorch_lightning.accelerators import CPUAccelerator, CUDAAccelerator, MPSAccelerator from pytorch_lightning.callbacks import EarlyStopping, GradientAccumulationScheduler, ModelCheckpoint, Timer from pytorch_lightning.callbacks.fault_tolerance import _FaultToleranceCheckpoint from pytorch_lightning.callbacks.prediction_writer import BasePredictionWriter @@ -1991,24 +1991,24 @@ def training_step(self, batch, batch_idx): ({"strategy": "ddp"}, DDPStrategy, "ddp", CPUAccelerator, 1), ({"strategy": "ddp", "num_nodes": 2}, DDPStrategy, "ddp", CPUAccelerator, 1), ( - {"strategy": None, "accelerator": "gpu", "devices": 1}, + {"strategy": None, "accelerator": "cuda", "devices": 1}, SingleDeviceStrategy, "single_device", 
CUDAAccelerator, 1, ), - ({"strategy": "dp", "accelerator": "gpu", "devices": 1}, DataParallelStrategy, "dp", CUDAAccelerator, 1), - ({"strategy": "ddp", "accelerator": "gpu", "devices": 1}, DDPStrategy, "ddp", CUDAAccelerator, 1), + ({"strategy": "dp", "accelerator": "cuda", "devices": 1}, DataParallelStrategy, "dp", CUDAAccelerator, 1), + ({"strategy": "ddp", "accelerator": "cuda", "devices": 1}, DDPStrategy, "ddp", CUDAAccelerator, 1), ( - {"strategy": "ddp_spawn", "accelerator": "gpu", "devices": 1}, + {"strategy": "ddp_spawn", "accelerator": "cuda", "devices": 1}, DDPSpawnStrategy, "ddp_spawn", CUDAAccelerator, 1, ), - ({"strategy": None, "accelerator": "gpu", "devices": 2}, DDPSpawnStrategy, "ddp_spawn", CUDAAccelerator, 2), - ({"strategy": "dp", "accelerator": "gpu", "devices": 2}, DataParallelStrategy, "dp", CUDAAccelerator, 2), - ({"strategy": "ddp", "accelerator": "gpu", "devices": 2}, DDPStrategy, "ddp", CUDAAccelerator, 2), + ({"strategy": None, "accelerator": "cuda", "devices": 2}, DDPSpawnStrategy, "ddp_spawn", CUDAAccelerator, 2), + ({"strategy": "dp", "accelerator": "cuda", "devices": 2}, DataParallelStrategy, "dp", CUDAAccelerator, 2), + ({"strategy": "ddp", "accelerator": "cuda", "devices": 2}, DDPStrategy, "ddp", CUDAAccelerator, 2), ({"strategy": "ddp", "accelerator": "cpu", "devices": 2}, DDPStrategy, "ddp", CPUAccelerator, 2), ( {"strategy": "ddp_spawn", "accelerator": "cpu", "devices": 2}, @@ -2025,7 +2025,7 @@ def training_step(self, batch, batch_idx): 1, ), ( - {"strategy": "ddp_fully_sharded", "accelerator": "gpu", "devices": 1}, + {"strategy": "ddp_fully_sharded", "accelerator": "cuda", "devices": 1}, DDPFullyShardedStrategy, "ddp_fully_sharded", CUDAAccelerator, @@ -2039,65 +2039,65 @@ def training_step(self, batch, batch_idx): 2, ), ( - {"strategy": DDPSpawnStrategy(), "accelerator": "gpu", "devices": 2}, + {"strategy": DDPSpawnStrategy(), "accelerator": "cuda", "devices": 2}, DDPSpawnStrategy, "ddp_spawn", CUDAAccelerator, 2, ), ({"strategy": DDPStrategy()}, DDPStrategy, "ddp", CPUAccelerator, 1), - ({"strategy": DDPStrategy(), "accelerator": "gpu", "devices": 2}, DDPStrategy, "ddp", CUDAAccelerator, 2), + ({"strategy": DDPStrategy(), "accelerator": "cuda", "devices": 2}, DDPStrategy, "ddp", CUDAAccelerator, 2), ( - {"strategy": DataParallelStrategy(), "accelerator": "gpu", "devices": 2}, + {"strategy": DataParallelStrategy(), "accelerator": "cuda", "devices": 2}, DataParallelStrategy, "dp", CUDAAccelerator, 2, ), ( - {"strategy": DDPFullyShardedStrategy(), "accelerator": "gpu", "devices": 2}, + {"strategy": DDPFullyShardedStrategy(), "accelerator": "cuda", "devices": 2}, DDPFullyShardedStrategy, "ddp_fully_sharded", CUDAAccelerator, 2, ), ( - {"strategy": DDPSpawnShardedStrategy(), "accelerator": "gpu", "devices": 2}, + {"strategy": DDPSpawnShardedStrategy(), "accelerator": "cuda", "devices": 2}, DDPSpawnShardedStrategy, "ddp_sharded_spawn", CUDAAccelerator, 2, ), ( - {"strategy": DDPShardedStrategy(), "accelerator": "gpu", "devices": 2}, + {"strategy": DDPShardedStrategy(), "accelerator": "cuda", "devices": 2}, DDPShardedStrategy, "ddp_sharded", CUDAAccelerator, 2, ), ( - {"strategy": "ddp_spawn", "accelerator": "gpu", "devices": 2, "num_nodes": 2}, + {"strategy": "ddp_spawn", "accelerator": "cuda", "devices": 2, "num_nodes": 2}, DDPSpawnStrategy, "ddp_spawn", CUDAAccelerator, 2, ), ( - {"strategy": "ddp_fully_sharded", "accelerator": "gpu", "devices": 1, "num_nodes": 2}, + {"strategy": "ddp_fully_sharded", "accelerator": "cuda", "devices": 1, "num_nodes": 
2}, DDPFullyShardedStrategy, "ddp_fully_sharded", CUDAAccelerator, 1, ), ( - {"strategy": "ddp_sharded", "accelerator": "gpu", "devices": 2, "num_nodes": 2}, + {"strategy": "ddp_sharded", "accelerator": "cuda", "devices": 2, "num_nodes": 2}, DDPShardedStrategy, "ddp_sharded", CUDAAccelerator, 2, ), ( - {"strategy": "ddp_sharded_spawn", "accelerator": "gpu", "devices": 2, "num_nodes": 2}, + {"strategy": "ddp_sharded_spawn", "accelerator": "cuda", "devices": 2, "num_nodes": 2}, DDPSpawnShardedStrategy, "ddp_sharded_spawn", CUDAAccelerator, @@ -2106,7 +2106,7 @@ def training_step(self, batch, batch_idx): ], ) def test_trainer_config_strategy(monkeypatch, trainer_kwargs, strategy_cls, strategy_name, accelerator_cls, devices): - if trainer_kwargs.get("accelerator") == "gpu": + if trainer_kwargs.get("accelerator") == "cuda": monkeypatch.setattr(device_parser, "is_cuda_available", lambda: True) monkeypatch.setattr(device_parser, "num_cuda_devices", lambda: trainer_kwargs["devices"]) @@ -2162,20 +2162,25 @@ def test_dataloaders_are_not_loaded_if_disabled_through_limit_batches(running_st ({"devices": "1"}, [0]), ({"devices": 2}, [0, 1]), ({"accelerator": "gpu", "devices": 1}, [0]), - ({"accelerator": "gpu", "devices": 2}, [0, 1]), - ({"accelerator": "gpu", "devices": "2"}, [0, 1]), - ({"accelerator": "gpu", "devices": [2]}, [2]), - ({"accelerator": "gpu", "devices": "2,"}, [2]), - ({"accelerator": "gpu", "devices": [0, 2]}, [0, 2]), - ({"accelerator": "gpu", "devices": "0, 2"}, [0, 2]), + ({"accelerator": "cuda", "devices": 1}, [0]), + ({"accelerator": "cuda", "devices": 2}, [0, 1]), + ({"accelerator": "cuda", "devices": "2"}, [0, 1]), + ({"accelerator": "cuda", "devices": [2]}, [2]), + ({"accelerator": "cuda", "devices": "2,"}, [2]), + ({"accelerator": "cuda", "devices": [0, 2]}, [0, 2]), + ({"accelerator": "cuda", "devices": "0, 2"}, [0, 2]), ({"accelerator": "ipu", "devices": 1}, [0]), ({"accelerator": "ipu", "devices": 2}, [0, 1]), + pytest.param({"accelerator": "mps", "devices": 1}, [0], marks=RunIf(min_torch="1.12")), ], ) def test_trainer_config_device_ids(monkeypatch, trainer_kwargs, expected_device_ids): - if trainer_kwargs.get("accelerator") == "gpu": + if trainer_kwargs.get("accelerator") in ("cuda", "gpu"): monkeypatch.setattr(device_parser, "is_cuda_available", lambda: True) monkeypatch.setattr(device_parser, "num_cuda_devices", lambda: 4) + elif trainer_kwargs.get("accelerator") in ("mps", "gpu"): + monkeypatch.setattr(device_parser, "_get_all_available_mps_gpus", lambda: [0]) + monkeypatch.setattr(MPSAccelerator, "is_available", lambda *_: True) elif trainer_kwargs.get("accelerator") == "ipu": monkeypatch.setattr(pytorch_lightning.accelerators.ipu.IPUAccelerator, "is_available", lambda _: True) monkeypatch.setattr(pytorch_lightning.strategies.ipu, "_IPU_AVAILABLE", lambda: True) From 51ab51a731fb56010113df5a304eaf28dc3a35f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sat, 17 Sep 2022 08:25:54 +0200 Subject: [PATCH 185/193] User-friendly exception if root flow does not override the `run()` method (#14760) --- src/lightning_app/CHANGELOG.md | 4 ++++ src/lightning_app/core/app.py | 7 ++++--- src/lightning_app/utilities/component.py | 11 ++++++++++ tests/tests_app/core/test_lightning_app.py | 24 ++++++++++++++++++++++ 4 files changed, 43 insertions(+), 3 deletions(-) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 5190f7c4f22ae..d56ded3eaf0a5 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ 
-22,6 +22,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Improve Lightning App connect logic by disconnecting automatically ([#14532](https://github.com/Lightning-AI/lightning/pull/14532))
+- Improved the error message when the root `LightningFlow` passed to `LightningApp` is missing the `run` method ([#14760](https://github.com/Lightning-AI/lightning/pull/14760))
+
+
+
### Fixed
- Making threadpool non default from LightningCloud client ([#14757](https://github.com/Lightning-AI/lightning/pull/14757))
diff --git a/src/lightning_app/core/app.py b/src/lightning_app/core/app.py
index 5fb693e0fe9b5..932708fd07342 100644
--- a/src/lightning_app/core/app.py
+++ b/src/lightning_app/core/app.py
@@ -26,7 +26,7 @@ from lightning_app.storage.path import storage_root_dir
from lightning_app.utilities.app_helpers import _delta_to_app_state_delta, _LightningAppRef, Logger
from lightning_app.utilities.commands.base import _process_requests
-from lightning_app.utilities.component import _convert_paths_after_init
+from lightning_app.utilities.component import _convert_paths_after_init, _validate_root_flow
from lightning_app.utilities.enum import AppStage, CacheCallsKeys
from lightning_app.utilities.exceptions import CacheMissException, ExitAppException
from lightning_app.utilities.layout import _collect_layout
@@ -58,8 +58,8 @@ def __init__(
the :class:`~lightning.app.core.flow.LightningFlow` provided.
Arguments:
- root: The root LightningFlow component, that defined all
- the app's nested components, running infinitely.
+ root: The root LightningFlow component, that defines all the app's nested components, running infinitely.
+ It must define a `run()` method that the app can call.
debug: Whether to activate the Lightning Logger debug mode.
This can be helpful when reporting bugs on Lightning repo.
@@ -77,6 +77,7 @@ def __init__(
Hello World!
"""
+ _validate_root_flow(root)
self._root = root
# queues definition.
diff --git a/src/lightning_app/utilities/component.py b/src/lightning_app/utilities/component.py
index 1ca2b72d85aad..f7fd0ff850665 100644
--- a/src/lightning_app/utilities/component.py
+++ b/src/lightning_app/utilities/component.py
@@ -5,6 +5,7 @@ from deepdiff.helper import NotPresent
from lightning_utilities.core.apply_func import apply_to_collection
+from lightning_app.utilities.app_helpers import is_overridden
from lightning_app.utilities.enum import ComponentContext
from lightning_app.utilities.tree import breadth_first
@@ -118,3 +119,13 @@ def _context(ctx: str) -> Generator[None, None, None]:
_set_context(ctx)
yield
_set_context(prev)
+
+
+def _validate_root_flow(flow: "LightningFlow") -> None:
+ from lightning_app.core.flow import LightningFlow
+
+ if not is_overridden("run", instance=flow, parent=LightningFlow):
+ raise TypeError(
+ "The root flow passed to `LightningApp` does not override the `run()` method. This is required. Please"
+ f" implement `run()` in your `{flow.__class__.__name__}` class."
+ ) diff --git a/tests/tests_app/core/test_lightning_app.py b/tests/tests_app/core/test_lightning_app.py index 55af3a1aeb1b3..93c475700d746 100644 --- a/tests/tests_app/core/test_lightning_app.py +++ b/tests/tests_app/core/test_lightning_app.py @@ -1,6 +1,7 @@ import logging import os import pickle +from re import escape from time import sleep from unittest import mock from unittest.mock import ANY @@ -32,6 +33,29 @@ logger = logging.getLogger() +def test_lightning_app_requires_root_run_method(): + """Test that a useful exception is raised if the root flow does not override the run method.""" + + with pytest.raises( + TypeError, match=escape("The root flow passed to `LightningApp` does not override the `run()` method") + ): + LightningApp(LightningFlow()) + + class FlowWithoutRun(LightningFlow): + pass + + with pytest.raises( + TypeError, match=escape("The root flow passed to `LightningApp` does not override the `run()` method") + ): + LightningApp(FlowWithoutRun()) + + class FlowWithRun(LightningFlow): + def run(self): + pass + + LightningApp(FlowWithRun()) # no error + + class B1(LightningFlow): def __init__(self): super().__init__() From 8f14184180f3a14b052ff42e2280ec2851a1239f Mon Sep 17 00:00:00 2001 From: Pranjal Datta Date: Sat, 17 Sep 2022 12:36:45 +0530 Subject: [PATCH 186/193] Fix property setter override by default setter (#14259) --- src/lightning_app/CHANGELOG.md | 12 ++++++- src/lightning_app/core/work.py | 14 ++++++-- tests/tests_app/core/test_lightning_work.py | 36 ++++++++++++++++++++- 3 files changed, 57 insertions(+), 5 deletions(-) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index d56ded3eaf0a5..b287ce67cbcf9 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -10,12 +10,16 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added - Add `load_state_dict` and `state_dict` ([#14100](https://github.com/Lightning-AI/lightning/pull/14100)) + + - Add `--secret` option to CLI to allow binding Secrets to app environment variables when running in the cloud ([#14612](https://github.com/Lightning-AI/lightning/pull/14612)) ### Changed - Application storage prefix moved from `app_id` to `project_id/app_id` ([#14583](https://github.com/Lightning-AI/lightning/pull/14583)) + + - LightningCloud client calls to use key word arguments instead of positional arguments ([#14685](https://github.com/Lightning-AI/lightning/pull/14685) @@ -30,17 +34,20 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Making threadpool non default from LightningCloud client ([#14757](https://github.com/Lightning-AI/lightning/pull/14757))
+
- Resolved a bug where the state change detection using DeepDiff won't worked with Path, Drive objects ([#14465](https://github.com/Lightning-AI/lightning/pull/14465))
+
- Resolved a bug where the wrong client was passed to collect cloud logs ([#14684](https://github.com/Lightning-AI/lightning/pull/14684))
+
- Resolved the memory leak issue with Lightning Cloud package and bumped the requirements to use the latest version ([#14697](https://github.com/Lightning-AI/lightning/pull/14697)
- Unification of app template: moved `app.py` to root dir for `lightning init app ` template ([#13853](https://github.com/Lightning-AI/lightning/pull/13853))
-- Fixing 5000 log line limitation for Lightning AI BYOC cluster logs ([#14458](https://github.com/Lightning-AI/lightning/pull/14458))
+- Fixing 5000 log line limitation for Lightning AI BYOC cluster logs ([#14458](https://github.com/Lightning-AI/lightning/pull/14458))
- Fixed a bug where the uploaded command file wasn't properly parsed ([#14532](https://github.com/Lightning-AI/lightning/pull/14532))
@@ -49,6 +56,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Resolved `LightningApp(..., debug=True)` ([#14464](https://github.com/Lightning-AI/lightning/pull/14464))
+- Fixed an issue where custom property setters were not being used in the `LightningWork` class. ([#14259](https://github.com/Lightning-AI/lightning/pull/14259))
+
+
## [0.6.0] - 2022-09-08
### Added
diff --git a/src/lightning_app/core/work.py b/src/lightning_app/core/work.py
index 054fe44cfcb89..6c8b09875e9f6 100644
--- a/src/lightning_app/core/work.py
+++ b/src/lightning_app/core/work.py
@@ -3,7 +3,7 @@ import warnings
from copy import deepcopy
from functools import partial, wraps
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional, Union
from deepdiff import DeepHash
@@ -316,9 +316,17 @@ def num_successes(self) -> int:
return has_succeeded_counter
+ def _get_property_if_exists(self, name: str) -> Union[property, None]:
+ attr = getattr(self.__class__, name, None)
+ return attr if isinstance(attr, property) else None
+
def __setattr__(self, name: str, value: Any) -> None:
- setattr_fn = getattr(self, "_setattr_replacement", None) or self._default_setattr
- setattr_fn(name, value)
+ property_object = self._get_property_if_exists(name)
+ if property_object is not None and property_object.fset is not None:
+ property_object.fset(self, value)
+ else:
+ setattr_fn = getattr(self, "_setattr_replacement", None) or self._default_setattr
+ setattr_fn(name, value)
def _default_setattr(self, name: str, value: Any) -> None:
from lightning_app.core.flow import LightningFlow
diff --git a/tests/tests_app/core/test_lightning_work.py b/tests/tests_app/core/test_lightning_work.py
index 14d8d26a458a6..fd936c802e71d 100644
--- a/tests/tests_app/core/test_lightning_work.py
+++ b/tests/tests_app/core/test_lightning_work.py
@@ -5,7 +5,7 @@ from lightning_app import LightningApp
from lightning_app.core.flow import LightningFlow
-from lightning_app.core.work import LightningWork, LightningWorkException
+from lightning_app.core.work import BuildConfig, LightningWork, LightningWorkException
from lightning_app.runners import MultiProcessRuntime
from lightning_app.storage import Path
from lightning_app.testing.helpers import EmptyFlow, EmptyWork, MockQueue
@@ -280,3 +280,37 @@ def run(self, *args,
**kwargs):
w.run(1, [2], (3, 4), {"1": "3"})
assert len(w._calls) == 2
assert w._calls["0d824f7"] == {"ret": None}
+
+
+def test_work_cloud_build_config_provided():
+
+ assert isinstance(LightningWork.cloud_build_config, property)
+ assert LightningWork.cloud_build_config.fset is not None
+
+ class Work(LightningWork):
+ def __init__(self):
+ super().__init__()
+ self.cloud_build_config = BuildConfig(image="ghcr.io/gridai/base-images:v1.8-cpu")
+
+ def run(self, *args, **kwargs):
+ pass
+
+ w = Work()
+ w.run()
+
+
+def test_work_local_build_config_provided():
+
+ assert isinstance(LightningWork.local_build_config, property)
+ assert LightningWork.local_build_config.fset is not None
+
+ class Work(LightningWork):
+ def __init__(self):
+ super().__init__()
+ self.local_build_config = BuildConfig(image="ghcr.io/gridai/base-images:v1.8-cpu")
+
+ def run(self, *args, **kwargs):
+ pass
+
+ w = Work()
+ w.run()
From e872b274cafa8dc21642fc93b5eb91887f62192f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Sat, 17 Sep 2022 21:45:07 +0200
Subject: [PATCH 187/193] User-friendly exception when `LightningWork.run()` method is missing (#14759)
---
src/lightning_app/CHANGELOG.md | 4 +++
src/lightning_app/core/work.py | 15 +++---
tests/tests_app/core/test_lightning_work.py | 39 ++++++++++++++-------
3 files changed, 40 insertions(+), 18 deletions(-)
diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md
index b287ce67cbcf9..45cfa83166ea4 100644
--- a/src/lightning_app/CHANGELOG.md
+++ b/src/lightning_app/CHANGELOG.md
@@ -26,10 +26,14 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Improve Lightning App connect logic by disconnecting automatically ([#14532](https://github.com/Lightning-AI/lightning/pull/14532))
+- Improved the error message when the `LightningWork` is missing the `run` method ([#14759](https://github.com/Lightning-AI/lightning/pull/14759))
+
+
- Improved the error message when the root `LightningFlow` passed to `LightningApp` is missing the `run` method ([#14760](https://github.com/Lightning-AI/lightning/pull/14760))
+
### Fixed
- Making threadpool non default from LightningCloud client ([#14757](https://github.com/Lightning-AI/lightning/pull/14757))
diff --git a/src/lightning_app/core/work.py b/src/lightning_app/core/work.py
index 6c8b09875e9f6..6bc299a8b4149 100644
--- a/src/lightning_app/core/work.py
+++ b/src/lightning_app/core/work.py
@@ -1,4 +1,3 @@
-import abc
import time
import warnings
from copy import deepcopy
@@ -11,7 +10,7 @@ from lightning_app.storage import Path
from lightning_app.storage.drive import _maybe_create_drive, Drive
from lightning_app.storage.payload import Payload
-from lightning_app.utilities.app_helpers import _is_json_serializable, _LightningAppRef
+from lightning_app.utilities.app_helpers import _is_json_serializable, _LightningAppRef, is_overridden
from lightning_app.utilities.component import _is_flow_context, _sanitize_state
from lightning_app.utilities.enum import (
CacheCallsKeys,
@@ -29,7 +28,7 @@ from lightning_app.utilities.proxies import LightningWorkSetAttrProxy, ProxyWorkRun, unwrap
-class LightningWork(abc.ABC):
+class LightningWork:
_INTERNAL_STATE_VARS = (
# Internal protected variables that are still part of the state (even though they are prefixed with "_")
@@ -139,6 +138,7 @@ def __init__(
self._cloud_build_config = cloud_build_config or BuildConfig()
self._cloud_compute = cloud_compute or CloudCompute()
self._backend: Optional[Backend] = None
+
self._check_run_is_implemented() self._on_init_end() @property @@ -524,14 +524,12 @@ def _cleanup_calls(calls: Dict[str, Any]): final_statuses.append(status) calls[call_hash]["statuses"] = final_statuses - @abc.abstractmethod def run(self, *args, **kwargs): """Override to add your own logic. Raises: LightningPlatformException: If resource exceeds platform quotas or other constraints. """ - pass def on_exception(self, exception: BaseException): """Override to customize how to handle exception in the run method.""" @@ -570,3 +568,10 @@ def stop(self): self._calls[latest_hash]["statuses"].append(stop_status) app = _LightningAppRef().get_current() self._backend.stop_work(app, self) + + def _check_run_is_implemented(self) -> None: + if not is_overridden("run", instance=self, parent=LightningWork): + raise TypeError( + f"The work `{self.__class__.__name__}` is missing the `run()` method. This is required. Implement it" + " first and then call it in your Flow." + ) diff --git a/tests/tests_app/core/test_lightning_work.py b/tests/tests_app/core/test_lightning_work.py index fd936c802e71d..e0619420f4ed3 100644 --- a/tests/tests_app/core/test_lightning_work.py +++ b/tests/tests_app/core/test_lightning_work.py @@ -1,4 +1,5 @@ from queue import Empty +from re import escape from unittest.mock import Mock import pytest @@ -13,35 +14,47 @@ from lightning_app.utilities.proxies import ProxyWorkRun, WorkRunner -def test_simple_lightning_work(): - class Work_A(LightningWork): +def test_lightning_work_run_method_required(): + """Test that a helpful exception is raised when the user did not implement the `LightningWork.run()` method.""" + + with pytest.raises(TypeError, match=escape("The work `LightningWork` is missing the `run()` method")): + LightningWork() + + class WorkWithoutRun(LightningWork): def __init__(self): super().__init__() self.started = False - with pytest.raises(TypeError, match="Work_A"): - Work_A() + with pytest.raises(TypeError, match=escape("The work `WorkWithoutRun` is missing the `run()` method")): + WorkWithoutRun() - class Work_B(Work_A): + class WorkWithRun(WorkWithoutRun): def run(self, *args, **kwargs): self.started = True - work_b = Work_B() - work_b.run() - assert work_b.started + work = WorkWithRun() + work.run() + assert work.started + + +def test_lightning_work_no_children_allowed(): + """Test that a LightningWork can't have any children (work or flow).""" + + class ChildWork(EmptyWork): + pass - class Work_C(LightningWork): + class ParentWork(LightningWork): def __init__(self): super().__init__() - self.work_b = Work_B() + self.work_b = ChildWork() def run(self, *args, **kwargs): pass with pytest.raises(LightningWorkException, match="isn't allowed to take any children such as"): - Work_C() + ParentWork() - class Work_C(LightningWork): + class ParentWork(LightningWork): def __init__(self): super().__init__() self.flow = LightningFlow() @@ -50,7 +63,7 @@ def run(self, *args, **kwargs): pass with pytest.raises(LightningWorkException, match="LightningFlow"): - Work_C() + ParentWork() def test_forgot_to_call_init(): From 1092265140bf5333bb647afda851f040b7d62b23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 18 Sep 2022 20:01:49 +0200 Subject: [PATCH 188/193] Remove check `num_slurm_tasks` in Lite (#14761) --- src/lightning_lite/connector.py | 26 ++++++----------- tests/tests_lite/test_connector.py | 45 +++++++++++++++--------------- 2 files changed, 31 insertions(+), 40 deletions(-) diff --git a/src/lightning_lite/connector.py 
b/src/lightning_lite/connector.py index 80a932eb529cf..4b91c1f328798 100644 --- a/src/lightning_lite/connector.py +++ b/src/lightning_lite/connector.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os from collections import Counter from typing import Dict, List, Optional, Union @@ -406,25 +405,16 @@ def _set_devices_flag_if_auto_passed(self) -> None: def _choose_and_init_cluster_environment(self) -> ClusterEnvironment: if isinstance(self._cluster_environment_flag, ClusterEnvironment): return self._cluster_environment_flag - if self._is_slurm_managing_tasks(): - rank_zero_info("Multiprocessing is handled by SLURM.") - return SLURMEnvironment() - for env_type in (TorchElasticEnvironment, KubeflowEnvironment, LSFEnvironment): + for env_type in ( + SLURMEnvironment, + TorchElasticEnvironment, + KubeflowEnvironment, + LSFEnvironment, + ): if env_type.detect(): - # Ignore type error because it is a false positive: https://github.com/python/mypy/issues/13044 - return env_type() # type: ignore[abstract] + return env_type() return LightningEnvironment() - def _is_slurm_managing_tasks(self) -> bool: - """used by choosing cluster enviroment.""" - # TODO(lite): Remove this, see: https://github.com/Lightning-AI/lightning/pull/14300 - if not SLURMEnvironment.detect() or SLURMEnvironment.job_name() == "bash": - return False - - total_requested_devices = len(self._parallel_devices) * self._num_nodes_flag - num_slurm_tasks = int(os.environ["SLURM_NTASKS"], 0) - return num_slurm_tasks == total_requested_devices - def _choose_strategy(self) -> Union[Strategy, str]: if self._accelerator_flag == "tpu": if self._parallel_devices and len(self._parallel_devices) > 1: @@ -459,7 +449,7 @@ def _check_strategy_and_fallback(self) -> None: strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and ( - TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or self._is_slurm_managing_tasks() + TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or SLURMEnvironment.detect() ): strategy_flag = "ddp" if strategy_flag == "dp" and self._accelerator_flag == "cpu": diff --git a/tests/tests_lite/test_connector.py b/tests/tests_lite/test_connector.py index 8f9f9984ef53b..87121b4888657 100644 --- a/tests/tests_lite/test_connector.py +++ b/tests/tests_lite/test_connector.py @@ -460,28 +460,29 @@ def test_strategy_choice_ddp_spawn(*_): assert isinstance(connector.strategy.cluster_environment, LightningEnvironment) -@RunIf(min_cuda_gpus=2) -@mock.patch.dict( - os.environ, - { - "CUDA_VISIBLE_DEVICES": "0,1", - "SLURM_NTASKS": "2", - "SLURM_JOB_NAME": "SOME_NAME", - "SLURM_NODEID": "0", - "SLURM_PROCID": "1", - "SLURM_LOCALID": "1", - }, -) -@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True) -@pytest.mark.parametrize("strategy", ["ddp", DDPStrategy()]) -def test_strategy_choice_ddp_slurm(_, strategy): - connector = _Connector(strategy=strategy, accelerator="gpu", devices=2) - assert connector._is_slurm_managing_tasks() - assert isinstance(connector.accelerator, CUDAAccelerator) - assert isinstance(connector.strategy, DDPStrategy) - assert isinstance(connector.strategy.cluster_environment, SLURMEnvironment) - assert connector.strategy.cluster_environment.local_rank() == 1 - assert connector.strategy.local_rank == 1 +@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", 
return_value=2) +@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) +@pytest.mark.parametrize("job_name,expected_env", [("some_name", SLURMEnvironment), ("bash", LightningEnvironment)]) +@pytest.mark.parametrize("strategy", ["ddp", DDPStrategy]) +def test_strategy_choice_ddp_slurm(_, __, strategy, job_name, expected_env): + if not isinstance(strategy, str): + strategy = strategy() + + with mock.patch.dict( + os.environ, + { + "CUDA_VISIBLE_DEVICES": "0,1", + "SLURM_NTASKS": "2", + "SLURM_JOB_NAME": job_name, + "SLURM_NODEID": "0", + "SLURM_PROCID": "1", + "SLURM_LOCALID": "1", + }, + ): + trainer = _Connector(strategy=strategy, accelerator="cuda", devices=2) + assert isinstance(trainer.accelerator, CUDAAccelerator) + assert isinstance(trainer.strategy, DDPStrategy) + assert isinstance(trainer.strategy.cluster_environment, expected_env) @mock.patch.dict( From 4f9c7793e754545366796e041fe431d1e9e646a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 18 Sep 2022 22:27:15 +0200 Subject: [PATCH 189/193] Fix TensorBoardLogger creating redundant experiment when finalizing (#14762) Co-authored-by: Kushashwa Ravi Shrimali --- src/pytorch_lightning/CHANGELOG.md | 10 +++++++--- src/pytorch_lightning/loggers/tensorboard.py | 5 +++-- .../checkpointing/test_model_checkpoint.py | 14 ++++++++++---- tests/tests_pytorch/loggers/test_tensorboard.py | 10 ++++++++++ 4 files changed, 30 insertions(+), 9 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 4ca202c34edd6..367d32210e2e4 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -7,9 +7,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ## [unReleased] - 2022-MM-DD -- Added an option to configure the signal SLURM sends when a job is preempted or requeued ([#14610](https://github.com/Lightning-AI/lightning/issues/14610)) - - ### Added @@ -40,6 +37,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added `WandbLogger.download_artifact` and `WandbLogger.use_artifact` for managing artifacts with Weights and Biases ([#14551](https://github.com/Lightning-AI/lightning/issues/14551)) +- Added an option to configure the signal SLURM sends when a job is preempted or requeued ([#14610](https://github.com/Lightning-AI/lightning/issues/14610)) + + ### Changed - The `Trainer.{fit,validate,test,predict,tune}` methods now raise a useful error message if the input is not a `LightningModule` ([#13892](https://github.com/Lightning-AI/lightning/pull/13892)) @@ -186,6 +186,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed torchscript error with ensembles of LightningModules ([#14657](https://github.com/Lightning-AI/lightning/pull/14657), [#14724](https://github.com/Lightning-AI/lightning/pull/14724)) +- Fixed an issue with `TensorBoardLogger.finalize` creating a new experiment when none was created during the Trainer's execution ([#14762](https://github.com/Lightning-AI/lightning/pull/14762)) + + + ## [1.7.6] - 2022-09-13 ### Changed diff --git a/src/pytorch_lightning/loggers/tensorboard.py b/src/pytorch_lightning/loggers/tensorboard.py index 7ff19b8c38c89..f27ae1c015b59 100644 --- a/src/pytorch_lightning/loggers/tensorboard.py +++ b/src/pytorch_lightning/loggers/tensorboard.py @@ -268,8 +268,9 @@ def save(self) -> None: @rank_zero_only def finalize(self, status: str) -> None: - self.experiment.flush() - self.experiment.close() + if self._experiment is not None: + self.experiment.flush() + self.experiment.close() self.save() @property diff --git a/tests/tests_pytorch/checkpointing/test_model_checkpoint.py b/tests/tests_pytorch/checkpointing/test_model_checkpoint.py index 6d44bfe83be3d..f8a172d27359f 100644 --- a/tests/tests_pytorch/checkpointing/test_model_checkpoint.py +++ b/tests/tests_pytorch/checkpointing/test_model_checkpoint.py @@ -869,18 +869,23 @@ def validation_step(self, batch, batch_idx): "limit_test_batches": 2, "enable_progress_bar": False, "enable_model_summary": False, + "log_every_n_steps": 1, + "default_root_dir": tmpdir, } trainer = Trainer(**trainer_kwargs, callbacks=[checkpoint_callback]) trainer.fit(model) - assert os.listdir(tmpdir) == ["epoch=00.ckpt"] + assert set(os.listdir(tmpdir)) == {"epoch=00.ckpt", "lightning_logs"} for idx in range(4): # load from checkpoint - trainer = pl.Trainer(**trainer_kwargs, default_root_dir=tmpdir) + trainer = Trainer(**trainer_kwargs) trainer.fit(model, ckpt_path=checkpoint_callback.best_model_path) trainer.test(ckpt_path=checkpoint_callback.best_model_path, verbose=False) + assert set(os.listdir(tmpdir)) == {"epoch=00.ckpt", "lightning_logs"} - assert set(os.listdir(tmpdir / "lightning_logs")) == {f"version_{i}" for i in range(4)} + + # no new versions created after the initial fit, because the ones that resume from ckpt do not log anything + assert set(os.listdir(tmpdir / "lightning_logs")) == {"version_0"} def test_checkpoint_repeated_strategy_extended(tmpdir): @@ -891,6 +896,7 @@ class ExtendedBoringModel(BoringModel): def validation_step(self, batch, batch_idx): output = self.layer(batch) loss = self.loss(batch, output) + self.log("val_loss", loss) return {"val_loss": loss} def validation_epoch_end(self, *_): @@ -930,7 +936,7 @@ def assert_checkpoint_log_dir(idx): limit_test_batches=4, callbacks=[checkpoint_cb], ) - trainer = pl.Trainer(**trainer_config) + trainer = Trainer(**trainer_config) assert_trainer_init(trainer) model = ExtendedBoringModel() diff --git a/tests/tests_pytorch/loggers/test_tensorboard.py b/tests/tests_pytorch/loggers/test_tensorboard.py index 3793b5c58b5b6..3b25c86b87c32 100644 --- a/tests/tests_pytorch/loggers/test_tensorboard.py +++ b/tests/tests_pytorch/loggers/test_tensorboard.py @@ -275,7 +275,17 @@ def training_step(self, *args): def test_tensorboard_finalize(summary_writer, tmpdir): """Test that the SummaryWriter closes in finalize.""" logger = TensorBoardLogger(save_dir=tmpdir) + assert logger._experiment is None logger.finalize("any") + + # no log calls, no experiment created -> nothing to flush + summary_writer.assert_not_called() + + logger = TensorBoardLogger(save_dir=tmpdir) + 
logger.log_metrics({"flush_me": 11.1}) # trigger creation of an experiment + logger.finalize("any") + + # finalize flushes to experiment directory summary_writer().flush.assert_called() summary_writer().close.assert_called() From e9c571d39fb80b10c6462703e7ba35052511d41c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Mon, 19 Sep 2022 00:48:45 +0200 Subject: [PATCH 190/193] Move accelerator-specific parsing functions with their accelerators (#14753) Co-authored-by: awaelchli --- src/lightning_lite/accelerators/cpu.py | 28 +++- src/lightning_lite/accelerators/cuda.py | 43 +++++- src/lightning_lite/accelerators/mps.py | 14 +- src/lightning_lite/accelerators/tpu.py | 57 +++++++- src/lightning_lite/connector.py | 5 +- src/lightning_lite/utilities/device_parser.py | 123 +----------------- src/pytorch_lightning/CHANGELOG.md | 10 ++ src/pytorch_lightning/accelerators/cpu.py | 2 +- src/pytorch_lightning/accelerators/cuda.py | 13 +- src/pytorch_lightning/accelerators/hpu.py | 24 +++- src/pytorch_lightning/accelerators/mps.py | 4 +- src/pytorch_lightning/accelerators/tpu.py | 4 +- src/pytorch_lightning/profilers/pytorch.py | 2 +- .../connectors/accelerator_connector.py | 5 +- .../tuner/auto_gpu_select.py | 6 +- .../utilities/device_parser.py | 41 ++---- tests/tests_lite/accelerators/test_cuda.py | 2 +- tests/tests_lite/test_connector.py | 52 ++++---- .../utilities/test_device_parser.py | 45 +++---- .../tests_pytorch/accelerators/test_common.py | 11 +- tests/tests_pytorch/accelerators/test_cpu.py | 8 -- tests/tests_pytorch/accelerators/test_gpu.py | 4 +- tests/tests_pytorch/accelerators/test_ipu.py | 4 + tests/tests_pytorch/conftest.py | 43 ++++++ .../deprecated_api/test_remove_1-10.py | 4 + .../deprecated_api/test_remove_1-8.py | 35 ++--- .../deprecated_api/test_remove_2-0.py | 4 +- tests/tests_pytorch/models/test_gpu.py | 51 ++------ .../tests_pytorch/plugins/test_amp_plugins.py | 18 ++- .../plugins/test_cluster_integration.py | 4 +- .../strategies/test_bagua_strategy.py | 7 +- tests/tests_pytorch/strategies/test_ddp.py | 10 +- ..._ddp_fully_sharded_with_full_state_dict.py | 4 +- .../strategies/test_deepspeed_strategy.py | 3 +- .../connectors/test_accelerator_connector.py | 93 +++++-------- .../trainer/flags/test_env_vars.py | 4 +- .../properties/test_auto_gpu_select.py | 7 +- .../test_estimated_stepping_batches.py | 5 +- .../trainer/test_config_validator.py | 7 +- .../tests_pytorch/trainer/test_supporters.py | 4 +- tests/tests_pytorch/trainer/test_trainer.py | 16 +-- .../tests_pytorch/trainer/test_trainer_cli.py | 7 +- .../utilities/test_device_parser.py | 31 ----- 43 files changed, 387 insertions(+), 477 deletions(-) delete mode 100644 tests/tests_pytorch/utilities/test_device_parser.py diff --git a/src/lightning_lite/accelerators/cpu.py b/src/lightning_lite/accelerators/cpu.py index 24b360179801b..11bb0328e5dfa 100644 --- a/src/lightning_lite/accelerators/cpu.py +++ b/src/lightning_lite/accelerators/cpu.py @@ -16,7 +16,6 @@ import torch from lightning_lite.accelerators.accelerator import Accelerator -from lightning_lite.utilities import device_parser class CPUAccelerator(Accelerator): @@ -37,13 +36,13 @@ def teardown(self) -> None: @staticmethod def parse_devices(devices: Union[int, str, List[int]]) -> int: """Accelerator device parsing logic.""" - devices = device_parser.parse_cpu_cores(devices) + devices = parse_cpu_cores(devices) return devices @staticmethod def get_parallel_devices(devices: Union[int, str, List[int]]) -> List[torch.device]: """Gets parallel devices for 
the Accelerator.""" - devices = device_parser.parse_cpu_cores(devices) + devices = parse_cpu_cores(devices) return [torch.device("cpu")] * devices @staticmethod @@ -63,3 +62,26 @@ def register_accelerators(cls, accelerator_registry: Dict) -> None: cls, description=cls.__class__.__name__, ) + + +def parse_cpu_cores(cpu_cores: Union[int, str, List[int]]) -> int: + """Parses the cpu_cores given in the format as accepted by the ``devices`` argument in the + :class:`~pytorch_lightning.trainer.Trainer`. + + Args: + cpu_cores: An int > 0. + + Returns: + An int representing the number of processes + + Raises: + MisconfigurationException: + If cpu_cores is not an int > 0 + """ + if isinstance(cpu_cores, str) and cpu_cores.strip().isdigit(): + cpu_cores = int(cpu_cores) + + if not isinstance(cpu_cores, int) or cpu_cores <= 0: + raise TypeError("`devices` selected with `CPUAccelerator` should be an int > 0.") + + return cpu_cores diff --git a/src/lightning_lite/accelerators/cuda.py b/src/lightning_lite/accelerators/cuda.py index 7e7947a361873..f2b412a9713a5 100644 --- a/src/lightning_lite/accelerators/cuda.py +++ b/src/lightning_lite/accelerators/cuda.py @@ -11,12 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import multiprocessing from typing import Dict, List, Optional, Union import torch from lightning_lite.accelerators.accelerator import Accelerator -from lightning_lite.utilities import device_parser +from lightning_lite.strategies.launchers.multiprocessing import _is_forking_disabled class CUDAAccelerator(Accelerator): @@ -39,7 +40,9 @@ def teardown(self) -> None: @staticmethod def parse_devices(devices: Union[int, str, List[int]]) -> Optional[List[int]]: """Accelerator device parsing logic.""" - return device_parser.parse_gpu_ids(devices, include_cuda=True) + from lightning_lite.utilities.device_parser import parse_gpu_ids + + return parse_gpu_ids(devices, include_cuda=True) @staticmethod def get_parallel_devices(devices: List[int]) -> List[torch.device]: @@ -49,11 +52,11 @@ def get_parallel_devices(devices: List[int]) -> List[torch.device]: @staticmethod def auto_device_count() -> int: """Get the devices when set to auto.""" - return device_parser.num_cuda_devices() + return num_cuda_devices() @staticmethod def is_available() -> bool: - return device_parser.num_cuda_devices() > 0 + return num_cuda_devices() > 0 @classmethod def register_accelerators(cls, accelerator_registry: Dict) -> None: @@ -62,3 +65,35 @@ def register_accelerators(cls, accelerator_registry: Dict) -> None: cls, description=cls.__class__.__name__, ) + + +def _get_all_available_cuda_gpus() -> List[int]: + """ + Returns: + A list of all available CUDA GPUs + """ + return list(range(num_cuda_devices())) + + +def num_cuda_devices() -> int: + """Returns the number of GPUs available. + + Unlike :func:`torch.cuda.device_count`, this function does its best not to create a CUDA context for fork support, + if the platform allows it. + """ + if "fork" not in torch.multiprocessing.get_all_start_methods() or _is_forking_disabled(): + return torch.cuda.device_count() + with multiprocessing.get_context("fork").Pool(1) as pool: + return pool.apply(torch.cuda.device_count) + + +def is_cuda_available() -> bool: + """Returns a bool indicating if CUDA is currently available. 
+ + Unlike :func:`torch.cuda.is_available`, this function does its best not to create a CUDA context for fork support, + if the platform allows it. + """ + if "fork" not in torch.multiprocessing.get_all_start_methods() or _is_forking_disabled(): + return torch.cuda.is_available() + with multiprocessing.get_context("fork").Pool(1) as pool: + return pool.apply(torch.cuda.is_available) diff --git a/src/lightning_lite/accelerators/mps.py b/src/lightning_lite/accelerators/mps.py index 694cb135ddead..95046c64d4d7c 100644 --- a/src/lightning_lite/accelerators/mps.py +++ b/src/lightning_lite/accelerators/mps.py @@ -18,7 +18,6 @@ import torch from lightning_lite.accelerators.accelerator import Accelerator -from lightning_lite.utilities import device_parser from lightning_lite.utilities.imports import _TORCH_GREATER_EQUAL_1_12 @@ -40,7 +39,9 @@ def teardown(self) -> None: @staticmethod def parse_devices(devices: Union[int, str, List[int]]) -> Optional[List[int]]: """Accelerator device parsing logic.""" - parsed_devices = device_parser.parse_gpu_ids(devices, include_mps=True) + from lightning_lite.utilities.device_parser import parse_gpu_ids + + parsed_devices = parse_gpu_ids(devices, include_mps=True) return parsed_devices @staticmethod @@ -48,7 +49,6 @@ def get_parallel_devices(devices: Union[int, str, List[int]]) -> List[torch.devi """Gets parallel devices for the Accelerator.""" parsed_devices = MPSAccelerator.parse_devices(devices) assert parsed_devices is not None - return [torch.device("mps", i) for i in range(len(parsed_devices))] @staticmethod @@ -72,3 +72,11 @@ def register_accelerators(cls, accelerator_registry: Dict) -> None: cls, description=cls.__class__.__name__, ) + + +def _get_all_available_mps_gpus() -> List[int]: + """ + Returns: + A list of all available MPS GPUs + """ + return [0] if MPSAccelerator.is_available() else [] diff --git a/src/lightning_lite/accelerators/tpu.py b/src/lightning_lite/accelerators/tpu.py index 4d124b25f291c..7a326e47596c3 100644 --- a/src/lightning_lite/accelerators/tpu.py +++ b/src/lightning_lite/accelerators/tpu.py @@ -11,12 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union import torch from lightning_lite.accelerators.accelerator import Accelerator -from lightning_lite.utilities import device_parser +from lightning_lite.utilities.device_parser import _check_data_type from lightning_lite.utilities.imports import _TPU_AVAILABLE @@ -32,7 +32,7 @@ def teardown(self) -> None: @staticmethod def parse_devices(devices: Union[int, str, List[int]]) -> Optional[Union[int, List[int]]]: """Accelerator device parsing logic.""" - return device_parser.parse_tpu_cores(devices) + return parse_tpu_cores(devices) @staticmethod def get_parallel_devices(devices: Union[int, List[int]]) -> List[int]: @@ -57,3 +57,54 @@ def register_accelerators(cls, accelerator_registry: Dict) -> None: cls, description=cls.__class__.__name__, ) + + +def parse_tpu_cores(tpu_cores: Optional[Union[int, str, List[int]]]) -> Optional[Union[int, List[int]]]: + """ + Parses the tpu_cores given in the format as accepted by the + :class:`~pytorch_lightning.trainer.Trainer`. 
+ + Args: + tpu_cores: An int of 1 or string '1' indicates that 1 core with multi-processing should be used + An int 8 or string '8' indicates that all 8 cores with multi-processing should be used + A list of ints or a string containing a list of comma-separated integers + indicates the specific TPU core to use. + + Returns: + A list of tpu_cores to be used or ``None`` if no TPU cores were requested + + Raises: + TypeError: + If TPU cores aren't 1, 8 or [<1-8>] + """ + _check_data_type(tpu_cores) + + if isinstance(tpu_cores, str): + tpu_cores = _parse_tpu_cores_str(tpu_cores.strip()) + + if not _tpu_cores_valid(tpu_cores): + raise TypeError("`tpu_cores` can only be 1, 8 or [<1-8>]") + + return tpu_cores + + +def _tpu_cores_valid(tpu_cores: Any) -> bool: + # allow 1 or 8 cores + if tpu_cores in (1, 8, None): + return True + + # allow picking 1 of 8 indexes + if isinstance(tpu_cores, (list, tuple, set)): + has_1_tpu_idx = len(tpu_cores) == 1 + is_valid_tpu_idx = 1 <= list(tpu_cores)[0] <= 8 + + is_valid_tpu_core_choice = has_1_tpu_idx and is_valid_tpu_idx + return is_valid_tpu_core_choice + + return False + + +def _parse_tpu_cores_str(tpu_cores: str) -> Union[int, List[int]]: + if tpu_cores in ("1", "8"): + return int(tpu_cores) + return [int(x.strip()) for x in tpu_cores.split(",") if len(x) > 0] diff --git a/src/lightning_lite/connector.py b/src/lightning_lite/connector.py index 4b91c1f328798..ed1f54524fe0f 100644 --- a/src/lightning_lite/connector.py +++ b/src/lightning_lite/connector.py @@ -52,7 +52,8 @@ XLAStrategy, ) from lightning_lite.strategies.ddp_spawn import _DDP_FORK_ALIASES -from lightning_lite.utilities import _StrategyType, device_parser, rank_zero_deprecation, rank_zero_info, rank_zero_warn +from lightning_lite.utilities import _StrategyType, rank_zero_deprecation, rank_zero_info, rank_zero_warn +from lightning_lite.utilities.device_parser import determine_root_gpu_device from lightning_lite.utilities.imports import _HPU_AVAILABLE, _IPU_AVAILABLE, _IS_INTERACTIVE, _TPU_AVAILABLE _PLUGIN = Union[Strategy, Precision, ClusterEnvironment, CheckpointIO] @@ -429,7 +430,7 @@ def _choose_strategy(self) -> Union[Strategy, str]: if isinstance(self._accelerator_flag, (CUDAAccelerator, MPSAccelerator)) or ( isinstance(self._accelerator_flag, str) and self._accelerator_flag in ("cuda", "gpu", "mps") ): - device = device_parser.determine_root_gpu_device(self._parallel_devices) + device = determine_root_gpu_device(self._parallel_devices) else: device = "cpu" # TODO: lazy initialized device, then here could be self._strategy_flag = "single_device" diff --git a/src/lightning_lite/utilities/device_parser.py b/src/lightning_lite/utilities/device_parser.py index 04aa14ecdbcd2..8a04e9b625e25 100644 --- a/src/lightning_lite/utilities/device_parser.py +++ b/src/lightning_lite/utilities/device_parser.py @@ -1,10 +1,8 @@ -import multiprocessing from typing import Any, List, MutableSequence, Optional, Tuple, Union -import torch - +from lightning_lite.accelerators.cuda import _get_all_available_cuda_gpus +from lightning_lite.accelerators.mps import _get_all_available_mps_gpus from lightning_lite.plugins.environments.torchelastic_environment import TorchElasticEnvironment -from lightning_lite.strategies.launchers.multiprocessing import _is_forking_disabled from lightning_lite.utilities.exceptions import MisconfigurationException from lightning_lite.utilities.types import _DEVICE @@ -94,58 +92,6 @@ def parse_gpu_ids( return _sanitize_gpu_ids(gpus, include_cuda=include_cuda,
include_mps=include_mps) -def parse_tpu_cores(tpu_cores: Optional[Union[int, str, List[int]]]) -> Optional[Union[int, List[int]]]: - """ - Parses the tpu_cores given in the format as accepted by the - :class:`~pytorch_lightning.trainer.Trainer`. - - Args: - tpu_cores: An int of 1 or string '1' indicates that 1 core with multi-processing should be used - An int 8 or string '8' indicates that all 8 cores with multi-processing should be used - A list of ints or a strings containing a list of comma separated integers - indicates the specific TPU core to use. - - Returns: - A list of tpu_cores to be used or ``None`` if no TPU cores were requested - - Raises: - MisconfigurationException: - If TPU cores aren't 1, 8 or [<1-8>] - """ - _check_data_type(tpu_cores) - - if isinstance(tpu_cores, str): - tpu_cores = _parse_tpu_cores_str(tpu_cores.strip()) - - if not _tpu_cores_valid(tpu_cores): - raise MisconfigurationException("`tpu_cores` can only be 1, 8 or [<1-8>]") - - return tpu_cores - - -def parse_cpu_cores(cpu_cores: Union[int, str, List[int]]) -> int: - """Parses the cpu_cores given in the format as accepted by the ``devices`` argument in the - :class:`~pytorch_lightning.trainer.Trainer`. - - Args: - cpu_cores: An int > 0. - - Returns: - An int representing the number of processes - - Raises: - MisconfigurationException: - If cpu_cores is not an int > 0 - """ - if isinstance(cpu_cores, str) and cpu_cores.strip().isdigit(): - cpu_cores = int(cpu_cores) - - if not isinstance(cpu_cores, int) or cpu_cores <= 0: - raise MisconfigurationException("`devices` selected with `CPUAccelerator` should be an int > 0.") - - return cpu_cores - - def _normalize_parse_gpu_string_input(s: Union[int, str, List[int]]) -> Union[int, List[int]]: if not isinstance(s, str): return s @@ -207,25 +153,6 @@ def _get_all_available_gpus(include_cuda: bool = False, include_mps: bool = Fals return cuda_gpus + mps_gpus -def _get_all_available_mps_gpus() -> List[int]: - """ - Returns: - A list of all available MPS GPUs - """ - # lazy import to avoid circular dependencies - from lightning_lite.accelerators.mps import MPSAccelerator - - return [0] if MPSAccelerator.is_available() else [] - - -def _get_all_available_cuda_gpus() -> List[int]: - """ - Returns: - A list of all available CUDA GPUs - """ - return list(range(num_cuda_devices())) - - def _check_unique(device_ids: List[int]) -> None: """Checks that the device_ids are unique. @@ -260,49 +187,3 @@ def _check_data_type(device_ids: Any) -> None: raise MisconfigurationException(f"{msg} a sequence of {type(id_).__name__}.") elif type(device_ids) not in (int, str): raise MisconfigurationException(f"{msg} {type(device_ids).__name__}.") - - -def _tpu_cores_valid(tpu_cores: Any) -> bool: - # allow 1 or 8 cores - if tpu_cores in (1, 8, None): - return True - - # allow picking 1 of 8 indexes - if isinstance(tpu_cores, (list, tuple, set)): - has_1_tpu_idx = len(tpu_cores) == 1 - is_valid_tpu_idx = 1 <= list(tpu_cores)[0] <= 8 - - is_valid_tpu_core_choice = has_1_tpu_idx and is_valid_tpu_idx - return is_valid_tpu_core_choice - - return False - - -def _parse_tpu_cores_str(tpu_cores: str) -> Union[int, List[int]]: - if tpu_cores in ("1", "8"): - return int(tpu_cores) - return [int(x.strip()) for x in tpu_cores.split(",") if len(x) > 0] - - -def num_cuda_devices() -> int: - """Returns the number of GPUs available. - - Unlike :func:`torch.cuda.device_count`, this function does its best not to create a CUDA context for fork support, - if the platform allows it. 
- """ - if "fork" not in torch.multiprocessing.get_all_start_methods() or _is_forking_disabled(): - return torch.cuda.device_count() - with multiprocessing.get_context("fork").Pool(1) as pool: - return pool.apply(torch.cuda.device_count) - - -def is_cuda_available() -> bool: - """Returns a bool indicating if CUDA is currently available. - - Unlike :func:`torch.cuda.is_available`, this function does its best not to create a CUDA context for fork support, - if the platform allows it. - """ - if "fork" not in torch.multiprocessing.get_all_start_methods() or _is_forking_disabled(): - return torch.cuda.is_available() - with multiprocessing.get_context("fork").Pool(1) as pool: - return pool.apply(torch.cuda.is_available) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 367d32210e2e4..fe7cdeceff1eb 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -106,6 +106,16 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Deprecated the functions in `pytorch_lightning.utilities.apply_func` in favor of `lightning_utilities.core.apply_func` ([#14516](https://github.com/Lightning-AI/lightning/pull/14516), [#14537](https://github.com/Lightning-AI/lightning/pull/14537)) +- Deprecated all functions in `pytorch_lightning.utilities.device_parser` ([#14492](https://github.com/Lightning-AI/lightning/pull/14492), [#14753](https://github.com/Lightning-AI/lightning/pull/14753)) + * Deprecated the `pytorch_lightning.utilities.device_parser.determine_root_gpu_device` in favor of `lightning_lite.utilities.device_parser.determine_root_gpu_device` + * Deprecated the `pytorch_lightning.utilities.device_parser.parse_gpu_ids` in favor of `lightning_lite.utilities.device_parser.parse_gpu_ids` + * Deprecated the `pytorch_lightning.utilities.device_parser.is_cuda_available` in favor of `lightning_lite.accelerators.cuda.is_cuda_available` + * Deprecated the `pytorch_lightning.utilities.device_parser.num_cuda_devices` in favor of `lightning_lite.accelerators.cuda.num_cuda_devices` + * Deprecated the `pytorch_lightning.utilities.device_parser.parse_cpu_cores` in favor of `lightning_lite.accelerators.cpu.parse_cpu_cores` + * Deprecated the `pytorch_lightning.utilities.device_parser.parse_tpu_cores` in favor of `lightning_lite.accelerators.tpu.parse_tpu_cores` + * Deprecated the `pytorch_lightning.utilities.device_parser.parse_hpus` in favor of `pytorch_lightning.accelerators.hpu.parse_hpus` + + ### Removed - Removed the deprecated `Trainer.training_type_plugin` property in favor of `Trainer.strategy` ([#14011](https://github.com/Lightning-AI/lightning/pull/14011)) diff --git a/src/pytorch_lightning/accelerators/cpu.py b/src/pytorch_lightning/accelerators/cpu.py index 4369233350ad7..3465e1e39c05c 100644 --- a/src/pytorch_lightning/accelerators/cpu.py +++ b/src/pytorch_lightning/accelerators/cpu.py @@ -15,7 +15,7 @@ import torch -from lightning_lite.utilities.device_parser import parse_cpu_cores +from lightning_lite.accelerators.cpu import parse_cpu_cores from lightning_lite.utilities.types import _DEVICE from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/src/pytorch_lightning/accelerators/cuda.py b/src/pytorch_lightning/accelerators/cuda.py index 03b7eadf55cd5..4c91691ccbde3 100644 --- a/src/pytorch_lightning/accelerators/cuda.py +++ b/src/pytorch_lightning/accelerators/cuda.py @@ -20,7 +20,8 @@ import torch import 
pytorch_lightning as pl -from lightning_lite.utilities import device_parser +from lightning_lite.accelerators.cuda import num_cuda_devices +from lightning_lite.utilities.device_parser import parse_gpu_ids from lightning_lite.utilities.types import _DEVICE from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -51,7 +52,7 @@ def setup(self, trainer: "pl.Trainer") -> None: def set_nvidia_flags(local_rank: int) -> None: # set the correct cuda visible devices (using pci order) os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - all_gpu_ids = ",".join(str(x) for x in range(device_parser.num_cuda_devices())) + all_gpu_ids = ",".join(str(x) for x in range(num_cuda_devices())) devices = os.getenv("CUDA_VISIBLE_DEVICES", all_gpu_ids) _log.info(f"LOCAL_RANK: {local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]") @@ -77,7 +78,7 @@ def teardown(self) -> None: @staticmethod def parse_devices(devices: Union[int, str, List[int]]) -> Optional[List[int]]: """Accelerator device parsing logic.""" - return device_parser.parse_gpu_ids(devices, include_cuda=True) + return parse_gpu_ids(devices, include_cuda=True) @staticmethod def get_parallel_devices(devices: List[int]) -> List[torch.device]: @@ -87,11 +88,11 @@ def get_parallel_devices(devices: List[int]) -> List[torch.device]: @staticmethod def auto_device_count() -> int: """Get the devices when set to auto.""" - return device_parser.num_cuda_devices() + return num_cuda_devices() @staticmethod def is_available() -> bool: - return device_parser.num_cuda_devices() > 0 + return num_cuda_devices() > 0 @classmethod def register_accelerators(cls, accelerator_registry: Dict) -> None: @@ -155,6 +156,6 @@ def _to_float(x: str) -> float: def _get_gpu_id(device_id: int) -> str: """Get the unmasked real GPU IDs.""" # All devices if `CUDA_VISIBLE_DEVICES` unset - default = ",".join(str(i) for i in range(device_parser.num_cuda_devices())) + default = ",".join(str(i) for i in range(num_cuda_devices())) cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES", default=default).split(",") return cuda_visible_devices[device_id].strip() diff --git a/src/pytorch_lightning/accelerators/hpu.py b/src/pytorch_lightning/accelerators/hpu.py index 3d18a0ad556cf..9703a054b07b6 100644 --- a/src/pytorch_lightning/accelerators/hpu.py +++ b/src/pytorch_lightning/accelerators/hpu.py @@ -18,7 +18,6 @@ from lightning_lite.utilities.types import _DEVICE from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.utilities.device_parser import parse_hpus from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _HPU_AVAILABLE from pytorch_lightning.utilities.rank_zero import rank_zero_debug @@ -102,5 +101,26 @@ def register_accelerators(cls, accelerator_registry: Dict) -> None: accelerator_registry.register( "hpu", cls, - description=f"{cls.__class__.__name__}", + description=cls.__class__.__name__, ) + + +def parse_hpus(devices: Optional[Union[int, str, List[int]]]) -> Optional[int]: + """ + Parses the hpus given in the format as accepted by the + :class:`~pytorch_lightning.trainer.Trainer` for the `devices` flag. 
+ + Args: + devices: An integer that indicates the number of Gaudi devices to be used + + Returns: + Either an integer or ``None`` if no devices were requested + + Raises: + MisconfigurationException: + If devices aren't of type `int` or `str` + """ + if devices is not None and not isinstance(devices, (int, str)): + raise MisconfigurationException("`devices` for `HPUAccelerator` must be int, string or None.") + + return int(devices) if isinstance(devices, str) else devices diff --git a/src/pytorch_lightning/accelerators/mps.py b/src/pytorch_lightning/accelerators/mps.py index 6fa6f423fbed7..6cbefa5f43da4 100644 --- a/src/pytorch_lightning/accelerators/mps.py +++ b/src/pytorch_lightning/accelerators/mps.py @@ -16,7 +16,7 @@ import torch from lightning_lite.accelerators.mps import MPSAccelerator as _MPSAccelerator -from lightning_lite.utilities import device_parser +from lightning_lite.utilities.device_parser import parse_gpu_ids from lightning_lite.utilities.types import _DEVICE from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -45,7 +45,7 @@ def teardown(self) -> None: @staticmethod def parse_devices(devices: Union[int, str, List[int]]) -> Optional[List[int]]: """Accelerator device parsing logic.""" - parsed_devices = device_parser.parse_gpu_ids(devices, include_mps=True) + parsed_devices = parse_gpu_ids(devices, include_mps=True) return parsed_devices @staticmethod diff --git a/src/pytorch_lightning/accelerators/tpu.py b/src/pytorch_lightning/accelerators/tpu.py index 8637de9095dd2..ddb981d3545a1 100644 --- a/src/pytorch_lightning/accelerators/tpu.py +++ b/src/pytorch_lightning/accelerators/tpu.py @@ -15,7 +15,7 @@ import torch -from lightning_lite.utilities import device_parser +from lightning_lite.accelerators.tpu import parse_tpu_cores from lightning_lite.utilities.types import _DEVICE from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.utilities.imports import _TPU_AVAILABLE @@ -53,7 +53,7 @@ def teardown(self) -> None: @staticmethod def parse_devices(devices: Union[int, str, List[int]]) -> Optional[Union[int, List[int]]]: """Accelerator device parsing logic.""" - return device_parser.parse_tpu_cores(devices) + return parse_tpu_cores(devices) @staticmethod def get_parallel_devices(devices: Union[int, List[int]]) -> List[int]: diff --git a/src/pytorch_lightning/profilers/pytorch.py b/src/pytorch_lightning/profilers/pytorch.py index 475db682d953f..7652c416d23b0 100644 --- a/src/pytorch_lightning/profilers/pytorch.py +++ b/src/pytorch_lightning/profilers/pytorch.py @@ -24,7 +24,7 @@ from torch import nn, Tensor from torch.autograd.profiler import record_function -from lightning_lite.utilities.device_parser import is_cuda_available +from lightning_lite.accelerators.cuda import is_cuda_available from pytorch_lightning.profilers.profiler import Profiler from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _KINETO_AVAILABLE diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index 2231e2d7f7212..c10b34ef3d95b 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -28,7 +28,8 @@ SLURMEnvironment, TorchElasticEnvironment, ) -from lightning_lite.utilities import _StrategyType, AMPType, device_parser, 
LightningEnum +from lightning_lite.utilities import _StrategyType, AMPType, LightningEnum +from lightning_lite.utilities.device_parser import determine_root_gpu_device from pytorch_lightning.accelerators import AcceleratorRegistry from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.cpu import CPUAccelerator @@ -592,7 +593,7 @@ def _choose_strategy(self) -> Union[Strategy, str]: if isinstance(self._accelerator_flag, (CUDAAccelerator, MPSAccelerator)) or ( isinstance(self._accelerator_flag, str) and self._accelerator_flag in ("cuda", "gpu", "mps") ): - device = device_parser.determine_root_gpu_device(self._parallel_devices) + device = determine_root_gpu_device(self._parallel_devices) else: device = "cpu" # TODO: lazy initialized device, then here could be self._strategy_flag = "single_device" diff --git a/src/pytorch_lightning/tuner/auto_gpu_select.py b/src/pytorch_lightning/tuner/auto_gpu_select.py index 5b165c9d9409c..cb320afb9f884 100644 --- a/src/pytorch_lightning/tuner/auto_gpu_select.py +++ b/src/pytorch_lightning/tuner/auto_gpu_select.py @@ -15,7 +15,7 @@ import torch -from lightning_lite.utilities import device_parser +from lightning_lite.accelerators.cuda import num_cuda_devices from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -32,7 +32,7 @@ def pick_multiple_gpus(nb: int) -> List[int]: " Please select a valid number of GPU resources when using auto_select_gpus." ) - num_gpus = device_parser.num_cuda_devices() + num_gpus = num_cuda_devices() if nb > num_gpus: raise MisconfigurationException(f"You requested {nb} GPUs but your machine only has {num_gpus} GPUs.") nb = num_gpus if nb == -1 else nb @@ -52,7 +52,7 @@ def pick_single_gpu(exclude_gpus: List[int]) -> int: """ previously_used_gpus = [] unused_gpus = [] - for i in range(device_parser.num_cuda_devices()): + for i in range(num_cuda_devices()): if i in exclude_gpus: continue diff --git a/src/pytorch_lightning/utilities/device_parser.py b/src/pytorch_lightning/utilities/device_parser.py index b1337c25547b6..b82ca520b265b 100644 --- a/src/pytorch_lightning/utilities/device_parser.py +++ b/src/pytorch_lightning/utilities/device_parser.py @@ -11,37 +11,24 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Any, List, Optional, Union +from typing import Any +from lightning_lite.accelerators.cpu import parse_cpu_cores as new_parse_cpu_cores +from lightning_lite.accelerators.cuda import is_cuda_available as new_is_cuda_available +from lightning_lite.accelerators.cuda import num_cuda_devices as new_num_cuda_devices +from lightning_lite.accelerators.tpu import parse_tpu_cores as new_parse_tpu_cores from lightning_lite.utilities.device_parser import determine_root_gpu_device as new_determine_root_gpu_device -from lightning_lite.utilities.device_parser import is_cuda_available as new_is_cuda_available -from lightning_lite.utilities.device_parser import num_cuda_devices as new_num_cuda_devices -from lightning_lite.utilities.device_parser import parse_cpu_cores as new_parse_cpu_cores from lightning_lite.utilities.device_parser import parse_gpu_ids as new_parse_gpu_ids -from lightning_lite.utilities.device_parser import parse_tpu_cores as new_parse_tpu_cores -from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.accelerators.hpu import parse_hpus as new_parse_hpus from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation -def parse_hpus(devices: Optional[Union[int, str, List[int]]]) -> Optional[int]: - """ - Parses the hpus given in the format as accepted by the - :class:`~pytorch_lightning.trainer.Trainer` for the `devices` flag. - - Args: - devices: An integer that indicates the number of Gaudi devices to be used - - Returns: - Either an integer or ``None`` if no devices were requested - - Raises: - MisconfigurationException: - If devices aren't of type `int` or `str` - """ - if devices is not None and not isinstance(devices, (int, str)): - raise MisconfigurationException("`devices` for `HPUAccelerator` must be int, string or None.") - - return int(devices) if isinstance(devices, str) else devices +def parse_hpus(*args: Any, **kwargs: Any) -> Any: + rank_zero_deprecation( + "`pytorch_lightning.utilities.device_parser.parse_hpus` has been deprecated in v1.8.0 and will" + " be removed in v1.10.0. Please use `pytorch_lightning.accelerators.hpu.parse_hpus` instead." + ) + return new_parse_hpus(*args, **kwargs) def determine_root_gpu_device(*args: Any, **kwargs: Any) -> Any: @@ -55,7 +42,7 @@ def determine_root_gpu_device(*args: Any, **kwargs: Any) -> Any: def is_cuda_available() -> bool: rank_zero_deprecation( "`pytorch_lightning.utilities.device_parser.is_cuda_available` has been deprecated in v1.8.0 and will" - " be removed in v1.10.0. Please use `lightning_lite.utilities.device_parser.is_cuda_available` instead." + " be removed in v1.10.0. Please use `lightning_lite.accelerators.cuda.is_cuda_available` instead." ) return new_is_cuda_available() @@ -63,7 +50,7 @@ def is_cuda_available() -> bool: def num_cuda_devices() -> int: rank_zero_deprecation( "`pytorch_lightning.utilities.device_parser.num_cuda_devices` has been deprecated in v1.8.0 and will" - " be removed in v1.10.0. Please use `lightning_lite.utilities.device_parser.num_cuda_devices` instead." + " be removed in v1.10.0. Please use `lightning_lite.accelerators.cuda.num_cuda_devices` instead." 
) return new_num_cuda_devices() diff --git a/tests/tests_lite/accelerators/test_cuda.py b/tests/tests_lite/accelerators/test_cuda.py index 85106ed5c8c5b..1c2c7a8ac33d8 100644 --- a/tests/tests_lite/accelerators/test_cuda.py +++ b/tests/tests_lite/accelerators/test_cuda.py @@ -20,7 +20,7 @@ from lightning_lite.accelerators.cuda import CUDAAccelerator -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) +@mock.patch("lightning_lite.accelerators.cuda.num_cuda_devices", return_value=2) def test_auto_device_count(_): assert CUDAAccelerator.auto_device_count() == 2 diff --git a/tests/tests_lite/test_connector.py b/tests/tests_lite/test_connector.py index 87121b4888657..72b01c7bfcae8 100644 --- a/tests/tests_lite/test_connector.py +++ b/tests/tests_lite/test_connector.py @@ -89,7 +89,7 @@ def _test_strategy_choice_ddp_and_cpu(ddp_strategy_class): "SLURM_LOCALID": "0", }, ) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=0) +@mock.patch("lightning_lite.accelerators.cuda.num_cuda_devices", return_value=0) def test_custom_cluster_environment_in_slurm_environment(_): """Test that we choose the custom cluster even when SLURM or TE flags are around.""" @@ -124,7 +124,7 @@ def creates_processes_externally(self) -> bool: "SLURM_LOCALID": "0", }, ) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=0) +@mock.patch("lightning_lite.accelerators.cuda.num_cuda_devices", return_value=0) def test_custom_accelerator(*_): class Accel(Accelerator): def setup_device(self, device: torch.device) -> None: @@ -191,7 +191,7 @@ class Strat(DDPStrategy): "SLURM_LOCALID": "0", }, ) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=0) +@mock.patch("lightning_lite.accelerators.cuda.num_cuda_devices", return_value=0) def test_dist_backend_accelerator_mapping(*_): connector = _Connector(strategy="ddp_spawn", accelerator="cpu", devices=2) assert isinstance(connector.accelerator, CPUAccelerator) @@ -199,9 +199,9 @@ def test_dist_backend_accelerator_mapping(*_): assert connector.strategy.local_rank == 0 -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) -@mock.patch("lightning_lite.utilities.device_parser._get_all_available_mps_gpus", return_value=[0, 1]) -def test_ipython_incompatible_backend_error(_, __, monkeypatch): +@RunIf(mps=False) +@mock.patch("lightning_lite.accelerators.cuda.num_cuda_devices", return_value=2) +def test_ipython_incompatible_backend_error(_, monkeypatch): monkeypatch.setattr(lightning_lite.utilities, "_IS_INTERACTIVE", True) with pytest.raises(RuntimeError, match=r"strategy='ddp'\)`.*is not compatible"): _Connector(strategy="ddp", accelerator="gpu", devices=2) @@ -217,7 +217,7 @@ def test_ipython_incompatible_backend_error(_, __, monkeypatch): _Connector(strategy="dp") -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) +@mock.patch("lightning_lite.accelerators.cuda.num_cuda_devices", return_value=2) def test_ipython_compatible_dp_strategy_gpu(_, monkeypatch): monkeypatch.setattr(lightning_lite.utilities, "_IS_INTERACTIVE", True) connector = _Connector(strategy="dp", accelerator="gpu") @@ -239,6 +239,7 @@ def test_ipython_compatible_strategy_ddp_fork(monkeypatch): assert connector.strategy.launcher.is_interactive_compatible +@RunIf(mps=False) @pytest.mark.parametrize( ["strategy", "strategy_class"], [ @@ -250,15 +251,13 @@ def test_ipython_compatible_strategy_ddp_fork(monkeypatch): ], ) 
@pytest.mark.parametrize("devices", [1, 2]) -@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) -@mock.patch("lightning_lite.utilities.device_parser._get_all_available_mps_gpus", return_value=[0, 1]) -def test_accelerator_choice_multi_node_gpu(_, __, ___, strategy, strategy_class, devices): +@mock.patch("lightning_lite.accelerators.cuda.num_cuda_devices", return_value=2) +def test_accelerator_choice_multi_node_gpu(_, strategy, strategy_class, devices): connector = _Connector(num_nodes=2, accelerator="gpu", strategy=strategy, devices=devices) assert isinstance(connector.strategy, strategy_class) -@mock.patch("lightning_lite.accelerators.cuda.device_parser.num_cuda_devices", return_value=0) +@mock.patch("lightning_lite.accelerators.cuda.num_cuda_devices", return_value=0) def test_accelerator_cpu(*_): connector = _Connector(accelerator="cpu") assert isinstance(connector.accelerator, CPUAccelerator) @@ -280,10 +279,9 @@ def test_accelerator_cpu(*_): _Connector(accelerator="cpu", gpus=1) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) -@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("lightning_lite.accelerators.cuda.num_cuda_devices", return_value=2) @pytest.mark.parametrize("device_count", (["0"], [0, "1"], ["GPU"], [["0", "1"], [0, 1]], [False])) -def test_accelererator_invalid_type_devices(_, __, device_count): +def test_accelererator_invalid_type_devices(_, device_count): with pytest.raises( MisconfigurationException, match=r"must be an int, a string, a sequence of ints or None, but you" ): @@ -439,8 +437,7 @@ def test_strategy_choice_ddp_fork_cpu(): @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) -@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("lightning_lite.accelerators.cuda.num_cuda_devices", return_value=2) @mock.patch("lightning_lite.accelerators.mps.MPSAccelerator.is_available", return_value=False) def test_strategy_choice_ddp(*_): connector = _Connector(strategy="ddp", accelerator="gpu", devices=1) @@ -450,8 +447,7 @@ def test_strategy_choice_ddp(*_): @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) -@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("lightning_lite.accelerators.cuda.num_cuda_devices", return_value=2) @mock.patch("lightning_lite.accelerators.mps.MPSAccelerator.is_available", return_value=False) def test_strategy_choice_ddp_spawn(*_): connector = _Connector(strategy="ddp_spawn", accelerator="gpu", devices=1) @@ -460,11 +456,10 @@ def test_strategy_choice_ddp_spawn(*_): assert isinstance(connector.strategy.cluster_environment, LightningEnvironment) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) -@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("lightning_lite.accelerators.cuda.num_cuda_devices", return_value=2) @pytest.mark.parametrize("job_name,expected_env", [("some_name", SLURMEnvironment), ("bash", LightningEnvironment)]) @pytest.mark.parametrize("strategy", ["ddp", DDPStrategy]) -def test_strategy_choice_ddp_slurm(_, __, strategy, job_name, 
expected_env): +def test_strategy_choice_ddp_slurm(_, strategy, job_name, expected_env): if not isinstance(strategy, str): strategy = strategy() @@ -497,8 +492,7 @@ def test_strategy_choice_ddp_slurm(_, __, strategy, job_name, expected_env): "TORCHELASTIC_RUN_ID": "1", }, ) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) -@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("lightning_lite.accelerators.cuda.num_cuda_devices", return_value=2) @mock.patch("lightning_lite.accelerators.mps.MPSAccelerator.is_available", return_value=False) def test_strategy_choice_ddp_te(*_): connector = _Connector(strategy="ddp", accelerator="gpu", devices=2) @@ -540,8 +534,7 @@ def test_strategy_choice_ddp_cpu_te(): "RANK": "1", }, ) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1) -@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("lightning_lite.accelerators.cuda.num_cuda_devices", return_value=1) @mock.patch("lightning_lite.accelerators.mps.MPSAccelerator.is_available", return_value=False) def test_strategy_choice_ddp_kubeflow(*_): connector = _Connector(strategy="ddp", accelerator="gpu", devices=1) @@ -615,8 +608,7 @@ def test_devices_auto_choice_cpu(*_): @RunIf(mps=False) -@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) +@mock.patch("lightning_lite.accelerators.cuda.num_cuda_devices", return_value=2) def test_devices_auto_choice_gpu(*_): connector = _Connector(accelerator="auto", devices="auto") assert isinstance(connector.accelerator, CUDAAccelerator) @@ -679,7 +671,7 @@ def test_gpu_accelerator_backend_choice(expected_accelerator_flag, expected_acce @mock.patch("lightning_lite.accelerators.mps.MPSAccelerator.is_available", return_value=False) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1) +@mock.patch("lightning_lite.accelerators.cuda.num_cuda_devices", return_value=1) def test_gpu_accelerator_backend_choice_cuda(*_): connector = _Connector(accelerator="gpu") assert connector._accelerator_flag == "cuda" @@ -687,7 +679,7 @@ def test_gpu_accelerator_backend_choice_cuda(*_): @mock.patch("lightning_lite.accelerators.mps.MPSAccelerator.is_available", return_value=True) -@mock.patch("lightning_lite.utilities.device_parser._get_all_available_mps_gpus", return_value=[0]) +@mock.patch("lightning_lite.accelerators.mps._get_all_available_mps_gpus", return_value=[0]) @mock.patch("torch.device", return_value="mps") # necessary because torch doesn't allow creation of mps devices def test_gpu_accelerator_backend_choice_mps(*_): connector = _Connector(accelerator="gpu") diff --git a/tests/tests_lite/utilities/test_device_parser.py b/tests/tests_lite/utilities/test_device_parser.py index 09e35fb61d51c..9f1d9d9a782f7 100644 --- a/tests/tests_lite/utilities/test_device_parser.py +++ b/tests/tests_lite/utilities/test_device_parser.py @@ -16,32 +16,14 @@ import pytest import torch +from lightning_lite.accelerators.cpu import parse_cpu_cores +from lightning_lite.accelerators.cuda import is_cuda_available, num_cuda_devices from lightning_lite.utilities import device_parser from lightning_lite.utilities.exceptions import MisconfigurationException _PRETEND_N_OF_GPUS = 16 -@pytest.fixture -def mocked_device_count(monkeypatch): - def device_count(): - return _PRETEND_N_OF_GPUS - 
- def is_available(): - return True - - monkeypatch.setattr(device_parser, "is_cuda_available", is_available) - monkeypatch.setattr(device_parser, "num_cuda_devices", device_count) - - -@pytest.fixture -def mocked_device_count_0(monkeypatch): - def device_count(): - return 0 - - monkeypatch.setattr(device_parser, "num_cuda_devices", device_count) - - @pytest.mark.parametrize( ["devices", "expected_root_gpu"], [ @@ -75,29 +57,34 @@ def test_determine_root_gpu_device(devices, expected_root_gpu): pytest.param("-1", list(range(_PRETEND_N_OF_GPUS)), id="'-1' - use all gpus"), ], ) -def test_parse_gpu_ids(mocked_device_count, devices, expected_gpu_ids): +@mock.patch("lightning_lite.accelerators.cuda.num_cuda_devices", return_value=_PRETEND_N_OF_GPUS) +def test_parse_gpu_ids(_, devices, expected_gpu_ids): assert device_parser.parse_gpu_ids(devices, include_cuda=True) == expected_gpu_ids @pytest.mark.parametrize("devices", [0.1, -2, False, [-1], [None], ["0"], [0, 0]]) -def test_parse_gpu_fail_on_unsupported_inputs(mocked_device_count, devices): +@mock.patch("lightning_lite.accelerators.cuda.num_cuda_devices", return_value=_PRETEND_N_OF_GPUS) +def test_parse_gpu_fail_on_unsupported_inputs(_, devices): with pytest.raises(MisconfigurationException): device_parser.parse_gpu_ids(devices, include_cuda=True) @pytest.mark.parametrize("devices", [[1, 2, 19], -1, "-1"]) -def test_parse_gpu_fail_on_non_existent_id(mocked_device_count_0, devices): +@mock.patch("lightning_lite.accelerators.cuda.num_cuda_devices", return_value=0) +def test_parse_gpu_fail_on_non_existent_id(_, devices): with pytest.raises(MisconfigurationException): device_parser.parse_gpu_ids(devices, include_cuda=True) -def test_parse_gpu_fail_on_non_existent_id_2(mocked_device_count): +@mock.patch("lightning_lite.accelerators.cuda.num_cuda_devices", return_value=_PRETEND_N_OF_GPUS) +def test_parse_gpu_fail_on_non_existent_id_2(_): with pytest.raises(MisconfigurationException): device_parser.parse_gpu_ids([1, 2, 19], include_cuda=True) @pytest.mark.parametrize("devices", [-1, "-1"]) -def test_parse_gpu_returns_none_when_no_devices_are_available(mocked_device_count_0, devices): +@mock.patch("lightning_lite.accelerators.cuda.num_cuda_devices", return_value=0) +def test_parse_gpu_returns_none_when_no_devices_are_available(_, devices): with pytest.raises(MisconfigurationException): device_parser.parse_gpu_ids(devices, include_cuda=True) @@ -110,12 +97,12 @@ def test_parse_gpu_returns_none_when_no_devices_are_available(mocked_device_coun def test_num_cuda_devices_without_forking(*_): """This merely tests that on platforms without fork support our helper functions fall back to the default implementation for determining cuda availability.""" - assert device_parser.is_cuda_available() - assert device_parser.num_cuda_devices() == 2 + assert is_cuda_available() + assert num_cuda_devices() == 2 @pytest.mark.parametrize("devices", ([3], -1)) def test_invalid_devices_with_cpu_accelerator(devices): """Test invalid device flag raises MisconfigurationException.""" - with pytest.raises(MisconfigurationException, match="should be an int > 0"): - device_parser.parse_cpu_cores(devices) + with pytest.raises(TypeError, match="should be an int > 0"): + parse_cpu_cores(devices) diff --git a/tests/tests_pytorch/accelerators/test_common.py b/tests/tests_pytorch/accelerators/test_common.py index 3eeda536e4c72..a2fab353e4caf 100644 --- a/tests/tests_pytorch/accelerators/test_common.py +++ b/tests/tests_pytorch/accelerators/test_common.py @@ -12,23 +12,14 @@ # See the 
License for the specific language governing permissions and # limitations under the License. from typing import Any, Dict -from unittest import mock import torch from pytorch_lightning import Trainer -from pytorch_lightning.accelerators import Accelerator, CPUAccelerator, CUDAAccelerator, IPUAccelerator, TPUAccelerator +from pytorch_lightning.accelerators import Accelerator from pytorch_lightning.strategies import DDPStrategy -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) -def test_auto_device_count(_): - assert CPUAccelerator.auto_device_count() == 1 - assert CUDAAccelerator.auto_device_count() == 2 - assert TPUAccelerator.auto_device_count() == 8 - assert IPUAccelerator.auto_device_count() == 4 - - def test_pluggable_accelerator(): class TestAccelerator(Accelerator): def setup_device(self, device: torch.device) -> None: diff --git a/tests/tests_pytorch/accelerators/test_cpu.py b/tests/tests_pytorch/accelerators/test_cpu.py index 717acff318633..4bd14c30d07f6 100644 --- a/tests/tests_pytorch/accelerators/test_cpu.py +++ b/tests/tests_pytorch/accelerators/test_cpu.py @@ -13,7 +13,6 @@ from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin from pytorch_lightning.strategies import SingleDeviceStrategy -from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests_pytorch.helpers.runif import RunIf @@ -78,10 +77,3 @@ def load_checkpoint(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]: for func in (trainer.test, trainer.validate, trainer.predict): plugin.setup_called = False func(model, ckpt_path=checkpoint_path) - - -@pytest.mark.parametrize("devices", ([3], -1)) -def test_invalid_devices_with_cpu_accelerator(devices): - """Test invalid device flag raises MisconfigurationException with CPUAccelerator.""" - with pytest.raises(MisconfigurationException, match="should be an int > 0"): - Trainer(accelerator="cpu", devices=devices) diff --git a/tests/tests_pytorch/accelerators/test_gpu.py b/tests/tests_pytorch/accelerators/test_gpu.py index e660ff270f921..3835168c16546 100644 --- a/tests/tests_pytorch/accelerators/test_gpu.py +++ b/tests/tests_pytorch/accelerators/test_gpu.py @@ -24,7 +24,7 @@ @RunIf(min_cuda_gpus=1) -def test_get_torch_gpu_stats(tmpdir): +def test_get_torch_gpu_stats(): current_device = torch.device(f"cuda:{torch.cuda.current_device()}") gpu_stats = CUDAAccelerator().get_device_stats(current_device) fields = ["allocated_bytes.all.freed", "inactive_split.all.peak", "reserved_bytes.large_pool.peak"] @@ -34,7 +34,7 @@ def test_get_torch_gpu_stats(tmpdir): @RunIf(min_cuda_gpus=1) -def test_get_nvidia_gpu_stats(tmpdir): +def test_get_nvidia_gpu_stats(): current_device = torch.device(f"cuda:{torch.cuda.current_device()}") gpu_stats = get_nvidia_gpu_stats(current_device) fields = ["utilization.gpu", "memory.used", "memory.free", "utilization.memory"] diff --git a/tests/tests_pytorch/accelerators/test_ipu.py b/tests/tests_pytorch/accelerators/test_ipu.py index d5958eae0ec4f..35bb38794f988 100644 --- a/tests/tests_pytorch/accelerators/test_ipu.py +++ b/tests/tests_pytorch/accelerators/test_ipu.py @@ -96,6 +96,10 @@ def test_epoch_end(self, outputs) -> None: self.log("test_acc", torch.stack(outputs).mean()) +def test_auto_device_count(): + assert IPUAccelerator.auto_device_count() == 4 + + @pytest.mark.skipif(_IPU_AVAILABLE, reason="test requires non-IPU machine") 
@mock.patch("pytorch_lightning.accelerators.ipu.IPUAccelerator.is_available", return_value=True) def test_fail_if_no_ipus(_, tmpdir): diff --git a/tests/tests_pytorch/conftest.py b/tests/tests_pytorch/conftest.py index d3c54a9b15b9e..252837889f92e 100644 --- a/tests/tests_pytorch/conftest.py +++ b/tests/tests_pytorch/conftest.py @@ -22,6 +22,8 @@ import pytest import torch.distributed +import lightning_lite +import pytorch_lightning from lightning_lite.plugins.environments.lightning_environment import find_free_network_port from pytorch_lightning.trainer.connectors.signal_connector import SignalConnector from pytorch_lightning.utilities.imports import _IS_WINDOWS @@ -118,6 +120,47 @@ def reset_deterministic_algorithm(): torch.use_deterministic_algorithms(False) +def mock_cuda_count(monkeypatch, n: int) -> None: + monkeypatch.setattr(lightning_lite.accelerators.cuda, "num_cuda_devices", lambda: n) + monkeypatch.setattr(pytorch_lightning.accelerators.cuda, "num_cuda_devices", lambda: n) + monkeypatch.setattr(pytorch_lightning.tuner.auto_gpu_select, "num_cuda_devices", lambda: n) + + +@pytest.fixture(scope="function") +def cuda_count_0(monkeypatch): + mock_cuda_count(monkeypatch, 0) + + +@pytest.fixture(scope="function") +def cuda_count_1(monkeypatch): + mock_cuda_count(monkeypatch, 1) + + +@pytest.fixture(scope="function") +def cuda_count_2(monkeypatch): + mock_cuda_count(monkeypatch, 2) + + +@pytest.fixture(scope="function") +def cuda_count_4(monkeypatch): + mock_cuda_count(monkeypatch, 4) + + +def mock_mps_count(monkeypatch, n: int) -> None: + monkeypatch.setattr(lightning_lite.accelerators.mps, "_get_all_available_mps_gpus", lambda: list(range(n))) + monkeypatch.setattr(lightning_lite.accelerators.mps.MPSAccelerator, "is_available", lambda *_: n > 0) + + +@pytest.fixture(scope="function") +def mps_count_0(monkeypatch): + mock_mps_count(monkeypatch, 0) + + +@pytest.fixture(scope="function") +def mps_count_1(monkeypatch): + mock_mps_count(monkeypatch, 1) + + @pytest.fixture def caplog(caplog): """Workaround for https://github.com/pytest-dev/pytest/issues/3697. 
diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-10.py b/tests/tests_pytorch/deprecated_api/test_remove_1-10.py index 3eebceaadd40a..cfb58f6af4764 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-10.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-10.py @@ -48,6 +48,7 @@ num_cuda_devices, parse_cpu_cores, parse_gpu_ids, + parse_hpus, parse_tpu_cores, ) from pytorch_lightning.utilities.distributed import ( @@ -191,6 +192,9 @@ def test_v1_10_deprecated_device_parser_utilities(): with pytest.deprecated_call(match="device_parser.num_cuda_devices` has been deprecated in v1.8.0"): num_cuda_devices() + with pytest.deprecated_call(match="device_parser.parse_hpus` has been deprecated in v1.8.0"): + parse_hpus(1) + with pytest.deprecated_call(match="device_parser.parse_cpu_cores` has been deprecated in v1.8.0"): parse_cpu_cores(1) diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py index b6d21896715dc..f0910d57828c9 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py @@ -20,7 +20,6 @@ import pytest import pytorch_lightning -from lightning_lite.utilities import device_parser from pytorch_lightning import Callback, Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.demos.boring_classes import BoringDataModule, BoringModel @@ -526,9 +525,7 @@ def test_trainer_config_device_ids(): pytest.param(3, 0, "ddp", id="3 gpus, expect gpu root device to be 0.(backend:ddp)"), ], ) -def test_root_gpu_property(monkeypatch, gpus, expected_root_gpu, strategy): - monkeypatch.setattr(device_parser, "is_cuda_available", lambda: True) - monkeypatch.setattr(device_parser, "num_cuda_devices", lambda: 16) +def test_root_gpu_property(cuda_count_4, gpus, expected_root_gpu, strategy): with pytest.deprecated_call( match="`Trainer.root_gpu` is deprecated in v1.6 and will be removed in v1.8. " "Please use `Trainer.strategy.root_device.index` instead." @@ -544,8 +541,7 @@ def test_root_gpu_property(monkeypatch, gpus, expected_root_gpu, strategy): pytest.param(0, None, "ddp", id="None is None"), ], ) -def test_root_gpu_property_0_passing(monkeypatch, gpus, expected_root_gpu, strategy): - monkeypatch.setattr(device_parser, "num_cuda_devices", lambda: 0) +def test_root_gpu_property_0_passing(cuda_count_0, gpus, expected_root_gpu, strategy): with pytest.deprecated_call( match="`Trainer.root_gpu` is deprecated in v1.6 and will be removed in v1.8. " "Please use `Trainer.strategy.root_device.index` instead." 
@@ -559,14 +555,12 @@ def test_root_gpu_property_0_passing(monkeypatch, gpus, expected_root_gpu, strat pytest.param(None, 0, None, id="None - expect 0 gpu to use."), pytest.param(0, 0, None, id="Oth gpu, expect 1 gpu to use."), pytest.param(1, 1, None, id="1st gpu, expect 1 gpu to use."), - pytest.param(-1, 16, "ddp", id="-1 - use all gpus"), - pytest.param("-1", 16, "ddp", id="'-1' - use all gpus"), + pytest.param(-1, 4, "ddp", id="-1 - use all gpus"), + pytest.param("-1", 4, "ddp", id="'-1' - use all gpus"), pytest.param(3, 3, "ddp", id="3rd gpu - 1 gpu to use (backend:ddp)"), ], ) -def test_trainer_gpu_parse(monkeypatch, gpus, expected_num_gpus, strategy): - monkeypatch.setattr(device_parser, "is_cuda_available", lambda: True) - monkeypatch.setattr(device_parser, "num_cuda_devices", lambda: 16) +def test_trainer_gpu_parse(cuda_count_4, gpus, expected_num_gpus, strategy): with pytest.deprecated_call( match="`Trainer.num_gpus` was deprecated in v1.6 and will be removed in v1.8." " Please use `Trainer.num_devices` instead." @@ -581,8 +575,7 @@ def test_trainer_gpu_parse(monkeypatch, gpus, expected_num_gpus, strategy): pytest.param(None, 0, "ddp", id="None - expect 0 gpu to use."), ], ) -def test_trainer_num_gpu_0(monkeypatch, gpus, expected_num_gpus, strategy): - monkeypatch.setattr(device_parser, "num_cuda_devices", lambda: 0) +def test_trainer_num_gpu_0(cuda_count_0, gpus, expected_num_gpus, strategy): with pytest.deprecated_call( match="`Trainer.num_gpus` was deprecated in v1.6 and will be removed in v1.8." " Please use `Trainer.num_devices` instead." @@ -680,16 +673,16 @@ def on_save_checkpoint(self, trainer, pl_module, checkpoint): @pytest.mark.parametrize( "trainer_kwargs", [ - {"accelerator": "gpu", "devices": 2}, - {"accelerator": "gpu", "devices": [0, 2]}, - {"accelerator": "gpu", "devices": "2"}, - {"accelerator": "gpu", "devices": "0,"}, + pytest.param({"accelerator": "gpu", "devices": 2}, marks=RunIf(mps=False)), + pytest.param({"accelerator": "gpu", "devices": [0, 2]}, marks=RunIf(mps=False)), + pytest.param({"accelerator": "gpu", "devices": "2"}, marks=RunIf(mps=False)), + pytest.param({"accelerator": "gpu", "devices": "0,"}, marks=RunIf(mps=False)), + pytest.param({"accelerator": "gpu", "devices": 1}, marks=RunIf(mps=True)), + pytest.param({"accelerator": "gpu", "devices": [0]}, marks=RunIf(mps=True)), + pytest.param({"accelerator": "gpu", "devices": "0,"}, marks=RunIf(mps=True)), ], ) -def test_trainer_gpus(monkeypatch, trainer_kwargs): - monkeypatch.setattr(device_parser, "is_cuda_available", lambda: True) - monkeypatch.setattr(device_parser, "num_cuda_devices", lambda: 4) - monkeypatch.setattr(device_parser, "_get_all_available_mps_gpus", lambda: list(range(4))) +def test_trainer_gpus(cuda_count_4, trainer_kwargs): trainer = Trainer(**trainer_kwargs) with pytest.deprecated_call( match=( diff --git a/tests/tests_pytorch/deprecated_api/test_remove_2-0.py b/tests/tests_pytorch/deprecated_api/test_remove_2-0.py index bd359cc3234f2..3110948cd8ddf 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_2-0.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_2-0.py @@ -28,9 +28,7 @@ def test_v2_0_0_deprecated_num_processes(): _ = Trainer(num_processes=2) -@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) -def test_v2_0_0_deprecated_gpus(*_): +def test_v2_0_0_deprecated_gpus(cuda_count_4): with pytest.deprecated_call(match=r"is deprecated in 
v1.7 and will be removed in v2.0."): _ = Trainer(gpus=0) diff --git a/tests/tests_pytorch/models/test_gpu.py b/tests/tests_pytorch/models/test_gpu.py index fb71145e26c4f..b240360db6f6a 100644 --- a/tests/tests_pytorch/models/test_gpu.py +++ b/tests/tests_pytorch/models/test_gpu.py @@ -22,7 +22,7 @@ import tests_pytorch.helpers.pipelines as tpipes import tests_pytorch.helpers.utils as tutils from lightning_lite.plugins.environments import TorchElasticEnvironment -from lightning_lite.utilities import device_parser +from lightning_lite.utilities.device_parser import parse_gpu_ids from pytorch_lightning import Trainer from pytorch_lightning.accelerators import CPUAccelerator, CUDAAccelerator from pytorch_lightning.demos.boring_classes import BoringModel @@ -71,44 +71,23 @@ def test_single_gpu_model(tmpdir, devices): tpipes.run_model_test(trainer_options, model) -@pytest.fixture -def mocked_device_count(monkeypatch): - def device_count(): - return PRETEND_N_OF_GPUS - - def is_available(): - return True - - monkeypatch.setattr(device_parser, "is_cuda_available", is_available) - monkeypatch.setattr(device_parser, "num_cuda_devices", device_count) - - -@pytest.fixture -def mocked_device_count_0(monkeypatch): - def device_count(): - return 0 - - monkeypatch.setattr(device_parser, "num_cuda_devices", device_count) - - @pytest.mark.parametrize( - ["devices", "expected_root_gpu", "strategy"], + "devices", [ - (1, None, "ddp"), - (3, None, "ddp"), - (3, None, "ddp"), - ([1, 2], None, "ddp"), - ([0, 1], None, "ddp"), - (-1, None, "ddp"), - ("-1", None, "ddp"), + 1, + 3, + 3, + [1, 2], + [0, 1], + -1, + "-1", ], ) @mock.patch("lightning_lite.accelerators.mps.MPSAccelerator.is_available", return_value=False) -@mock.patch("lightning_lite.accelerators.cuda.CUDAAccelerator.is_available", return_value=False) -def test_root_gpu_property_0_raising(_, __, devices, expected_root_gpu, strategy): +def test_root_gpu_property_0_raising(_, devices): """Test that asking for a GPU when none are available will result in a MisconfigurationException.""" with pytest.raises(MisconfigurationException, match="No supported gpu backend found!"): - Trainer(accelerator="gpu", devices=devices, strategy=strategy) + Trainer(accelerator="gpu", devices=devices, strategy="ddp") @mock.patch.dict( @@ -123,20 +102,18 @@ def test_root_gpu_property_0_raising(_, __, devices, expected_root_gpu, strategy "TORCHELASTIC_RUN_ID": "1", }, ) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1) -@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) @pytest.mark.parametrize("gpus", [[0, 1, 2], 2, "0", [0, 2]]) -def test_torchelastic_gpu_parsing(mocked_device_count, mocked_is_available, gpus): +def test_torchelastic_gpu_parsing(cuda_count_1, gpus): """Ensure when using torchelastic and nproc_per_node is set to the default of 1 per GPU device That we omit sanitizing the gpus as only one of the GPUs is visible.""" with pytest.deprecated_call(match=r"is deprecated in v1.7 and will be removed in v2.0."): trainer = Trainer(gpus=gpus) assert isinstance(trainer._accelerator_connector.cluster_environment, TorchElasticEnvironment) # when use gpu - if device_parser.parse_gpu_ids(gpus, include_cuda=True) is not None: + if parse_gpu_ids(gpus, include_cuda=True) is not None: assert isinstance(trainer.accelerator, CUDAAccelerator) assert trainer.num_devices == len(gpus) if isinstance(gpus, list) else gpus - assert trainer.device_ids == device_parser.parse_gpu_ids(gpus, include_cuda=True) + 
assert trainer.device_ids == parse_gpu_ids(gpus, include_cuda=True) # fall back to cpu else: assert isinstance(trainer.accelerator, CPUAccelerator) diff --git a/tests/tests_pytorch/plugins/test_amp_plugins.py b/tests/tests_pytorch/plugins/test_amp_plugins.py index 087941855a7f3..f086df0755dc6 100644 --- a/tests/tests_pytorch/plugins/test_amp_plugins.py +++ b/tests/tests_pytorch/plugins/test_amp_plugins.py @@ -23,6 +23,7 @@ from pytorch_lightning.plugins import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12 +from tests_pytorch.conftest import mock_cuda_count from tests_pytorch.helpers.runif import RunIf if _TORCH_GREATER_EQUAL_1_12: @@ -52,8 +53,6 @@ class MyApexPlugin(ApexMixedPrecisionPlugin): "SLURM_LOCALID": "0", }, ) -@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) @pytest.mark.parametrize("strategy,devices", [("ddp", 2), ("ddp_spawn", 2)]) @pytest.mark.parametrize( "amp,custom_plugin,plugin_cls", @@ -64,7 +63,7 @@ class MyApexPlugin(ApexMixedPrecisionPlugin): pytest.param("apex", True, MyApexPlugin, marks=RunIf(amp_apex=True)), ], ) -def test_amp_apex_ddp(mocked_device_count, mocked_is_available, strategy, devices, amp, custom_plugin, plugin_cls): +def test_amp_apex_ddp(cuda_count_2, strategy, devices, amp, custom_plugin, plugin_cls): plugin = None if custom_plugin: plugin = plugin_cls(16, "cpu") if amp == "native" else plugin_cls() @@ -279,16 +278,15 @@ def test_precision_selection_raises(monkeypatch): with pytest.raises(MisconfigurationException, match=r"amp_type='apex', precision='bf16'\)` but it's not supported"): Trainer(amp_backend="apex", precision="bf16") - with mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1), pytest.raises( - MisconfigurationException, match="Sharded plugins are not supported with apex" - ): - with mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True): + mock_cuda_count(monkeypatch, 1) + with pytest.raises(MisconfigurationException, match="Sharded plugins are not supported with apex"): + with mock.patch("lightning_lite.accelerators.cuda.is_cuda_available", return_value=True): Trainer(amp_backend="apex", precision=16, accelerator="gpu", devices=1, strategy="ddp_fully_sharded") import pytorch_lightning.plugins.precision.apex_amp as apex monkeypatch.setattr(apex, "_APEX_AVAILABLE", False) - with mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1), mock.patch( - "lightning_lite.utilities.device_parser.is_cuda_available", return_value=True - ), pytest.raises(MisconfigurationException, match="asked for Apex AMP but `apex` is not installed"): + with mock.patch("lightning_lite.accelerators.cuda.is_cuda_available", return_value=True), pytest.raises( + MisconfigurationException, match="asked for Apex AMP but `apex` is not installed" + ): Trainer(amp_backend="apex", precision=16, accelerator="gpu", devices=1) diff --git a/tests/tests_pytorch/plugins/test_cluster_integration.py b/tests/tests_pytorch/plugins/test_cluster_integration.py index 1cef8c0dc3ec3..f8005f2d8a80e 100644 --- a/tests/tests_pytorch/plugins/test_cluster_integration.py +++ b/tests/tests_pytorch/plugins/test_cluster_integration.py @@ -85,10 +85,8 @@ def 
test_ranks_available_manual_strategy_selection(mock_gpu_acc_available, strat dict(strategy="ddp_spawn", accelerator="gpu", devices=[1, 2]), ], ) -@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=4) @mock.patch("lightning_lite.utilities.device_parser._get_all_available_mps_gpus", return_value=list(range(4))) -def test_ranks_available_automatic_strategy_selection(_, __, ___, trainer_kwargs): +def test_ranks_available_automatic_strategy_selection(_, cuda_count_4, trainer_kwargs): """Test that the rank information is readily available after Trainer initialization.""" num_nodes = 2 trainer_kwargs.update(num_nodes=num_nodes) diff --git a/tests/tests_pytorch/strategies/test_bagua_strategy.py b/tests/tests_pytorch/strategies/test_bagua_strategy.py index 3e9aba79dd5ea..9c36552789615 100644 --- a/tests/tests_pytorch/strategies/test_bagua_strategy.py +++ b/tests/tests_pytorch/strategies/test_bagua_strategy.py @@ -114,10 +114,9 @@ def test_qadam_configuration(tmpdir): trainer.strategy._configure_bagua_model(trainer) -def test_bagua_not_available(monkeypatch): +def test_bagua_not_available(cuda_count_1, monkeypatch): import pytorch_lightning.strategies.bagua as imports monkeypatch.setattr(imports, "_BAGUA_AVAILABLE", False) - with mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1): - with pytest.raises(MisconfigurationException, match="you must have `Bagua` installed"): - Trainer(strategy="bagua", accelerator="gpu", devices=1) + with pytest.raises(MisconfigurationException, match="you must have `Bagua` installed"): + Trainer(strategy="bagua", accelerator="gpu", devices=1) diff --git a/tests/tests_pytorch/strategies/test_ddp.py b/tests/tests_pytorch/strategies/test_ddp.py index 6b85e1564aab3..7fcb18791ba9d 100644 --- a/tests/tests_pytorch/strategies/test_ddp.py +++ b/tests/tests_pytorch/strategies/test_ddp.py @@ -57,9 +57,8 @@ def test_multi_gpu_model_ddp_fit_test(tmpdir): @RunIf(skip_windows=True) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) @mock.patch("lightning_lite.utilities.device_parser._get_all_available_mps_gpus", return_value=list(range(2))) -def test_torch_distributed_backend_invalid(_, __, tmpdir): +def test_torch_distributed_backend_invalid(_, cuda_count_2, tmpdir): """This test set `undefined` as torch backend and should raise an `Backend.UNDEFINED` ValueError.""" model = BoringModel() trainer = Trainer( @@ -76,12 +75,7 @@ def test_torch_distributed_backend_invalid(_, __, tmpdir): @RunIf(skip_windows=True) @mock.patch("torch.cuda.set_device") -@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1) -@mock.patch("pytorch_lightning.accelerators.gpu.CUDAAccelerator.is_available", return_value=True) -def test_ddp_torch_dist_is_available_in_setup( - mock_gpu_is_available, mock_device_count, mock_cuda_available, mock_set_device, tmpdir -): +def test_ddp_torch_dist_is_available_in_setup(mock_set_device, cuda_count_1, tmpdir): """Test to ensure torch distributed is available within the setup hook using ddp.""" class TestModel(BoringModel): diff --git a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py index bb3b63ea578c6..e7b12bd7c7e6b 100644 --- 
a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py +++ b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py @@ -29,10 +29,8 @@ def test_invalid_on_cpu(tmpdir): @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"}) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1) -@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) @RunIf(fairscale=True) -def test_fsdp_with_sharded_amp(device_count_mock, mock_cuda_available, tmpdir): +def test_fsdp_with_sharded_amp(cuda_count_1, tmpdir): """Test to ensure that plugin native amp plugin is correctly chosen when using sharded.""" trainer = Trainer( default_root_dir=tmpdir, fast_dev_run=True, strategy="fsdp", accelerator="gpu", devices=1, precision=16 diff --git a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py index 70af274e2f788..2e42803d17fa2 100644 --- a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py +++ b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py @@ -169,12 +169,11 @@ def test_deepspeed_strategy_env(tmpdir, monkeypatch, deepspeed_config): @RunIf(deepspeed=True) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1) @pytest.mark.parametrize( "amp_backend", ["native", pytest.param("apex", marks=RunIf(amp_apex=True))], ) -def test_deepspeed_precision_choice(_, amp_backend, tmpdir): +def test_deepspeed_precision_choice(cuda_count_1, amp_backend, tmpdir): """Test to ensure precision plugin is also correctly chosen. DeepSpeed handles precision via Custom DeepSpeedPrecisionPlugin diff --git a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py index 626b5bbcf95bf..b2a784525520e 100644 --- a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py @@ -99,8 +99,7 @@ def _test_strategy_choice_ddp_and_cpu(tmpdir, ddp_strategy_class): "SLURM_LOCALID": "0", }, ) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=0) -def test_custom_cluster_environment_in_slurm_environment(_, tmpdir): +def test_custom_cluster_environment_in_slurm_environment(cuda_count_0, tmpdir): """Test that we choose the custom cluster even when SLURM or TE flags are around.""" class CustomCluster(LightningEnvironment): @@ -136,9 +135,8 @@ def creates_processes_externally(self) -> bool: "SLURM_LOCALID": "0", }, ) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=0) @mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True) -def test_custom_accelerator(*_): +def test_custom_accelerator(cuda_count_0): class Accel(Accelerator): def setup_device(self, device: torch.device) -> None: pass @@ -204,18 +202,16 @@ class Strat(DDPStrategy): "SLURM_LOCALID": "0", }, ) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=0) @mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True) -def test_dist_backend_accelerator_mapping(*_): +def test_dist_backend_accelerator_mapping(cuda_count_0): trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", accelerator="cpu", devices=2) assert isinstance(trainer.accelerator, CPUAccelerator) assert isinstance(trainer.strategy, DDPStrategy) assert trainer.strategy.local_rank == 0 
-@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) @mock.patch("lightning_lite.utilities.device_parser._get_all_available_mps_gpus", return_value=[0, 1]) -def test_ipython_incompatible_backend_error(_, __, monkeypatch): +def test_ipython_incompatible_backend_error(_, cuda_count_2, monkeypatch): monkeypatch.setattr(pytorch_lightning.utilities, "_IS_INTERACTIVE", True) with pytest.raises(MisconfigurationException, match=r"strategy='ddp'\)`.*is not compatible"): Trainer(strategy="ddp", accelerator="gpu", devices=2) @@ -231,8 +227,7 @@ def test_ipython_incompatible_backend_error(_, __, monkeypatch): Trainer(strategy="dp") -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) -def test_ipython_compatible_dp_strategy_gpu(_, monkeypatch): +def test_ipython_compatible_dp_strategy_gpu(cuda_count_2, monkeypatch): monkeypatch.setattr(pytorch_lightning.utilities, "_IS_INTERACTIVE", True) trainer = Trainer(strategy="dp", accelerator="gpu") assert trainer.strategy.launcher is None @@ -265,17 +260,12 @@ def test_ipython_compatible_strategy_ddp_fork(monkeypatch): ], ) @pytest.mark.parametrize("devices", [1, 2]) -@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) -def test_accelerator_choice_multi_node_gpu( - mock_is_available, mock_device_count, tmpdir, strategy, strategy_class, devices -): +def test_accelerator_choice_multi_node_gpu(cuda_count_2, tmpdir, strategy, strategy_class, devices): trainer = Trainer(default_root_dir=tmpdir, num_nodes=2, accelerator="gpu", strategy=strategy, devices=devices) assert isinstance(trainer.strategy, strategy_class) -@mock.patch("pytorch_lightning.accelerators.cuda.device_parser.num_cuda_devices", return_value=0) -def test_accelerator_cpu(_): +def test_accelerator_cpu(cuda_count_0): trainer = Trainer(accelerator="cpu") assert isinstance(trainer.accelerator, CPUAccelerator) @@ -296,10 +286,8 @@ def test_accelerator_cpu(_): Trainer(accelerator="cpu", gpus=1) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) -@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) @pytest.mark.parametrize("device_count", (["0"], [0, "1"], ["GPU"], [["0", "1"], [0, 1]], [False])) -def test_accelererator_invalid_type_devices(mock_is_available, mock_device_count, device_count): +def test_accelererator_invalid_type_devices(cuda_count_2, device_count): with pytest.raises( MisconfigurationException, match=r"must be an int, a string, a sequence of ints or None, but you" ): @@ -460,31 +448,26 @@ def test_strategy_choice_ddp_fork_cpu(): assert trainer.strategy.launcher._start_method == "fork" -@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) -@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) -def test_strategy_choice_ddp(*_): - trainer = Trainer(fast_dev_run=True, strategy="ddp", accelerator="cuda", devices=1) +@pytest.mark.parametrize("strategy,expected_cls", [("ddp", DDPStrategy), ("ddp_spawn", DDPSpawnStrategy)]) +def test_strategy_choice_ddp_cuda(strategy, expected_cls, mps_count_0, cuda_count_2): + trainer = Trainer(fast_dev_run=True, strategy=strategy, accelerator="gpu", devices=1) assert isinstance(trainer.accelerator, CUDAAccelerator) - assert isinstance(trainer.strategy, DDPStrategy) 
+ assert isinstance(trainer.strategy, expected_cls) assert isinstance(trainer.strategy.cluster_environment, LightningEnvironment) -@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) -@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) -def test_strategy_choice_ddp_spawn(*_): - trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", accelerator="cuda", devices=1) - assert isinstance(trainer.accelerator, CUDAAccelerator) - assert isinstance(trainer.strategy, DDPSpawnStrategy) +@RunIf(mps=True) +@pytest.mark.parametrize("strategy,expected_cls", [("ddp", DDPStrategy), ("ddp_spawn", DDPSpawnStrategy)]) +def test_strategy_choice_ddp_mps(strategy, expected_cls, mps_count_1, cuda_count_0): + trainer = Trainer(fast_dev_run=True, strategy=strategy, accelerator="gpu", devices=1) + assert isinstance(trainer.accelerator, MPSAccelerator) + assert isinstance(trainer.strategy, expected_cls) assert isinstance(trainer.strategy.cluster_environment, LightningEnvironment) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) -@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) @pytest.mark.parametrize("job_name,expected_env", [("some_name", SLURMEnvironment), ("bash", LightningEnvironment)]) @pytest.mark.parametrize("strategy", ["ddp", DDPStrategy]) -def test_strategy_choice_ddp_slurm(_, __, strategy, job_name, expected_env): +def test_strategy_choice_ddp_slurm(cuda_count_2, strategy, job_name, expected_env): if not isinstance(strategy, str): strategy = strategy() @@ -517,10 +500,10 @@ def test_strategy_choice_ddp_slurm(_, __, strategy, job_name, expected_env): "TORCHELASTIC_RUN_ID": "1", }, ) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) -@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) -def test_strategy_choice_ddp_te(*_): - trainer = Trainer(fast_dev_run=True, strategy="ddp", accelerator="cuda", devices=2) +@mock.patch("torch.cuda.set_device") +@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True) +def test_strategy_choice_ddp_te(_, __, mps_count_0, cuda_count_2): + trainer = Trainer(fast_dev_run=True, strategy="ddp", accelerator="gpu", devices=2) assert isinstance(trainer.accelerator, CUDAAccelerator) assert isinstance(trainer.strategy, DDPStrategy) assert isinstance(trainer.strategy.cluster_environment, TorchElasticEnvironment) @@ -539,9 +522,8 @@ def test_strategy_choice_ddp_te(*_): "TORCHELASTIC_RUN_ID": "1", }, ) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=0) @mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True) -def test_strategy_choice_ddp_cpu_te(*_): +def test_strategy_choice_ddp_cpu_te(cuda_count_0): trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", accelerator="cpu", devices=2) assert isinstance(trainer.accelerator, CPUAccelerator) assert isinstance(trainer.strategy, DDPStrategy) @@ -561,10 +543,10 @@ def test_strategy_choice_ddp_cpu_te(*_): "RANK": "1", }, ) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1) -@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) -def test_strategy_choice_ddp_kubeflow(*_): - trainer = Trainer(fast_dev_run=True, strategy="ddp", accelerator="cuda", devices=1) 
+@mock.patch("torch.cuda.set_device") +@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True) +def test_strategy_choice_ddp_kubeflow(_, __, mps_count_0, cuda_count_1): + trainer = Trainer(fast_dev_run=True, strategy="ddp", accelerator="gpu", devices=1) assert isinstance(trainer.accelerator, CUDAAccelerator) assert isinstance(trainer.strategy, DDPStrategy) assert isinstance(trainer.strategy.cluster_environment, KubeflowEnvironment) @@ -582,9 +564,8 @@ def test_strategy_choice_ddp_kubeflow(*_): "RANK": "1", }, ) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=0) @mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True) -def test_strategy_choice_ddp_cpu_kubeflow(*_): +def test_strategy_choice_ddp_cpu_kubeflow(cuda_count_0): trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", accelerator="cpu", devices=2) assert isinstance(trainer.accelerator, CPUAccelerator) assert isinstance(trainer.strategy, DDPStrategy) @@ -604,10 +585,9 @@ def test_strategy_choice_ddp_cpu_kubeflow(*_): "SLURM_LOCALID": "0", }, ) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=0) @mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True) @pytest.mark.parametrize("strategy", ["ddp", DDPStrategy()]) -def test_strategy_choice_ddp_cpu_slurm(device_count_mock, setup_distributed_mock, strategy): +def test_strategy_choice_ddp_cpu_slurm(cuda_count_0, strategy): trainer = Trainer(fast_dev_run=True, strategy=strategy, accelerator="cpu", devices=2) assert isinstance(trainer.accelerator, CPUAccelerator) assert isinstance(trainer.strategy, DDPStrategy) @@ -654,19 +634,16 @@ def test_unsupported_ipu_choice(mock_ipu_acc_avail, monkeypatch): Trainer(accelerator="ipu", precision=64) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=0) @mock.patch("pytorch_lightning.utilities.imports._TPU_AVAILABLE", return_value=False) @mock.patch("pytorch_lightning.utilities.imports._IPU_AVAILABLE", return_value=False) @mock.patch("pytorch_lightning.utilities.imports._HPU_AVAILABLE", return_value=False) -def test_devices_auto_choice_cpu(*_): +def test_devices_auto_choice_cpu(cuda_count_0, *_): trainer = Trainer(accelerator="auto", devices="auto") assert trainer.num_devices == 1 -@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) @RunIf(mps=False) -def test_devices_auto_choice_gpu(is_gpu_available_mock, device_count_mock): +def test_devices_auto_choice_gpu(cuda_count_2): trainer = Trainer(accelerator="auto", devices="auto") assert isinstance(trainer.accelerator, CUDAAccelerator) assert trainer.num_devices == 2 @@ -771,22 +748,20 @@ def test_passing_zero_and_empty_list_to_devices_flag(accelerator, devices): ], ) def test_gpu_accelerator_backend_choice(expected_accelerator_flag, expected_accelerator_class): - trainer = Trainer(accelerator="gpu") assert trainer._accelerator_connector._accelerator_flag == expected_accelerator_flag assert isinstance(trainer.accelerator, expected_accelerator_class) @RunIf(mps=False) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1) -def test_gpu_accelerator_backend_choice_cuda(_): +def test_gpu_accelerator_backend_choice_cuda(cuda_count_1): trainer = Trainer(accelerator="gpu") assert trainer._accelerator_connector._accelerator_flag == "cuda" assert 
isinstance(trainer.accelerator, CUDAAccelerator) @mock.patch("lightning_lite.accelerators.mps.MPSAccelerator.is_available", return_value=True) -@mock.patch("lightning_lite.utilities.device_parser._get_all_available_mps_gpus", return_value=[0]) +@mock.patch("lightning_lite.accelerators.mps._get_all_available_mps_gpus", return_value=[0]) @mock.patch("torch.device", return_value="mps") # necessary because torch doesn't allow creation of mps devices def test_gpu_accelerator_backend_choice_mps(*_): trainer = Trainer(accelerator="gpu") diff --git a/tests/tests_pytorch/trainer/flags/test_env_vars.py b/tests/tests_pytorch/trainer/flags/test_env_vars.py index a6415d5e907d2..606fdc89467ce 100644 --- a/tests/tests_pytorch/trainer/flags/test_env_vars.py +++ b/tests/tests_pytorch/trainer/flags/test_env_vars.py @@ -49,9 +49,7 @@ def test_passing_env_variables_defaults(): @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1", "PL_TRAINER_DEVICES": "2"}) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) -@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) -def test_passing_env_variables_devices(cuda_available_mock, device_count_mock): +def test_passing_env_variables_devices(cuda_count_2): """Testing overwriting trainer arguments.""" trainer = Trainer() assert trainer.num_devices == 2 diff --git a/tests/tests_pytorch/trainer/properties/test_auto_gpu_select.py b/tests/tests_pytorch/trainer/properties/test_auto_gpu_select.py index 37f54bb84b44f..3c7c11d4ee548 100644 --- a/tests/tests_pytorch/trainer/properties/test_auto_gpu_select.py +++ b/tests/tests_pytorch/trainer/properties/test_auto_gpu_select.py @@ -42,17 +42,14 @@ def test_pick_multiple_gpus(nb, expected_gpu_idxs, expected_error): assert expected_gpu_idxs == pick_multiple_gpus(nb) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1) -def test_pick_multiple_gpus_more_than_available(*_): +def test_pick_multiple_gpus_more_than_available(cuda_count_1): with pytest.raises(MisconfigurationException, match="You requested 3 GPUs but your machine only has 1 GPUs"): pick_multiple_gpus(3) @RunIf(mps=False) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) @mock.patch("pytorch_lightning.trainer.connectors.accelerator_connector.pick_multiple_gpus", return_value=[1]) -def test_auto_select_gpus(*_): - +def test_auto_select_gpus(_, cuda_count_2): trainer = Trainer(auto_select_gpus=True, accelerator="gpu", devices=1) assert trainer.num_devices == 1 assert trainer.device_ids == [1] diff --git a/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py b/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py index 9089a4f76f7a6..2dec57277e2a4 100644 --- a/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py +++ b/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py @@ -20,12 +20,12 @@ import torch from torch.utils.data import DataLoader -from lightning_lite.utilities import device_parser from pytorch_lightning import Trainer from pytorch_lightning.callbacks.gradient_accumulation_scheduler import GradientAccumulationScheduler from pytorch_lightning.demos.boring_classes import BoringModel, RandomIterableDataset from pytorch_lightning.strategies.ipu import IPUStrategy from pytorch_lightning.utilities.exceptions import MisconfigurationException +from tests_pytorch.conftest import mock_cuda_count from tests_pytorch.helpers.runif import RunIf @@ 
-127,8 +127,7 @@ def test_num_stepping_batches_accumulate_gradients(accumulate_grad_batches, expe def test_num_stepping_batches_gpu(trainer_kwargs, estimated_steps, monkeypatch): """Test stepping batches with GPU strategies.""" num_devices_per_node = 7 - monkeypatch.setattr(device_parser, "is_cuda_available", lambda: True) - monkeypatch.setattr(device_parser, "num_cuda_devices", lambda: num_devices_per_node) + mock_cuda_count(monkeypatch, num_devices_per_node) trainer = Trainer(max_epochs=1, devices=num_devices_per_node, accelerator="gpu", **trainer_kwargs) # set the `parallel_devices` to cpu to run the test on CPU and take `num_nodes`` into consideration diff --git a/tests/tests_pytorch/trainer/test_config_validator.py b/tests/tests_pytorch/trainer/test_config_validator.py index c4d34315364c5..a954f90402e84 100644 --- a/tests/tests_pytorch/trainer/test_config_validator.py +++ b/tests/tests_pytorch/trainer/test_config_validator.py @@ -15,11 +15,11 @@ import torch import pytorch_lightning as pl -from lightning_lite.utilities import device_parser from lightning_lite.utilities.warnings import PossibleUserWarning from pytorch_lightning import LightningDataModule, LightningModule, Trainer from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset from pytorch_lightning.utilities.exceptions import MisconfigurationException +from tests_pytorch.conftest import mock_cuda_count def test_wrong_train_setting(tmpdir): @@ -121,7 +121,7 @@ def predict_dataloader(self): assert results[0][0].shape == torch.Size([1, 2]) -def test_trainer_manual_optimization_config(tmpdir): +def test_trainer_manual_optimization_config(): """Test error message when requesting Trainer features unsupported with manual optimization.""" model = BoringModel() model.automatic_optimization = False @@ -141,8 +141,7 @@ def test_raise_exception_with_batch_transfer_hooks(monkeypatch, hook, trainer_kw """Test that an exception is raised when overriding batch_transfer_hooks.""" if trainer_kwargs.get("accelerator") == "gpu": match_pattern = rf"Overriding `{hook}` is not .* in DP mode." 
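Editor's note: the refactored tests in this patch rely on shared `cuda_count_*` / `mps_count_*` fixtures and a `mock_cuda_count(monkeypatch, n)` helper imported from `tests_pytorch.conftest`, whose definitions are not part of this excerpt. A minimal sketch of what such helpers might look like, assuming they patch the CUDA helpers in `lightning_lite.accelerators.cuda` (the exact patch target and fixture set are assumptions inferred from how the tests use them):

    # Hypothetical sketch of the tests_pytorch/conftest.py helpers assumed by
    # these tests; the real definitions are not shown in this patch.
    import pytest

    import lightning_lite.accelerators.cuda as cuda_accelerator


    def mock_cuda_count(monkeypatch, n: int) -> None:
        # Pretend `n` CUDA devices exist so GPU code paths run without real GPUs.
        monkeypatch.setattr(cuda_accelerator, "num_cuda_devices", lambda: n)
        monkeypatch.setattr(cuda_accelerator, "is_cuda_available", lambda: n > 0)


    @pytest.fixture
    def cuda_count_0(monkeypatch):
        mock_cuda_count(monkeypatch, 0)


    @pytest.fixture
    def cuda_count_2(monkeypatch):
        mock_cuda_count(monkeypatch, 2)

A test then opts in simply by naming the fixture as a parameter (for example `def test_trainer_omegaconf(cuda_count_1, trainer_params):`), which replaces the stacked `mock.patch("lightning_lite.utilities.device_parser....")` decorators removed throughout this patch.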
- monkeypatch.setattr(device_parser, "is_cuda_available", lambda: True) - monkeypatch.setattr(device_parser, "num_cuda_devices", lambda: 2) + mock_cuda_count(monkeypatch, 2) elif trainer_kwargs.get("accelerator") == "ipu": match_pattern = rf"Overriding `{hook}` is not .* with IPUs" monkeypatch.setattr(pl.accelerators.ipu.IPUAccelerator, "is_available", lambda _: True) diff --git a/tests/tests_pytorch/trainer/test_supporters.py b/tests/tests_pytorch/trainer/test_supporters.py index 9970eb20da374..fa043bb126338 100644 --- a/tests/tests_pytorch/trainer/test_supporters.py +++ b/tests/tests_pytorch/trainer/test_supporters.py @@ -314,12 +314,10 @@ def test_nested_calc_num_data(input_data, compute_func, expected_length): @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=2) -@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) @mock.patch("lightning_lite.utilities.device_parser._get_all_available_mps_gpus", return_value=[0, 1]) @pytest.mark.parametrize("use_fault_tolerant", [False, True]) @pytest.mark.parametrize("replace_sampler_ddp", [False, True]) -def test_combined_data_loader_validation_test(_, __, ___, use_fault_tolerant, replace_sampler_ddp, tmpdir): +def test_combined_data_loader_validation_test(_, cuda_count_2, use_fault_tolerant, replace_sampler_ddp): """This test makes sure distributed sampler has been properly injected in dataloaders when using CombinedLoader.""" diff --git a/tests/tests_pytorch/trainer/test_trainer.py b/tests/tests_pytorch/trainer/test_trainer.py index 0d3764cdcc5d2..5a957ae8b7827 100644 --- a/tests/tests_pytorch/trainer/test_trainer.py +++ b/tests/tests_pytorch/trainer/test_trainer.py @@ -21,7 +21,6 @@ from copy import deepcopy from pathlib import Path from re import escape -from unittest import mock from unittest.mock import ANY, call, patch import cloudpickle @@ -33,9 +32,9 @@ from torch.optim import SGD from torch.utils.data import DataLoader, IterableDataset +import lightning_lite import pytorch_lightning import tests_pytorch.helpers.utils as tutils -from lightning_lite.utilities import device_parser from lightning_lite.utilities.cloud_io import load as pl_load from lightning_lite.utilities.seed import seed_everything from pytorch_lightning import Callback, LightningDataModule, LightningModule, Trainer @@ -64,6 +63,7 @@ from pytorch_lightning.trainer.states import RunningStage, TrainerFn from pytorch_lightning.utilities.exceptions import DeadlockDetectedException, MisconfigurationException from pytorch_lightning.utilities.imports import _OMEGACONF_AVAILABLE, _TORCH_GREATER_EQUAL_1_12 +from tests_pytorch.conftest import mock_cuda_count from tests_pytorch.helpers.datamodules import ClassifDataModule from tests_pytorch.helpers.runif import RunIf from tests_pytorch.helpers.simple_models import ClassificationModel @@ -1258,9 +1258,7 @@ def __init__(self, **kwargs): "trainer_params", [{"max_epochs": 1, "accelerator": "gpu", "devices": 1}, {"max_epochs": 1, "accelerator": "gpu", "devices": [0]}], ) -@mock.patch("lightning_lite.utilities.device_parser.is_cuda_available", return_value=True) -@mock.patch("lightning_lite.utilities.device_parser.num_cuda_devices", return_value=1) -def test_trainer_omegaconf(_, __, trainer_params): +def test_trainer_omegaconf(cuda_count_1, trainer_params): config = OmegaConf.create(trainer_params) Trainer(**config) @@ -2107,8 +2105,7 @@ def training_step(self, batch, batch_idx): ) def 
test_trainer_config_strategy(monkeypatch, trainer_kwargs, strategy_cls, strategy_name, accelerator_cls, devices): if trainer_kwargs.get("accelerator") == "cuda": - monkeypatch.setattr(device_parser, "is_cuda_available", lambda: True) - monkeypatch.setattr(device_parser, "num_cuda_devices", lambda: trainer_kwargs["devices"]) + mock_cuda_count(monkeypatch, trainer_kwargs["devices"]) trainer = Trainer(**trainer_kwargs) @@ -2176,10 +2173,9 @@ def test_dataloaders_are_not_loaded_if_disabled_through_limit_batches(running_st ) def test_trainer_config_device_ids(monkeypatch, trainer_kwargs, expected_device_ids): if trainer_kwargs.get("accelerator") in ("cuda", "gpu"): - monkeypatch.setattr(device_parser, "is_cuda_available", lambda: True) - monkeypatch.setattr(device_parser, "num_cuda_devices", lambda: 4) + mock_cuda_count(monkeypatch, 4) elif trainer_kwargs.get("accelerator") in ("mps", "gpu"): - monkeypatch.setattr(device_parser, "_get_all_available_mps_gpus", lambda: [0]) + monkeypatch.setattr(lightning_lite.utilities.device_parser, "_get_all_available_mps_gpus", lambda: [0]) monkeypatch.setattr(MPSAccelerator, "is_available", lambda *_: True) elif trainer_kwargs.get("accelerator") == "ipu": monkeypatch.setattr(pytorch_lightning.accelerators.ipu.IPUAccelerator, "is_available", lambda _: True) diff --git a/tests/tests_pytorch/trainer/test_trainer_cli.py b/tests/tests_pytorch/trainer/test_trainer_cli.py index 6613f0b1bcf38..c4b39977a35b0 100644 --- a/tests/tests_pytorch/trainer/test_trainer_cli.py +++ b/tests/tests_pytorch/trainer/test_trainer_cli.py @@ -19,7 +19,6 @@ import pytest import tests_pytorch.helpers.utils as tutils -from lightning_lite.utilities import device_parser from pytorch_lightning import Trainer from pytorch_lightning.utilities import argparse @@ -164,12 +163,8 @@ def test_argparse_args_parsing_fast_dev_run(cli_args, expected): ["cli_args", "expected_parsed"], [("", None), ("--accelerator gpu --devices 1", "1"), ("--accelerator gpu --devices 0,", "0,")], ) -def test_argparse_args_parsing_devices(cli_args, expected_parsed, monkeypatch): +def test_argparse_args_parsing_devices(cli_args, expected_parsed, cuda_count_1): """Test multi type argument with bool.""" - - monkeypatch.setattr(device_parser, "is_cuda_available", lambda: True) - monkeypatch.setattr(device_parser, "num_cuda_devices", lambda: 1) - cli_args = cli_args.split(" ") if cli_args else [] with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): parser = ArgumentParser(add_help=False) diff --git a/tests/tests_pytorch/utilities/test_device_parser.py b/tests/tests_pytorch/utilities/test_device_parser.py deleted file mode 100644 index a4a84892a6e8d..0000000000000 --- a/tests/tests_pytorch/utilities/test_device_parser.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from unittest import mock - -import pytest -import torch - -from lightning_lite.utilities import device_parser - - -@pytest.mark.skipif( - "fork" in torch.multiprocessing.get_all_start_methods(), reason="Requires platform without forking support" -) -@mock.patch("torch.cuda.is_available", return_value=True) -@mock.patch("torch.cuda.device_count", return_value=2) -def test_num_cuda_devices_without_forking(*_): - """This merely tests that on platforms without fork support our helper functions fall back to the default - implementation for determining cuda availability.""" - assert device_parser.is_cuda_available() - assert device_parser.num_cuda_devices() == 2 From a5b0f8bd5cd28fbd79fdafa5d9380b00258d7a76 Mon Sep 17 00:00:00 2001 From: Gilad <88031955+gilfree@users.noreply.github.com> Date: Mon, 19 Sep 2022 02:07:19 +0300 Subject: [PATCH 191/193] Fix TQDMProgressBar usage in logging.rst (#14768) --- docs/source-pytorch/extensions/logging.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source-pytorch/extensions/logging.rst b/docs/source-pytorch/extensions/logging.rst index 109445779f991..0b18293be8ad1 100644 --- a/docs/source-pytorch/extensions/logging.rst +++ b/docs/source-pytorch/extensions/logging.rst @@ -324,10 +324,10 @@ if you are using a logger. These defaults can be customized by overriding the .. code-block:: python - from pytorch_lightning.callbacks.progress import Tqdm + from pytorch_lightning.callbacks.progress import TQDMProgressBar - class CustomProgressBar(Tqdm): + class CustomProgressBar(TQDMProgressBar): def get_metrics(self, *args, **kwargs): # don't show the version number items = super().get_metrics() From 8c4e17f3599038a7b211b6a58e993869ec10c4a3 Mon Sep 17 00:00:00 2001 From: Laverne Henderson Date: Mon, 19 Sep 2022 06:25:57 -0700 Subject: [PATCH 192/193] Removes the old HPO content (#14754) * Removes the old HPO content * Remove source-lit symlinks for HPO * drop ref Co-authored-by: Jirka --- .../examples/hpo/build_from_scratch.rst | 41 -------- docs/source-app/examples/hpo/hpo.py | 54 ---------- docs/source-app/examples/hpo/hpo.rst | 80 --------------- docs/source-app/examples/hpo/hpo_wi.rst | 57 ----------- docs/source-app/examples/hpo/hpo_wo.rst | 57 ----------- .../source-app/examples/hpo/lightning_hpo.rst | 99 ------------------- .../examples/hpo/lightning_hpo_target.py | 53 ---------- docs/source-app/examples/hpo/objective.py | 21 ---- .../examples/hpo/optuna_reference.py | 36 ------- .../get_started/training_with_apps.rst | 7 -- .../get_started/what_app_can_do.rst | 16 --- docs/source-app/index.rst | 1 - .../examples/hpo/build_from_scratch.rst | 1 - docs/source-lit/examples/hpo/hpo.py | 1 - docs/source-lit/examples/hpo/hpo.rst | 1 - docs/source-lit/examples/hpo/hpo_wi.rst | 1 - docs/source-lit/examples/hpo/hpo_wo.rst | 1 - .../source-lit/examples/hpo/lightning_hpo.rst | 1 - .../examples/hpo/lightning_hpo_target.py | 1 - docs/source-lit/examples/hpo/objective.py | 1 - .../examples/hpo/optuna_reference.py | 1 - docs/source-lit/index.rst | 1 - 22 files changed, 532 deletions(-) delete mode 100644 docs/source-app/examples/hpo/build_from_scratch.rst delete mode 100644 docs/source-app/examples/hpo/hpo.py delete mode 100644 docs/source-app/examples/hpo/hpo.rst delete mode 100644 docs/source-app/examples/hpo/hpo_wi.rst delete mode 100644 docs/source-app/examples/hpo/hpo_wo.rst delete mode 100644 docs/source-app/examples/hpo/lightning_hpo.rst delete mode 100644 docs/source-app/examples/hpo/lightning_hpo_target.py delete mode 100644 
docs/source-app/examples/hpo/objective.py delete mode 100644 docs/source-app/examples/hpo/optuna_reference.py delete mode 120000 docs/source-lit/examples/hpo/build_from_scratch.rst delete mode 120000 docs/source-lit/examples/hpo/hpo.py delete mode 120000 docs/source-lit/examples/hpo/hpo.rst delete mode 120000 docs/source-lit/examples/hpo/hpo_wi.rst delete mode 120000 docs/source-lit/examples/hpo/hpo_wo.rst delete mode 120000 docs/source-lit/examples/hpo/lightning_hpo.rst delete mode 120000 docs/source-lit/examples/hpo/lightning_hpo_target.py delete mode 120000 docs/source-lit/examples/hpo/objective.py delete mode 120000 docs/source-lit/examples/hpo/optuna_reference.py diff --git a/docs/source-app/examples/hpo/build_from_scratch.rst b/docs/source-app/examples/hpo/build_from_scratch.rst deleted file mode 100644 index cade8b7f6edc1..0000000000000 --- a/docs/source-app/examples/hpo/build_from_scratch.rst +++ /dev/null @@ -1,41 +0,0 @@ -:orphan: - -####################################### -Implement an HPO component from scratch -####################################### - -**Audience:** Users who want to understand how to implement sweep training from scratch. - -**Prereqs:** Finish Intermediate Level. - ----- - -******** -Examples -******** - -.. raw:: html - -
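Editor's note on the logging.rst fix in the previous patch: the corrected snippet subclasses `TQDMProgressBar`, the progress-bar callback that defines `get_metrics`, instead of the lower-level `Tqdm` class. The diff context stops right after the `super().get_metrics()` call; a complete override presumably finishes by dropping the version number, roughly as follows (the argument pass-through and the `pop`/`return` tail are assumptions, not part of the shown diff):

    from pytorch_lightning.callbacks.progress import TQDMProgressBar


    class CustomProgressBar(TQDMProgressBar):
        def get_metrics(self, *args, **kwargs):
            # don't show the version number in the progress-bar metrics
            items = super().get_metrics(*args, **kwargs)
            items.pop("v_num", None)
            return items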
-
- -.. displayitem:: - :header: Step 1: Implement an HPO component with the Lightning Works. - :description: Learn how it works under the hood - :col_css: col-md-4 - :button_link: hpo_wo.html - :height: 180 - :tag: Intermediate - -.. displayitem:: - :header: Step 2: Add the flow to your HPO component - :description: Learn how it works under the hood - :col_css: col-md-4 - :button_link: hpo_wi.html - :height: 180 - :tag: Intermediate - -.. raw:: html - -
-
diff --git a/docs/source-app/examples/hpo/hpo.py b/docs/source-app/examples/hpo/hpo.py deleted file mode 100644 index fd05cc2e327e6..0000000000000 --- a/docs/source-app/examples/hpo/hpo.py +++ /dev/null @@ -1,54 +0,0 @@ -import optuna -from objective import ObjectiveWork -from optuna.distributions import CategoricalDistribution, LogUniformDistribution - -TOTAL_TRIALS = 6 -SIMULTANEOUS_TRIALS = 2 -NUM_TRIALS = SIMULTANEOUS_TRIALS -DONE = False - -STUDY = optuna.create_study() -DISTRIBUTIONS = { - "backbone": CategoricalDistribution(["resnet18", "resnet34"]), - "learning_rate": LogUniformDistribution(0.0001, 0.1), -} -TRIALS = [ObjectiveWork() for _ in range(TOTAL_TRIALS)] - -# Lightning Infinite Loop -while not DONE: - - # Finish the Hyperparameter Optimization - if NUM_TRIALS >= TOTAL_TRIALS: - DONE = True - continue - - has_told_study = [] - - # Iterate over the possible number of trials. - for trial_idx in range(NUM_TRIALS): - - objective_work = TRIALS[trial_idx] - - # If a work has already started, it won't be started again. - if not objective_work.has_started: - # Sample a new trial from the distributions - trial = STUDY.ask(DISTRIBUTIONS) - # Run the work - objective_work.run(trial_id=trial._trial_id, **trial.params) - - # With Lightning, the `objective_work` will run asynchronously - # and the metric will be prodcued after X amount of time. - # The Lightning Infinite Loop would have run a very large number of times by then. - if objective_work.metric and not objective_work.has_told_study: - # Add the metric in the Study - STUDY.tell(objective_work.trial_id, objective_work.metric) - objective_work.has_told_study = True - - # Keep track if the objective work has populated the study. - has_told_study.append(objective_work.has_told_study) - - # Trigger the next trials. - if all(has_told_study): - NUM_TRIALS += SIMULTANEOUS_TRIALS - -print({w.trial_id: w.metric for w in TRIALS}) diff --git a/docs/source-app/examples/hpo/hpo.rst b/docs/source-app/examples/hpo/hpo.rst deleted file mode 100644 index 568e17836194d..0000000000000 --- a/docs/source-app/examples/hpo/hpo.rst +++ /dev/null @@ -1,80 +0,0 @@ -.. hpo: -.. _hpo_example: - - -######################################################### -Develop a Lightning Hyperparameter Optimization (HPO) App -######################################################### - -******************* -A bit of background -******************* - -Traditionally, developing machine learning (ML) products requires choosing among a large space of -hyperparameters while creating and training the ML models. Hyperparameter optimization -(HPO) aims to find a well-performing hyperparameter configuration for a given ML model -on a dataset at hand, including the ML model, -its hyperparameters, and other data processing steps. - -HPOs free the human expert from a tedious and error-prone, manual hyperparameter tuning process. - -As an example, in the famous `scikit-learn `_ library, -hyperparameters are passed as arguments to the constructor of -the estimator classes such as ``C`` kernel for -`Support Vector Classifier `_, etc. - -It is possible and recommended to search the hyperparameter space for the best validation score. - -An HPO search consists of: - -* an objective method -* a defined parameter space -* a method for searching or sampling candidates - -A naive method for sampling candidates is grid search, which exhaustively considers all -hyperparameter combinations from a user-specified grid. 
- -Fortunately, HPO is an active area of research, and many methods have been developed to -optimize the time required to get strong candidates. - -In the following tutorial, you will learn how to use Lightning together with `Optuna `_. - -`Optuna `_ is an open source HPO framework to automate hyperparameter search. -Out-of-the-box, it provides efficient algorithms to search large spaces and prune unpromising trials for faster results. - -First, you will learn about the best practices on how to implement HPO without the Lightning Framework. -Secondly, we will dive into a working HPO application with Lightning, and finally create a neat -`HiPlot UI `_ -for our application. - ----- - -******** -Examples -******** - -.. raw:: html - -
-
- -.. displayitem:: - :header: Re-use an existing HPO component - :description: Learn how to use Lightning HPO with your app. - :col_css: col-md-4 - :button_link: lightning_hpo.html - :height: 180 - :tag: Basic - -.. displayitem:: - :header: Implement an HPO component from scratch - :description: Learn how it works under the hood - :col_css: col-md-4 - :button_link: build_from_scratch.html - :height: 180 - :tag: Intermediate - -.. raw:: html - -
-
diff --git a/docs/source-app/examples/hpo/hpo_wi.rst b/docs/source-app/examples/hpo/hpo_wi.rst deleted file mode 100644 index 17dd971e9e926..0000000000000 --- a/docs/source-app/examples/hpo/hpo_wi.rst +++ /dev/null @@ -1,57 +0,0 @@ -:orphan: - -########################################## -Step 2: Add the flow to your HPO component -########################################## - -**Audience:** Users who want to understand how to implement HPO training from scratch with Lightning. - -**Prereqs:** Level 17+ - ----- - -Thanks to the simplified version, you should have a good grasp on how to implement HPO with Optuna. - -As the :class:`~lightning_app.core.app.LightningApp` handles the Infinite Loop, -it has been removed from within the run method of the HPORootFlow. - -However, the ``run`` method code is the same as the one defined above. - -.. literalinclude:: ../../../examples/app_hpo/app_wo_ui.py - :language: python - -The ``ObjectiveWork`` is sub-classing -the built-in :class:`~lightning_app.components.python.TracerPythonScript` -which enables launching scripts and more. - -.. literalinclude:: ../../../examples/app_hpo/objective.py - :language: python - -Finally, let's add the ``HiPlotFlow`` component to visualize our hyperparameter optimization. - -The metric and sampled parameters are added to the ``self.hi_plot.data`` list, enabling -updates to the dashboard in near-realtime. - -.. literalinclude:: ../../../examples/app_hpo/app_wi_ui.py - :diff: ../../../examples/app_hpo/app_wo_ui.py - -Here is the associated code with the ``HiPlotFlow`` component. - -In the ``render_fn`` method, the state of the ``HiPlotFlow`` is passed. -The ``state.data`` is accessed as it contains the metric and sampled parameters. - -.. literalinclude:: ../../../examples/app_hpo/hyperplot.py - -Run the HPO application with the following command: - -.. code-block:: console - - $ lightning run app examples/app_hpo/app_wi_ui.py - INFO: Your app has started. View it in your browser: http://127.0.0.1:7501/view - {0: ..., 1: ..., ..., 5: ...} - -Here is what the UI looks like when launched: - -.. image:: https://pl-flash-data.s3.amazonaws.com/assets_lightning/hpo_ui_2.gif - :width: 100 % - :alt: Alternative text diff --git a/docs/source-app/examples/hpo/hpo_wo.rst b/docs/source-app/examples/hpo/hpo_wo.rst deleted file mode 100644 index 6a13ff253dbc2..0000000000000 --- a/docs/source-app/examples/hpo/hpo_wo.rst +++ /dev/null @@ -1,57 +0,0 @@ -:orphan: - -########################################################### -Step 1: Implement an HPO component with the Lightning Works -########################################################### - -**Audience:** Users who want to understand how to implement HPO training from scratch. - -**Prereqs:** Level 17+ - ----- - -In the example below, we are emulating the Lightning Infinite Loop. - -We are assuming we have already defined an ``ObjectiveWork`` component which is responsible to run the objective method and track the metric through its state. - -.. literalinclude:: ./hpo.py - :language: python - -We are running ``TOTAL_TRIALS`` trials by series of ``SIMULTANEOUS_TRIALS`` trials. -When starting, ``TOTAL_TRIALS`` ``ObjectiveWork`` are created. - -The entire code runs within an infinite loop as it would within Lightning. - -When iterating through the Works, if the current ``objective_work`` hasn't started, -some new parameters are sampled from the Optuna Study with our custom distributions -and then passed to run method of the ``objective_work``. 
- -The condition ``not objective_work.has_started`` will be ``False`` once ``objective_work.run()`` starts. - -Also, the second condition ``objective_work.has_told_study`` will be ``True`` when the metric -is defined within the state of the Work and has been shared with the study. - -Finally, once the current ``SIMULTANEOUS_TRIALS`` have both registered their -metric to the Optuna Study, simply increment ``NUM_TRIALS`` by ``SIMULTANEOUS_TRIALS`` to launch the next trials. - -Below, you can find the simplified version of the ``ObjectiveWork`` where the metric is randomly sampled using NumPy. - -In a realistic use case, the Work executes some user-defined code. - -.. literalinclude:: ./objective.py - :language: python - -Here are the logs produced when running the application above: - -.. code-block:: console - - $ python docs/source-app/tutorials/hpo/hpo.py - INFO: Your app has started. View it in your browser: http://127.0.0.1:7501/view - # After you have clicked `run` on the UI. - [I 2022-03-01 12:32:50,050] A new study created in memory with name: ... - {0: 13.994859806481264, 1: 59.866743330127825, ..., 5: 94.65919769609225} - -The following animation shows how this application works in the cloud: - -.. image:: https://pl-flash-data.s3.amazonaws.com/assets_lightning/hpo.gif - :alt: Animation showing how to HPO works UI in a distributed manner. diff --git a/docs/source-app/examples/hpo/lightning_hpo.rst b/docs/source-app/examples/hpo/lightning_hpo.rst deleted file mode 100644 index b1e2f11d3987d..0000000000000 --- a/docs/source-app/examples/hpo/lightning_hpo.rst +++ /dev/null @@ -1,99 +0,0 @@ -:orphan: - -################################ -Re-use an existing HPO component -################################ - -**Audience:** Users who want to easily get started with HPO training. - -**Prereqs:** Level 8+ - ----- - -********************* -Install Lightning HPO -********************* - -Lightning HPO provides a Pythonic implementation for Scalable Hyperparameter Tuning -and relies on Optuna for providing state-of-the-art sampling hyper-parameters algorithms and efficient trial pruning strategies. - -Find the `Lightning Sweeper App `_ on `lightning.ai `_ and its associated `Github repo `_. - -.. code-block:: bash - - lightning install app lightning/hpo - -********************* -Lightning HPO Example -********************* - -In this tutorial, we are going to convert `Optuna Efficient Optimization Algorithms `_ into a Lightning App. - -The Optuna example optimizes the value (example: learning-rate) of a ``SGDClassifier`` from ``sklearn`` trained over the `Iris Dataset `_. - -.. literalinclude:: ./optuna_reference.py - :language: python - - -As you can see, several trials were pruned (stopped) before they finished all of the iterations. - -.. code-block:: console - - A new study created in memory with name: no-name-4423c12c-22e1-4eaf-ba60-caf0020403c6 - Trial 0 finished with value: 0.07894736842105265 and parameters: {'alpha': 0.00020629773477269024}. Best is trial 0 with value: 0.07894736842105265. - Trial 1 finished with value: 0.368421052631579 and parameters: {'alpha': 0.0005250149151047217}. Best is trial 0 with value: 0.07894736842105265. - Trial 2 finished with value: 0.052631578947368474 and parameters: {'alpha': 5.9086862655635784e-05}. Best is trial 2 with value: 0.052631578947368474. - Trial 3 finished with value: 0.3421052631578947 and parameters: {'alpha': 0.07177263583415294}. Best is trial 2 with value: 0.052631578947368474. 
- Trial 4 finished with value: 0.23684210526315785 and parameters: {'alpha': 1.7451874636151302e-05}. Best is trial 2 with value: 0.052631578947368474. - Trial 5 pruned. - Trial 6 finished with value: 0.10526315789473684 and parameters: {'alpha': 1.4943994864178649e-05}. Best is trial 2 with value: 0.052631578947368474. - Trial 7 pruned. - Trial 8 pruned. - Trial 9 pruned. - Trial 10 pruned. - Trial 11 pruned. - Trial 12 pruned. - Trial 13 pruned. - Trial 14 pruned. - Trial 15 pruned. - Trial 16 finished with value: 0.07894736842105265 and parameters: {'alpha': 0.006166329613687364}. Best is trial 2 with value: 0.052631578947368474. - Trial 17 pruned. - Trial 18 pruned. - Trial 19 pruned. - -The example above has been re-organized in order to run as Lightning App. - -.. literalinclude:: ./lightning_hpo_target.py - :language: python - -Now, your code can run at scale in the cloud, if needed, and it has a simple neat UI. - -.. figure:: https://pl-flash-data.s3.amazonaws.com/assets_lightning/lightning_hpo_optimizer.png - :alt: Lightning App UI - :width: 100 % - -As you can see, several trials were pruned (stopped) before they finished all of the iterations. Same as when using pure optuna. - -.. code-block:: console - - A new study created in memory with name: no-name-a93d848e-a225-4df3-a9c3-5f86680e295d - Trial 0 finished with value: 0.23684210526315785 and parameters: {'alpha': 0.006779437004523296}. Best is trial 0 with value: 0.23684210526315785. - Trial 1 finished with value: 0.07894736842105265 and parameters: {'alpha': 0.008936151407006062}. Best is trial 1 with value: 0.07894736842105265. - Trial 2 finished with value: 0.052631578947368474 and parameters: {'alpha': 0.0035836511240528008}. Best is trial 2 with value: 0.052631578947368474. - Trial 3 finished with value: 0.052631578947368474 and parameters: {'alpha': 0.0005393218926409795}. Best is trial 2 with value: 0.052631578947368474. - Trial 4 finished with value: 0.1578947368421053 and parameters: {'alpha': 6.572557493358585e-05}. Best is trial 2 with value: 0.052631578947368474. - Trial 5 finished with value: 0.02631578947368418 and parameters: {'alpha': 0.0013953760106345603}. Best is trial 5 with value: 0.02631578947368418. - Trail 6 pruned. - Trail 7 pruned. - Trail 8 pruned. - Trail 9 pruned. - Trial 10 finished with value: 0.07894736842105265 and parameters: {'alpha': 0.00555435554783454}. Best is trial 5 with value: 0.02631578947368418. - Trail 11 pruned. - Trial 12 finished with value: 0.052631578947368474 and parameters: {'alpha': 0.025624276147153992}. Best is trial 5 with value: 0.02631578947368418. - Trial 13 finished with value: 0.07894736842105265 and parameters: {'alpha': 0.014613957457075546}. Best is trial 5 with value: 0.02631578947368418. - Trail 14 pruned. - Trail 15 pruned. - Trail 16 pruned. - Trial 17 finished with value: 0.052631578947368474 and parameters: {'alpha': 0.01028208215647372}. Best is trial 5 with value: 0.02631578947368418. - Trail 18 pruned. - Trail 19 pruned. 
diff --git a/docs/source-app/examples/hpo/lightning_hpo_target.py b/docs/source-app/examples/hpo/lightning_hpo_target.py deleted file mode 100644 index 779f992554412..0000000000000 --- a/docs/source-app/examples/hpo/lightning_hpo_target.py +++ /dev/null @@ -1,53 +0,0 @@ -import optuna -from lightning_hpo import BaseObjective, Optimizer -from optuna.distributions import LogUniformDistribution -from sklearn import datasets -from sklearn.linear_model import SGDClassifier -from sklearn.model_selection import train_test_split - -from lightning import LightningApp, LightningFlow - - -class Objective(BaseObjective): - def run(self, params): - # WARNING: Don't forget to assign `params` to self, - # so they get tracked in the state. - self.params = params - - iris = datasets.load_iris() - classes = list(set(iris.target)) - train_x, valid_x, train_y, valid_y = train_test_split(iris.data, iris.target, test_size=0.25, random_state=0) - - clf = SGDClassifier(alpha=params["alpha"]) - - for step in range(100): - clf.partial_fit(train_x, train_y, classes=classes) - intermediate_value = 1.0 - clf.score(valid_x, valid_y) - - # WARNING: Assign to reports, - # so the state is instantly sent to the flow. - self.reports = self.reports + [[intermediate_value, step]] - - self.best_model_score = 1.0 - clf.score(valid_x, valid_y) - - def distributions(self): - return {"alpha": LogUniformDistribution(1e-5, 1e-1)} - - -class RootFlow(LightningFlow): - def __init__(self): - super().__init__() - self.optimizer = Optimizer( - objective_cls=Objective, - n_trials=20, - study=optuna.create_study(pruner=optuna.pruners.MedianPruner()), - ) - - def run(self): - self.optimizer.run() - - def configure_layout(self): - return {"name": "HyperPlot", "content": self.optimizer.hi_plot} - - -app = LightningApp(RootFlow()) diff --git a/docs/source-app/examples/hpo/objective.py b/docs/source-app/examples/hpo/objective.py deleted file mode 100644 index d20232fb10ab2..0000000000000 --- a/docs/source-app/examples/hpo/objective.py +++ /dev/null @@ -1,21 +0,0 @@ -import numpy as np - -import lightning as L - - -class ObjectiveWork(L.LightningWork): - def __init__(self): - super().__init__(parallel=True) - self.metric = None - self.trial_id = None - self.params = None - self.has_told_study = False - - def run(self, trial_id, **params): - self.trial_id = trial_id - # Received suggested `backbone` and `learning_rate` - self.params = params - # Emulate metric computation would be - # computed once a script has been completed. - # In reality, this would excute a user defined script. - self.metric = np.random.uniform(0, 100) diff --git a/docs/source-app/examples/hpo/optuna_reference.py b/docs/source-app/examples/hpo/optuna_reference.py deleted file mode 100644 index 46f76c8662244..0000000000000 --- a/docs/source-app/examples/hpo/optuna_reference.py +++ /dev/null @@ -1,36 +0,0 @@ -import logging -import sys - -import optuna -from sklearn import datasets -from sklearn.linear_model import SGDClassifier -from sklearn.model_selection import train_test_split - - -def objective(trial): - iris = datasets.load_iris() - classes = list(set(iris.target)) - train_x, valid_x, train_y, valid_y = train_test_split(iris.data, iris.target, test_size=0.25, random_state=0) - - alpha = trial.suggest_float("alpha", 1e-5, 1e-1, log=True) - clf = SGDClassifier(alpha=alpha) - - for step in range(100): - clf.partial_fit(train_x, train_y, classes=classes) - - # Report intermediate objective value. 
- intermediate_value = 1.0 - clf.score(valid_x, valid_y) - trial.report(intermediate_value, step) - - # Handle pruning based on the intermediate value. - if trial.should_prune(): - raise optuna.TrialPruned() - - return 1.0 - clf.score(valid_x, valid_y) - - -# Add stream handler of stdout to show the messages -logger = optuna.logging.get_logger("optuna") -logger.addHandler(logging.StreamHandler(sys.stdout)) -study = optuna.create_study(pruner=optuna.pruners.MedianPruner()) -study.optimize(objective, n_trials=20) diff --git a/docs/source-app/get_started/training_with_apps.rst b/docs/source-app/get_started/training_with_apps.rst index f509ba4cf0267..4ba78edee3136 100644 --- a/docs/source-app/get_started/training_with_apps.rst +++ b/docs/source-app/get_started/training_with_apps.rst @@ -107,13 +107,6 @@ Next Steps :button_link: add_an_interactive_demo.html :height: 180 -.. displayitem:: - :header: Add Hyper Parameter Optimization - :description: Add a HPO to optimize your models - :col_css: col-md-4 - :button_link: ../examples/hpo/hpo.html - :height: 180 - .. displayitem:: :header: Add Model Serving :description: Serve and load testing with MLServer and Locust diff --git a/docs/source-app/get_started/what_app_can_do.rst b/docs/source-app/get_started/what_app_can_do.rst index a2c88a6f85bdc..2b03663f365d7 100644 --- a/docs/source-app/get_started/what_app_can_do.rst +++ b/docs/source-app/get_started/what_app_can_do.rst @@ -77,22 +77,6 @@ Find the `ScratchPad App `_ on the App Gallery and the `Lightning HPO Sweeper codebase. `_ on GitHub. - -.. raw:: html - - - ----- - *********************** InVideo Search (Public) *********************** diff --git a/docs/source-app/index.rst b/docs/source-app/index.rst index 8bbe84702ccc4..d3bac069f9135 100644 --- a/docs/source-app/index.rst +++ b/docs/source-app/index.rst @@ -203,7 +203,6 @@ Keep Learning Develop a DAG Develop a File Server Develop a Github Repo Script Runner - Develop a HPO Sweeper Develop a Model Server .. 
diff --git a/docs/source-lit/examples/hpo/build_from_scratch.rst b/docs/source-lit/examples/hpo/build_from_scratch.rst deleted file mode 120000 index a743030e8643f..0000000000000 --- a/docs/source-lit/examples/hpo/build_from_scratch.rst +++ /dev/null @@ -1 +0,0 @@ -../../../source-app/examples/hpo/build_from_scratch.rst \ No newline at end of file diff --git a/docs/source-lit/examples/hpo/hpo.py b/docs/source-lit/examples/hpo/hpo.py deleted file mode 120000 index a26ef671f9de5..0000000000000 --- a/docs/source-lit/examples/hpo/hpo.py +++ /dev/null @@ -1 +0,0 @@ -../../../source-app/examples/hpo/hpo.py \ No newline at end of file diff --git a/docs/source-lit/examples/hpo/hpo.rst b/docs/source-lit/examples/hpo/hpo.rst deleted file mode 120000 index e5808b20b8ec5..0000000000000 --- a/docs/source-lit/examples/hpo/hpo.rst +++ /dev/null @@ -1 +0,0 @@ -../../../source-app/examples/hpo/hpo.rst \ No newline at end of file diff --git a/docs/source-lit/examples/hpo/hpo_wi.rst b/docs/source-lit/examples/hpo/hpo_wi.rst deleted file mode 120000 index 1fcf07e6d3d75..0000000000000 --- a/docs/source-lit/examples/hpo/hpo_wi.rst +++ /dev/null @@ -1 +0,0 @@ -../../../source-app/examples/hpo/hpo_wi.rst \ No newline at end of file diff --git a/docs/source-lit/examples/hpo/hpo_wo.rst b/docs/source-lit/examples/hpo/hpo_wo.rst deleted file mode 120000 index c0b562258aebb..0000000000000 --- a/docs/source-lit/examples/hpo/hpo_wo.rst +++ /dev/null @@ -1 +0,0 @@ -../../../source-app/examples/hpo/hpo_wo.rst \ No newline at end of file diff --git a/docs/source-lit/examples/hpo/lightning_hpo.rst b/docs/source-lit/examples/hpo/lightning_hpo.rst deleted file mode 120000 index c6f5cdaa7d93e..0000000000000 --- a/docs/source-lit/examples/hpo/lightning_hpo.rst +++ /dev/null @@ -1 +0,0 @@ -../../../source-app/examples/hpo/lightning_hpo.rst \ No newline at end of file diff --git a/docs/source-lit/examples/hpo/lightning_hpo_target.py b/docs/source-lit/examples/hpo/lightning_hpo_target.py deleted file mode 120000 index 8a492d411a559..0000000000000 --- a/docs/source-lit/examples/hpo/lightning_hpo_target.py +++ /dev/null @@ -1 +0,0 @@ -../../../source-app/examples/hpo/lightning_hpo_target.py \ No newline at end of file diff --git a/docs/source-lit/examples/hpo/objective.py b/docs/source-lit/examples/hpo/objective.py deleted file mode 120000 index 9023edf5d254f..0000000000000 --- a/docs/source-lit/examples/hpo/objective.py +++ /dev/null @@ -1 +0,0 @@ -../../../source-app/examples/hpo/objective.py \ No newline at end of file diff --git a/docs/source-lit/examples/hpo/optuna_reference.py b/docs/source-lit/examples/hpo/optuna_reference.py deleted file mode 120000 index 70a01efee4454..0000000000000 --- a/docs/source-lit/examples/hpo/optuna_reference.py +++ /dev/null @@ -1 +0,0 @@ -../../../source-app/examples/hpo/optuna_reference.py \ No newline at end of file diff --git a/docs/source-lit/index.rst b/docs/source-lit/index.rst index 48fa845f8daf8..a4dea5fad776c 100644 --- a/docs/source-lit/index.rst +++ b/docs/source-lit/index.rst @@ -49,7 +49,6 @@ Welcome to ⚡ Lightning Apps Develop a DAG Develop a File Server Develop a Github Repo Script Runner - Develop a HPO Sweeper Develop a Model Server .. 
From c926e10e54b255d9c0450ca9657bec1aa854abbc Mon Sep 17 00:00:00 2001
From: thomas chaton
Date: Mon, 19 Sep 2022 14:26:23 +0100
Subject: [PATCH 193/193] Introduce Upload File endpoint (#14703)

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update
---
 src/lightning_app/CHANGELOG.md     |  3 +++
 src/lightning_app/core/api.py      | 27 ++++++++++++++++++++++++++-
 src/lightning_app/runners/cloud.py |  2 +-
 3 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md
index 45cfa83166ea4..e7ca605b4c2b5 100644
--- a/src/lightning_app/CHANGELOG.md
+++ b/src/lightning_app/CHANGELOG.md
@@ -15,6 +15,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

 - Add `--secret` option to CLI to allow binding Secrets to app environment variables when running in the cloud ([#14612](https://github.com/Lightning-AI/lightning/pull/14612))

+- Add support to upload files to the Drive through an asynchronous `upload_file` endpoint ([#14703](https://github.com/Lightning-AI/lightning/pull/14703))
+
+
 ### Changed

 - Application storage prefix moved from `app_id` to `project_id/app_id` ([#14583](https://github.com/Lightning-AI/lightning/pull/14583))
diff --git a/src/lightning_app/core/api.py b/src/lightning_app/core/api.py
index a1b75dcea1383..7ba6b3de104cc 100644
--- a/src/lightning_app/core/api.py
+++ b/src/lightning_app/core/api.py
@@ -5,12 +5,13 @@
 import traceback
 from copy import deepcopy
 from multiprocessing import Queue
+from tempfile import TemporaryDirectory
 from threading import Event, Lock, Thread
 from typing import Dict, List, Mapping, Optional

 import uvicorn
 from deepdiff import DeepDiff, Delta
-from fastapi import FastAPI, HTTPException, Request, Response, status, WebSocket
+from fastapi import FastAPI, File, HTTPException, Request, Response, status, UploadFile, WebSocket
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.params import Header
 from fastapi.responses import HTMLResponse, JSONResponse
@@ -23,6 +24,7 @@
 from lightning_app.api.request_types import DeltaRequest
 from lightning_app.core.constants import ENABLE_STATE_WEBSOCKET, FRONTEND_DIR
 from lightning_app.core.queues import RedisQueue
+from lightning_app.storage import Drive
 from lightning_app.utilities.app_helpers import InMemoryStateStore, Logger, StateStore
 from lightning_app.utilities.enum import OpenAPITags
 from lightning_app.utilities.imports import _is_redis_available, _is_starsessions_available
@@ -234,6 +236,29 @@ async def post_state(
     api_app_delta_queue.put(DeltaRequest(delta=Delta(deep_diff)))


+@fastapi_service.put("/api/v1/upload_file/{filename}")
+async def upload_file(filename: str, uploaded_file: UploadFile = File(...)):
+    with TemporaryDirectory() as tmp:
+        drive = Drive(
+            "lit://uploaded_files",
+            component_name="file_server",
+            allow_duplicates=True,
+            root_folder=tmp,
+        )
+        tmp_file = os.path.join(tmp, filename)
+
+        with open(tmp_file, "wb") as f:
+            done = False
+            while not done:
+                # Note: The 8192 number doesn't have a strong reason.
+                content = await uploaded_file.read(8192)
+                f.write(content)
+                done = content == b""
+
+        drive.put(filename)
+        return f"Successfully uploaded '{filename}' to the Drive"
+
+
 @fastapi_service.get("/healthz", status_code=200)
 async def healthz(response: Response):
     """Health check endpoint used in the cloud FastAPI servers to check the status periodically. This requires
diff --git a/src/lightning_app/runners/cloud.py b/src/lightning_app/runners/cloud.py
index b873f6db48ab6..fe7c9e4b8751a 100644
--- a/src/lightning_app/runners/cloud.py
+++ b/src/lightning_app/runners/cloud.py
@@ -351,7 +351,7 @@ def _check_uploaded_folder(root: Path, repo: LocalSourceCodeDir) -> None:
         else:
             warning_msg += "\nYou can ignore some files or folders by adding them to `.lightningignore`."

-            logger.warning(warning_msg)
+            logger.warn(warning_msg)

     def _project_has_sufficient_credits(self, project: V1Membership, app: Optional[LightningApp] = None):
         """check if user has enough credits to run the app with its hardware if app is not passed return True if
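
For reference, a minimal client-side sketch of how the `upload_file` endpoint introduced in the patch above might be exercised once the app's FastAPI service is running. Only the route and the `uploaded_file` multipart field name come from the patch; the host, port, and filename below are illustrative assumptions, not part of the change.

import requests  # assumed to be available in the client environment

APP_API_URL = "http://127.0.0.1:7501"  # assumption: address where the app's API server is exposed
filename = "dataset.zip"               # hypothetical file to push to the Drive

with open(filename, "rb") as f:
    # The endpoint expects a multipart form field named `uploaded_file`,
    # matching the `UploadFile = File(...)` parameter of `upload_file` above.
    response = requests.put(
        f"{APP_API_URL}/api/v1/upload_file/{filename}",
        files={"uploaded_file": f},
    )

response.raise_for_status()
print(response.json())  # e.g. "Successfully uploaded 'dataset.zip' to the Drive"

On the server side the upload is streamed to a temporary directory in 8192-byte chunks before being handed to the `lit://uploaded_files` Drive, so the file never has to be held in memory in one piece.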