From e8ad5d002a6cb46b5e0b06116023a76ed09c44a6 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 3 Nov 2021 12:56:08 +0100 Subject: [PATCH 001/123] Update __version__ --- pytorch_lightning/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/__about__.py b/pytorch_lightning/__about__.py index dc61686e5dbea..e1bdee9b7320b 100644 --- a/pytorch_lightning/__about__.py +++ b/pytorch_lightning/__about__.py @@ -1,7 +1,7 @@ import time _this_year = time.strftime("%Y") -__version__ = "1.5.0" +__version__ = "1.5.1" __author__ = "William Falcon et al." __author_email__ = "waf2107@columbia.edu" __license__ = "Apache-2.0" From 7d879865afc06b7cac5052b52566ff434c9c553b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 3 Nov 2021 12:18:10 +0100 Subject: [PATCH 002/123] Fix `apply_to_collection(defaultdict)` (#10316) --- CHANGELOG.md | 12 +++++++----- pytorch_lightning/utilities/apply_func.py | 4 +++- tests/utilities/test_apply_func.py | 7 ++++++- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b9c808464dadc..9beec3074145a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). +## [1.5.1] - 2021-MM-DD + +### Fixed + +- Fixed `apply_to_collection(defaultdict)` ([#10316](https://github.com/PyTorchLightning/pytorch-lightning/issues/10316)) + + ## [1.5.0] - 2021-11-02 ### Added @@ -132,7 +139,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added support for empty `gpus` list to run on CPU ([#10246](https://github.com/PyTorchLightning/pytorch-lightning/pull/10246)) - Added a warning if multiple batch sizes are found from ambiguous batch ([#10247](https://github.com/PyTorchLightning/pytorch-lightning/pull/10247)) - ### Changed - Trainer now raises a `MisconfigurationException` when its methods are called with `ckpt_path="best"` but a checkpoint callback isn't configured ([#9841](https://github.com/PyTorchLightning/pytorch-lightning/pull/9841)) @@ -184,7 +190,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Enabled `on_load_checkpoint` for `LightningDataModule` for all `trainer_fn` ([#10238](https://github.com/PyTorchLightning/pytorch-lightning/pull/10238)) - Allowed separate config files for parameters with class type when LightningCLI is in `subclass_mode=False` ([#10286](https://github.com/PyTorchLightning/pytorch-lightning/pull/10286)) - ### Deprecated - Deprecated Trainer argument `terminate_on_nan` in favor of `detect_anomaly`([#9175](https://github.com/PyTorchLightning/pytorch-lightning/pull/9175)) @@ -220,7 +225,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Deprecated `lr_sch_names` from `LearningRateMonitor` ([#10066](https://github.com/PyTorchLightning/pytorch-lightning/pull/10066)) - Deprecated `ProgressBar` callback in favor of `TQDMProgressBar` ([#10134](https://github.com/PyTorchLightning/pytorch-lightning/pull/10134)) - ### Removed - Removed deprecated `metrics` ([#8586](https://github.com/PyTorchLightning/pytorch-lightning/pull/8586/)) @@ -264,7 +268,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Removed automatic patching of `{train,val,test,predict}_dataloader()` on the `LightningModule` ([#9764](https://github.com/PyTorchLightning/pytorch-lightning/pull/9764)) - Removed `pytorch_lightning.trainer.connectors.OptimizerConnector` ([#10120](https://github.com/PyTorchLightning/pytorch-lightning/pull/10120)) - ### Fixed - Fixed ImageNet evaluation in example ([#10179](https://github.com/PyTorchLightning/pytorch-lightning/pull/10179)) @@ -473,7 +476,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added private `prevent_trainer_and_dataloaders_deepcopy` context manager on the `LightningModule` ([#8472](https://github.com/PyTorchLightning/pytorch-lightning/pull/8472)) - Added support for providing callables to the Lightning CLI instead of types ([#8400](https://github.com/PyTorchLightning/pytorch-lightning/pull/8400)) - ### Changed - Decoupled device parsing logic from Accelerator connector to Trainer ([#8180](https://github.com/PyTorchLightning/pytorch-lightning/pull/8180)) diff --git a/pytorch_lightning/utilities/apply_func.py b/pytorch_lightning/utilities/apply_func.py index 3bd920c2e304b..1e981a0f543e7 100644 --- a/pytorch_lightning/utilities/apply_func.py +++ b/pytorch_lightning/utilities/apply_func.py @@ -14,7 +14,7 @@ import dataclasses import operator from abc import ABC -from collections import OrderedDict +from collections import defaultdict, OrderedDict from collections.abc import Mapping, Sequence from copy import copy from functools import partial @@ -102,6 +102,8 @@ def apply_to_collection( ) if include_none or v is not None: out.append((k, v)) + if isinstance(data, defaultdict): + return elem_type(data.default_factory, OrderedDict(out)) return elem_type(OrderedDict(out)) is_namedtuple = _is_namedtuple(data) diff --git a/tests/utilities/test_apply_func.py b/tests/utilities/test_apply_func.py index 2c131f96ecc6f..da309f7d22b50 100644 --- a/tests/utilities/test_apply_func.py +++ 
b/tests/utilities/test_apply_func.py @@ -13,7 +13,7 @@ # limitations under the License. import dataclasses import numbers -from collections import namedtuple, OrderedDict +from collections import defaultdict, namedtuple, OrderedDict from typing import List import numpy as np @@ -153,6 +153,11 @@ def __init__(self, initial_dict): reduced = apply_to_collection(to_reduce, int, lambda x: str(x)) assert reduced == _CustomCollection({"a": "1", "b": "2", "c": "3"}) + # defaultdict + to_reduce = defaultdict(int, {"a": 1, "b": 2, "c": 3}) + reduced = apply_to_collection(to_reduce, int, lambda x: str(x)) + assert reduced == defaultdict(int, {"a": "1", "b": "2", "c": "3"}) + def test_apply_to_collection_include_none(): to_reduce = [1, 2, 3.4, 5.6, 7, (8, 9.1, {10: 10})] From 2685aa6edaccc893cc49459d3829d4c1019d3803 Mon Sep 17 00:00:00 2001 From: Peter Dudfield <34686298+peterdudfield@users.noreply.github.com> Date: Thu, 4 Nov 2021 11:46:57 +0000 Subject: [PATCH 003/123] Fix failure when `DataLoader(batch_size=None)` is passed (#10345) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- CHANGELOG.md | 1 + pytorch_lightning/trainer/data_loading.py | 2 +- tests/trainer/test_data_loading.py | 38 ++++++++++++++--------- 3 files changed, 25 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9beec3074145a..4681c80baee60 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Fixed - Fixed `apply_to_collection(defaultdict)` ([#10316](https://github.com/PyTorchLightning/pytorch-lightning/issues/10316)) +- Fixed failure when `DataLoader(batch_size=None)` is passed ([#10345](https://github.com/PyTorchLightning/pytorch-lightning/issues/10345)) ## [1.5.0] - 2021-11-02 diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index e149aef9a7997..c41d80b903d4e 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -184,7 +184,7 @@ def _dataloader_init_kwargs_resolve_sampler( batch_sampler = getattr(dataloader, "batch_sampler") is_predicting = mode == RunningStage.PREDICTING # checking the batch sampler type is different than PyTorch default. - if (batch_sampler is not None and type(batch_sampler) is not BatchSampler) or is_predicting: + if batch_sampler is not None and (type(batch_sampler) is not BatchSampler or is_predicting): batch_sampler = type(batch_sampler)( sampler, batch_size=batch_sampler.batch_size, diff --git a/tests/trainer/test_data_loading.py b/tests/trainer/test_data_loading.py index 723cff55c6860..35f9838f0b04a 100644 --- a/tests/trainer/test_data_loading.py +++ b/tests/trainer/test_data_loading.py @@ -283,25 +283,26 @@ class CustomSampler(Sampler): trainer.prepare_dataloader(dataloader, shuffle=True) -def test_loader_detaching(): - """Checks that the loader has been resetted after the entrypoint.""" +class LoaderTestModel(BoringModel): + def training_step(self, batch, batch_idx): + assert len(self.trainer.train_dataloader.loaders) == 10 + return super().training_step(batch, batch_idx) - class LoaderTestModel(BoringModel): - def training_step(self, batch, batch_idx): - assert len(self.trainer.train_dataloader.loaders) == 10 - return super().training_step(batch, batch_idx) + def validation_step(self, batch, batch_idx): + assert len(self.trainer.val_dataloaders[0]) == 10 + return super().validation_step(batch, batch_idx) - def 
validation_step(self, batch, batch_idx): - assert len(self.trainer.val_dataloaders[0]) == 10 - return super().validation_step(batch, batch_idx) + def test_step(self, batch, batch_idx): + assert len(self.trainer.test_dataloaders[0]) == 10 + return super().test_step(batch, batch_idx) - def test_step(self, batch, batch_idx): - assert len(self.trainer.test_dataloaders[0]) == 10 - return super().test_step(batch, batch_idx) + def predict_step(self, batch, batch_idx, dataloader_idx=0): + assert len(self.trainer.predict_dataloaders[0]) == 10 + return super().predict_step(batch, batch_idx, dataloader_idx=dataloader_idx) - def predict_step(self, batch, batch_idx, dataloader_idx=None): - assert len(self.trainer.predict_dataloaders[0]) == 10 - return super().predict_step(batch, batch_idx, dataloader_idx=dataloader_idx) + +def test_loader_detaching(): + """Checks that the loader has been resetted after the entrypoint.""" loader = DataLoader(RandomDataset(32, 10), batch_size=1) @@ -340,3 +341,10 @@ def predict_step(self, batch, batch_idx, dataloader_idx=None): assert len(model.val_dataloader()) == 64 assert len(model.predict_dataloader()) == 64 assert len(model.test_dataloader()) == 64 + + +def test_pre_made_batches(): + """Check that loader works with pre-made batches.""" + loader = DataLoader(RandomDataset(32, 10), batch_size=None) + trainer = Trainer(fast_dev_run=1) + trainer.predict(LoaderTestModel(), loader) From 71380a50475468704da277ec56723cfad2fa8c1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 4 Nov 2021 18:26:24 +0100 Subject: [PATCH 004/123] Update Python testing (#10269) --- .azure-pipelines/gpu-benchmark.yml | 4 ++-- .github/workflows/ci_dockers.yml | 4 ++-- .github/workflows/ci_test-conda.yml | 2 +- .github/workflows/ci_test-full.yml | 26 +++++++++++++++----------- .github/workflows/events-nightly.yml | 4 ++-- dockers/base-cuda/Dockerfile | 2 +- dockers/base-ipu/Dockerfile | 5 ++--- dockers/base-xla/Dockerfile | 6 +++--- 
dockers/ipu-ci-runner/Dockerfile | 2 +- dockers/release/Dockerfile | 2 +- dockers/tpu-tests/Dockerfile | 2 +- 11 files changed, 31 insertions(+), 28 deletions(-) diff --git a/.azure-pipelines/gpu-benchmark.yml b/.azure-pipelines/gpu-benchmark.yml index a63c9e8640bc8..f8b9593d72798 100644 --- a/.azure-pipelines/gpu-benchmark.yml +++ b/.azure-pipelines/gpu-benchmark.yml @@ -28,8 +28,8 @@ jobs: cancelTimeoutInMinutes: "2" pool: gridai-spot-pool container: - # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04 - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.8-torch1.8" + # should match the one in '.azure-pipelines/gpu-benchmark.yml' + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8" options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g" workspace: clean: all diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml index 5b5a140e7791a..701223c795a3b 100644 --- a/.github/workflows/ci_dockers.yml +++ b/.github/workflows/ci_dockers.yml @@ -92,7 +92,7 @@ jobs: fail-fast: false matrix: # the config used in '.github/workflows/ci_test-conda.yml' - python_version: ["3.7"] + python_version: ["3.8"] pytorch_version: ["1.6", "1.7", "1.8", "1.9", "1.10"] steps: - name: Checkout @@ -119,7 +119,7 @@ jobs: fail-fast: false matrix: # the config used in 'dockers/ipu-ci-runner/Dockerfile' - python_version: ["3.8"] # latest + python_version: ["3.9"] # latest # TODO: upgrade - PopTorch 2.2 uses torch 1.9, see: # https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/installation.html#version-compatibility pytorch_version: ["1.7"] diff --git a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml index cdd18d13e909a..edae03db7936b 100644 --- a/.github/workflows/ci_test-conda.yml +++ b/.github/workflows/ci_test-conda.yml @@ -14,7 +14,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.7"] + python-version: ["3.8"] # previous to last Python 
version as that one is already used in test-full pytorch-version: ["1.6", "1.7", "1.8", "1.9", "1.10"] # Timeout: https://stackoverflow.com/a/59076067/4521646 diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml index 5cc4827d84888..8be8fd1146864 100644 --- a/.github/workflows/ci_test-full.yml +++ b/.github/workflows/ci_test-full.yml @@ -18,17 +18,21 @@ jobs: fail-fast: false matrix: os: [ubuntu-18.04, windows-2019, macOS-10.15] - python-version: [3.6, 3.8, 3.9] - requires: ['minimal', 'latest'] - release: ['stable'] - exclude: - - python-version: 3.9 - requires: 'minimal' + python-version: ["3.7", "3.9"] # minimum, maximum + requires: ["oldest", "latest"] + release: ["stable"] include: - - os: ubuntu-20.04 - python-version: 3.9 - requires: 'latest' - release: 'pre' + # test 3.6 only on oldest until EOL: https://github.com/PyTorchLightning/pytorch-lightning/issues/9981 + - {os: ubuntu-18.04, python-version: "3.6", requires: "oldest", release: "stable"} + - {os: windows-2019, python-version: "3.6", requires: "oldest", release: "stable"} + - {os: macOS-10.15, python-version: "3.6", requires: "oldest", release: "stable"} + # nightly: add when there's a release candidate + #- {os: ubuntu-20.04, python-version: "3.10", requires: "latest", release: "pre"} + exclude: + # PyTorch 1.6 is not available with Python 3.9: https://github.com/pytorch/pytorch/issues/46205 + - {os: ubuntu-18.04, python-version: "3.9", requires: "oldest", release: "stable"} + - {os: windows-2019, python-version: "3.9", requires: "oldest", release: "stable"} + - {os: macOS-10.15, python-version: "3.9", requires: "oldest", release: "stable"} # Timeout: https://stackoverflow.com/a/59076067/4521646 # TODO: the macOS is taking too long, probably caching did not work... @@ -64,7 +68,7 @@ jobs: python .github/prune-packages.py requirements/extra.txt "horovod" - name: Set min. 
dependencies - if: matrix.requires == 'minimal' + if: matrix.requires == 'oldest' run: | python .github/set-min-requirements.py diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml index 4afcec0496abc..ce2072e5f45aa 100644 --- a/.github/workflows/events-nightly.yml +++ b/.github/workflows/events-nightly.yml @@ -122,7 +122,7 @@ jobs: fail-fast: false matrix: # the config used in '.github/workflows/ci_test-conda.yml' - python_version: ["3.7"] + python_version: ["3.8"] pytorch_version: ["1.6", "1.7", "1.8", "1.9", "1.10"] steps: @@ -163,7 +163,7 @@ jobs: matrix: # the config used in 'dockers/ipu-ci-runner/Dockerfile' include: - - python_version: "3.8" + - python_version: "3.9" pytorch_version: "1.7" steps: diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 978f8bcdf100e..ab26af6c7accf 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -16,7 +16,7 @@ ARG CUDA_VERSION=10.2 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu18.04 -ARG PYTHON_VERSION=3.8 +ARG PYTHON_VERSION=3.9 ARG PYTORCH_VERSION=1.6 SHELL ["/bin/bash", "-c"] diff --git a/dockers/base-ipu/Dockerfile b/dockers/base-ipu/Dockerfile index 01b5920d88fd1..e91a0dc4a0a1e 100644 --- a/dockers/base-ipu/Dockerfile +++ b/dockers/base-ipu/Dockerfile @@ -16,8 +16,7 @@ FROM ubuntu:20.04 LABEL maintainer="PyTorchLightning " -ARG PYTHON_VERSION=3.8 -ARG PYTORCH_VERSION=1.7 +ARG PYTHON_VERSION=3.9 ARG CONDA_VERSION=4.9.2 SHELL ["/bin/bash", "-c"] @@ -41,7 +40,7 @@ RUN apt-get update -qq && \ && \ # Install conda and python. # NOTE new Conda does not forward the exit status... 
https://github.com/conda/conda/issues/8385 - curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py38_${CONDA_VERSION}-Linux-x86_64.sh && \ + curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py39_${CONDA_VERSION}-Linux-x86_64.sh && \ chmod +x ~/miniconda.sh && \ ~/miniconda.sh -b && \ rm ~/miniconda.sh && \ diff --git a/dockers/base-xla/Dockerfile b/dockers/base-xla/Dockerfile index 1a2554b6b94b2..5c86da2147717 100644 --- a/dockers/base-xla/Dockerfile +++ b/dockers/base-xla/Dockerfile @@ -16,8 +16,8 @@ FROM google/cloud-sdk:slim LABEL maintainer="PyTorchLightning " -# CALL: docker image build -t pytorch-lightning:XLA-extras-py3.6 -f dockers/base-xla/Dockerfile . --build-arg PYTHON_VERSION=3.6 -ARG PYTHON_VERSION=3.7 +# CALL: docker image build -t pytorch-lightning:XLA-extras-py3.6 -f dockers/base-xla/Dockerfile . --build-arg PYTHON_VERSION=3.8 +ARG PYTHON_VERSION=3.9 ARG CONDA_VERSION=4.9.2 ARG XLA_VERSION=1.6 @@ -42,7 +42,7 @@ RUN apt-get update -qq && \ && \ # Install conda and python. # NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385 - curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py38_${CONDA_VERSION}-Linux-x86_64.sh && \ + curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py39_${CONDA_VERSION}-Linux-x86_64.sh && \ chmod +x ~/miniconda.sh && \ ~/miniconda.sh -b && \ rm ~/miniconda.sh && \ diff --git a/dockers/ipu-ci-runner/Dockerfile b/dockers/ipu-ci-runner/Dockerfile index aa8672a34a376..98f769f78fe8f 100644 --- a/dockers/ipu-ci-runner/Dockerfile +++ b/dockers/ipu-ci-runner/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-ARG PYTHON_VERSION=3.8 +ARG PYTHON_VERSION=3.9 ARG PYTORCH_VERSION=1.7 FROM pytorchlightning/pytorch_lightning:base-ipu-py${PYTHON_VERSION}-torch${PYTORCH_VERSION} diff --git a/dockers/release/Dockerfile b/dockers/release/Dockerfile index bea977899ce50..529680059791c 100644 --- a/dockers/release/Dockerfile +++ b/dockers/release/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG PYTHON_VERSION=3.7 +ARG PYTHON_VERSION=3.9 ARG PYTORCH_VERSION=1.6 FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION} diff --git a/dockers/tpu-tests/Dockerfile b/dockers/tpu-tests/Dockerfile index 3fc703edb2e0d..086bd349bc757 100644 --- a/dockers/tpu-tests/Dockerfile +++ b/dockers/tpu-tests/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG PYTHON_VERSION=3.7 +ARG PYTHON_VERSION=3.9 ARG PYTORCH_VERSION=1.6 FROM pytorchlightning/pytorch_lightning:base-xla-py${PYTHON_VERSION}-torch${PYTORCH_VERSION} From c7be2741d0dac01a811a8634e7fe272298bfb800 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 5 Nov 2021 18:36:00 +0100 Subject: [PATCH 005/123] Fix DataLoader inspection and re-instantiation in Lite (#10334) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- CHANGELOG.md | 1 + pytorch_lightning/lite/wrappers.py | 9 +++++---- tests/lite/test_lite.py | 28 ++++++++++++++++++++++++++++ 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4681c80baee60..d8683520c8d57 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed `apply_to_collection(defaultdict)` ([#10316](https://github.com/PyTorchLightning/pytorch-lightning/issues/10316)) - Fixed failure when `DataLoader(batch_size=None)` is passed ([#10345](https://github.com/PyTorchLightning/pytorch-lightning/issues/10345)) +- Fixed interception of `__init__` arguments for sub-classed DataLoader re-instantiation in Lite ([#10334](https://github.com/PyTorchLightning/pytorch-lightning/issues/10334)) ## [1.5.0] - 2021-11-02 diff --git a/pytorch_lightning/lite/wrappers.py b/pytorch_lightning/lite/wrappers.py index ad01b44ef30f4..881a663fdb9e5 100644 --- a/pytorch_lightning/lite/wrappers.py +++ b/pytorch_lightning/lite/wrappers.py @@ -14,6 +14,7 @@ import functools import inspect from contextlib import contextmanager +from itertools import chain from typing import Any, Callable, Dict, Generator, Iterable, Iterator, Optional, Set, Sized, Type, Union import torch @@ -109,7 +110,7 @@ def wrapper(module: Any, *args: Any, **kwargs: Dict[str, Any]) -> None: params = dict(inspect.signature(module._old_init).parameters) params.pop("args") params.pop("kwargs") - for init_name, init_arg in zip(params, args): + for init_name, init_arg in chain(zip(params, args), kwargs.items()): setattr(module, init_name, init_arg) f(module, *args, **kwargs) @@ -118,15 +119,15 @@ def wrapper(module: Any, *args: Any, **kwargs: Dict[str, Any]) -> None: # https://stackoverflow.com/a/63851681/9201239 def _get_all_subclasses(cls: Type[Any]) -> Set[Type[Any]]: - subclass_list = [] + subclasses = set() def recurse(cl: Type[Any]) -> None: for subclass in cl.__subclasses__(): - subclass_list.append(subclass) + subclasses.add(subclass) recurse(subclass) recurse(cls) - return set(subclass_list) + return subclasses def _enable_class(cls: Type[Any]) -> None: diff --git a/tests/lite/test_lite.py b/tests/lite/test_lite.py index 8eac30f9cf823..b563e56e2fdec 100644 --- a/tests/lite/test_lite.py +++ b/tests/lite/test_lite.py @@ -164,6 +164,34 @@ def 
test_setup_dataloaders_return_type(): assert lite_dataloader1.dataset is dataset1 +def test_setup_dataloaders_with_custom_type(): + """Test that Lite intercepts arguments passed to custom subclasses of torch.utils.DataLoader and sets them as + attributes.""" + + class DataLoaderSubclass1(DataLoader): + def __init__(self, attribute1, *args, **kwargs): + # intentionally not setting this attribute, calling super with different args + # self.attribute1 = attribute1 + super().__init__(*args, **kwargs) + + class DataLoaderSubclass2(DataLoaderSubclass1): + def __init__(self, attribute1, attribute2, *args, **kwargs): + # intentionally not setting this attribute, calling super with different args + # self.attribute2 = attribute2 + super().__init__(attribute1, *args, **kwargs) + + class LiteWithCustomDataLoader(LightningLite): + def run(self): + dataloader = DataLoaderSubclass2("attribute1", "attribute2", dataset=range(4), batch_size=2) + assert dataloader.attribute1 == "attribute1" + assert dataloader.attribute2 == "attribute2" + lite_dataloader = self.setup_dataloaders(dataloader) + assert lite_dataloader.attribute1 == "attribute1" + assert lite_dataloader.attribute2 == "attribute2" + + LiteWithCustomDataLoader().run() + + def test_setup_custom_dataloaders(): """Test that the setup_dataloaders method returns the dataloaders wrapped as LiteDataLoader.""" lite = EmptyLite() From 297d549f22125d45fd89b684dfd6f6550d8f3a77 Mon Sep 17 00:00:00 2001 From: four4fish <88516121+four4fish@users.noreply.github.com> Date: Sun, 7 Nov 2021 18:05:44 -0800 Subject: [PATCH 006/123] Only import PostLocalSGD related modules when it's needed (#10359) * Only import PostLocalSGD related modules when it's needed * Only import PostLocalSGD related modules when it's needed * Only import PostLocalSGD related modules when it's needed --- .../plugins/training_type/ddp.py | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git 
a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index ea4820f61ec7c..fe10088b19944 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -63,11 +63,6 @@ from pytorch_lightning.utilities.seed import reset_seed from pytorch_lightning.utilities.types import STEP_OUTPUT -if _TORCH_GREATER_EQUAL_1_10: - if not _IS_WINDOWS: - from torch.distributed.optim import DistributedOptimizer - from torch.distributed.optim import PostLocalSGDOptimizer, ZeroRedundancyOptimizer - if _FAIRSCALE_AVAILABLE: from fairscale.optim import OSS if _HYDRA_AVAILABLE: @@ -75,9 +70,7 @@ from hydra.utils import get_original_cwd, to_absolute_path if _TORCH_GREATER_EQUAL_1_8: from pytorch_lightning.utilities.distributed import register_ddp_comm_hook -if _TORCH_GREATER_EQUAL_1_10: - import torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook as post_localSGD - import torch.distributed.algorithms.model_averaging.averagers as averagers + log = logging.getLogger(__name__) @@ -324,12 +317,11 @@ def _register_ddp_hooks(self) -> None: ddp_comm_wrapper=self._ddp_comm_wrapper, ) - if ( - _TORCH_GREATER_EQUAL_1_10 - and isinstance(self._ddp_comm_state, post_localSGD.PostLocalSGDState) - and self.lightning_module.trainer.state.fn == TrainerFn.FITTING - ): - self._reinit_optimizers_with_post_localSGD(self._ddp_comm_state.start_localSGD_iter) + if _TORCH_GREATER_EQUAL_1_10 and self.lightning_module.trainer.state.fn == TrainerFn.FITTING: + import torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook as post_localSGD + + if isinstance(self._ddp_comm_state, post_localSGD.PostLocalSGDState): + self._reinit_optimizers_with_post_localSGD(self._ddp_comm_state.start_localSGD_iter) def _reinit_optimizers_with_post_localSGD(self, warmup_steps: int): optimizers = self.lightning_module.trainer.optimizers @@ -337,6 +329,12 @@ def _reinit_optimizers_with_post_localSGD(self, warmup_steps: int): 
raise ValueError( "Post-localSGD algorithm is used, but model averaging period is not provided to DDP plugin." ) + if _TORCH_GREATER_EQUAL_1_10: + if not _IS_WINDOWS: + from torch.distributed.optim import DistributedOptimizer + import torch.distributed.algorithms.model_averaging.averagers as averagers + from torch.distributed.optim import PostLocalSGDOptimizer, ZeroRedundancyOptimizer + averager = averagers.PeriodicModelAverager(period=self._model_averaging_period, warmup_steps=warmup_steps) for x, optimizer in enumerate(optimizers): if isinstance(optimizer, LightningOptimizer): From 0db233b0b1f1dee3403f6db002341fc3fe3efa6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 8 Nov 2021 13:00:19 +0100 Subject: [PATCH 007/123] Fix pickling error with CSVLogger (#10388) * Don't store csv.Dictwriter in ExperimentWriter * Add test for pickle after .save() * Add entry in changelog --- CHANGELOG.md | 1 + pytorch_lightning/loggers/csv_logs.py | 6 +++--- tests/loggers/test_all.py | 4 ++++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d8683520c8d57..1c5e5dd8141a7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed `apply_to_collection(defaultdict)` ([#10316](https://github.com/PyTorchLightning/pytorch-lightning/issues/10316)) - Fixed failure when `DataLoader(batch_size=None)` is passed ([#10345](https://github.com/PyTorchLightning/pytorch-lightning/issues/10345)) - Fixed interception of `__init__` arguments for sub-classed DataLoader re-instantiation in Lite ([#10334](https://github.com/PyTorchLightning/pytorch-lightning/issues/10334)) +- Fixed issue with pickling `CSVLogger` after a call to `CSVLogger.save` ([#10388](https://github.com/PyTorchLightning/pytorch-lightning/pull/10388)) ## [1.5.0] - 2021-11-02 diff --git a/pytorch_lightning/loggers/csv_logs.py b/pytorch_lightning/loggers/csv_logs.py index 77adfe551f72d..454a17905c529 100644 --- a/pytorch_lightning/loggers/csv_logs.py +++ b/pytorch_lightning/loggers/csv_logs.py @@ -95,9 +95,9 @@ def save(self) -> None: metrics_keys = list(last_m.keys()) with open(self.metrics_file_path, "w", newline="") as f: - self.writer = csv.DictWriter(f, fieldnames=metrics_keys) - self.writer.writeheader() - self.writer.writerows(self.metrics) + writer = csv.DictWriter(f, fieldnames=metrics_keys) + writer.writeheader() + writer.writerows(self.metrics) class CSVLogger(LightningLoggerBase): diff --git a/tests/loggers/test_all.py b/tests/loggers/test_all.py index 67838e219fcfb..271ffce811fe5 100644 --- a/tests/loggers/test_all.py +++ b/tests/loggers/test_all.py @@ -263,6 +263,10 @@ def _test_loggers_pickle(tmpdir, monkeypatch, logger_class): # the logger needs to remove it from the state before pickle _ = logger.experiment + # logger also has to avoid adding un-picklable attributes to self in .save + logger.log_metrics({"a": 1}) + logger.save() + # test pickling loggers pickle.dumps(logger) From 30b3ccba154887f19ea85a15ab7dbbec8934b24d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Mon, 8 Nov 2021 12:28:58 +0100 Subject: [PATCH 008/123] Revert part of #10279 (#10376) --- pytorch_lightning/lite/lite.py | 17 
+++----- tests/lite/test_lite.py | 80 ++++++++++++---------------------- 2 files changed, 35 insertions(+), 62 deletions(-) diff --git a/pytorch_lightning/lite/lite.py b/pytorch_lightning/lite/lite.py index 2e6f10d356fe0..2e45f6c7d0e48 100644 --- a/pytorch_lightning/lite/lite.py +++ b/pytorch_lightning/lite/lite.py @@ -238,18 +238,15 @@ def _setup_dataloader( ) sampler = self._get_distributed_sampler(dataloader, **self._strategy.distributed_sampler_kwargs) - dataloader_kwargs = TrainerDataLoadingMixin._get_dataloader_init_kwargs(dataloader, sampler) - try: - dataloader = type(dataloader)(**dataloader_kwargs) - except TypeError: - dataloader_kwargs.pop("dataset") - dataloader = type(dataloader)(**dataloader_kwargs) + # the dataloader needs to be re-instantiated because we want to update the input arguments (e.g., sampler) + dataloader = TrainerDataLoadingMixin._update_dataloader(dataloader, sampler) + # add worker_init_fn for correct seeding in worker processes TrainerDataLoadingMixin._auto_add_worker_init_fn(dataloader, self.global_rank) - return _LiteDataLoader( - dataloader=self._strategy.process_dataloader(dataloader), - device=self.device if move_to_device and not isinstance(self._strategy, TPUSpawnPlugin) else None, - ) + + dataloader = self._strategy.process_dataloader(dataloader) + device = self.device if move_to_device and not isinstance(self._strategy, TPUSpawnPlugin) else None + return _LiteDataLoader(dataloader=dataloader, device=device) def backward(self, tensor: Tensor, *args: Any, model: Optional[_LiteModule] = None, **kwargs: Any) -> None: """Replaces ``loss.backward()`` in your training loop. Handles precision and automatically for you. 
diff --git a/tests/lite/test_lite.py b/tests/lite/test_lite.py index b563e56e2fdec..bd69cf359473e 100644 --- a/tests/lite/test_lite.py +++ b/tests/lite/test_lite.py @@ -24,7 +24,12 @@ from torch.utils.data import DataLoader, DistributedSampler, Sampler from pytorch_lightning.lite import LightningLite -from pytorch_lightning.lite.wrappers import _LiteDataLoader, _LiteModule, _LiteOptimizer +from pytorch_lightning.lite.wrappers import ( + _LiteDataLoader, + _LiteModule, + _LiteOptimizer, + _replace_dataloader_init_method, +) from pytorch_lightning.plugins import DeepSpeedPlugin, PrecisionPlugin, TrainingTypePlugin from pytorch_lightning.utilities import DistributedType from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -192,57 +197,6 @@ def run(self): LiteWithCustomDataLoader().run() -def test_setup_custom_dataloaders(): - """Test that the setup_dataloaders method returns the dataloaders wrapped as LiteDataLoader.""" - lite = EmptyLite() - - class CustomDataLoader(DataLoader): - def __init__(self, value: int = 2, *args, **kwargs): - self.value = value - super().__init__(range(value), *args, **kwargs) - - dataloader = CustomDataLoader(2, batch_size=2) - - # single dataloader - lite_dataloader = lite.setup_dataloaders(dataloader) - assert lite_dataloader._dataloader - assert lite_dataloader.value == 2 - batch0 = next(iter(lite_dataloader)) - assert torch.equal(batch0, torch.tensor([0, 1])) - - class CustomDataLoader2(DataLoader): - def __init__(self, range, *args, **kwargs): - self.range = range - super().__init__(range, *args, **kwargs) - - dataloader = CustomDataLoader2(range(2), batch_size=2) - - # single dataloader - lite_dataloader = lite.setup_dataloaders(dataloader) - assert lite_dataloader._dataloader - batch0 = next(iter(lite_dataloader)) - assert torch.equal(batch0, torch.tensor([0, 1])) - - class CustomDataLoader(DataLoader): - def __init__(self, value: int, *args, **kwargs): - super().__init__(range(value), *args, **kwargs) - - 
class LiteWithCustomDataLoader(LightningLite): - def run(self): - # This doesn't fail as the context manager would save all the arguments provided - # to the dataloaders. - dataloader = CustomDataLoader(2, batch_size=2) - self.setup_dataloaders(dataloader) - - LiteWithCustomDataLoader().run() - - with pytest.raises( - MisconfigurationException, match="Trying to inject `DistributedSampler` into the `CustomDataLoader` instance" - ): - dataloader = CustomDataLoader(2, batch_size=2) - lite_dataloader = lite.setup_dataloaders(dataloader) - - def test_setup_dataloaders_twice_fails(): """Test that calling setup_dataloaders with a dataloader that is already wrapped fails.""" lite = EmptyLite() @@ -490,3 +444,25 @@ def run(self): assert self.is_global_zero == (self.local_rank == 0) Lite(strategy=DeepSpeedPlugin(stage=3, logging_batch_size_per_gpu=1), devices=2, accelerator="gpu").run() + + +def test_replace_dataloader_init_method(): + """Test that the context manager enables to save the parameters passed to the DataLoader __init__ method.""" + + class CustomDataLoader(DataLoader): + def __init__(self, extra_argument: int, *args, **kwargs): + super().__init__(*args, **kwargs) + + dataloader = CustomDataLoader(extra_argument=1, dataset=range(1)) + lite = EmptyLite() + with pytest.raises(MisconfigurationException, match="extra_argument"): + dataloader = lite.setup_dataloaders(dataloader) + + with _replace_dataloader_init_method(): + dataloader = CustomDataLoader(extra_argument=1, dataset=range(1)) + assert dataloader.extra_argument == 1 + dataloader = lite.setup_dataloaders(dataloader) + + dataloader = CustomDataLoader(1, range(1)) + assert dataloader.extra_argument == 1 + dataloader = lite.setup_dataloaders(dataloader) From 8d7712c65a0ff2be2388dda2149a481e561da9cf Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Tue, 9 Nov 2021 16:53:27 +0530 Subject: [PATCH 009/123] disable step logging in epoch hooks (#10409) * disable step logging in epoch hooks * chlog * Apply suggestions 
from code review * chlog --- CHANGELOG.md | 2 ++ .../trainer/connectors/logger_connector/fx_validator.py | 8 ++++---- tests/trainer/logging_/test_eval_loop_logging.py | 6 ++++++ tests/trainer/logging_/test_train_loop_logging.py | 6 ++---- 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1c5e5dd8141a7..5e40fdb8d95a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed failure when `DataLoader(batch_size=None)` is passed ([#10345](https://github.com/PyTorchLightning/pytorch-lightning/issues/10345)) - Fixed interception of `__init__` arguments for sub-classed DataLoader re-instantiation in Lite ([#10334](https://github.com/PyTorchLightning/pytorch-lightning/issues/10334)) - Fixed issue with pickling `CSVLogger` after a call to `CSVLogger.save` ([#10388](https://github.com/PyTorchLightning/pytorch-lightning/pull/10388)) +- Fixed an import error being caused by `PostLocalSGD` when `torch.distributed` not available ([#10359](https://github.com/PyTorchLightning/pytorch-lightning/pull/10359)) +- Fixed the logging with `on_step=True` in epoch-level hooks causing unintended side-effects. 
Logging with `on_step=True` in epoch-level hooks will now correctly raise an error ([#10409](https://github.com/PyTorchLightning/pytorch-lightning/pull/10409)) ## [1.5.0] - 2021-11-02 diff --git a/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py b/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py index a928122a2053a..cc91476518565 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py @@ -46,15 +46,15 @@ class _LogOptions(TypedDict): "on_predict_end": None, "on_pretrain_routine_start": None, "on_pretrain_routine_end": None, - "on_train_epoch_start": _LogOptions(on_step=(False, True), on_epoch=(True,)), + "on_train_epoch_start": _LogOptions(on_step=(False,), on_epoch=(True,)), "on_train_epoch_end": _LogOptions(on_step=(False,), on_epoch=(True,)), - "on_validation_epoch_start": _LogOptions(on_step=(False, True), on_epoch=(True,)), + "on_validation_epoch_start": _LogOptions(on_step=(False,), on_epoch=(True,)), "on_validation_epoch_end": _LogOptions(on_step=(False,), on_epoch=(True,)), - "on_test_epoch_start": _LogOptions(on_step=(False, True), on_epoch=(True,)), + "on_test_epoch_start": _LogOptions(on_step=(False,), on_epoch=(True,)), "on_test_epoch_end": _LogOptions(on_step=(False,), on_epoch=(True,)), "on_predict_epoch_start": None, "on_predict_epoch_end": None, - "on_epoch_start": _LogOptions(on_step=(False, True), on_epoch=(True,)), + "on_epoch_start": _LogOptions(on_step=(False,), on_epoch=(True,)), "on_epoch_end": _LogOptions(on_step=(False,), on_epoch=(True,)), "on_batch_start": _LogOptions(on_step=(False, True), on_epoch=(False, True)), "on_batch_end": _LogOptions(on_step=(False, True), on_epoch=(False, True)), diff --git a/tests/trainer/logging_/test_eval_loop_logging.py b/tests/trainer/logging_/test_eval_loop_logging.py index b1b7217c892e5..d0e65e0429bc2 100644 --- a/tests/trainer/logging_/test_eval_loop_logging.py 
+++ b/tests/trainer/logging_/test_eval_loop_logging.py @@ -423,6 +423,12 @@ def make_logging(self, pl_module, func_name, on_steps, on_epochs, prob_bars): def on_test_start(self, _, pl_module): self.make_logging(pl_module, "on_test_start", on_steps=[False], on_epochs=[True], prob_bars=self.choices) + def on_epoch_start(self, trainer, pl_module): + if trainer.testing: + self.make_logging( + pl_module, "on_epoch_start", on_steps=[False], on_epochs=[True], prob_bars=self.choices + ) + def on_test_epoch_start(self, _, pl_module): self.make_logging( pl_module, "on_test_epoch_start", on_steps=[False], on_epochs=[True], prob_bars=self.choices diff --git a/tests/trainer/logging_/test_train_loop_logging.py b/tests/trainer/logging_/test_train_loop_logging.py index 6cad94017177e..5b775b9968d99 100644 --- a/tests/trainer/logging_/test_train_loop_logging.py +++ b/tests/trainer/logging_/test_train_loop_logging.py @@ -272,13 +272,11 @@ def on_train_start(self, _, pl_module): self.make_logging(pl_module, "on_train_start", on_steps=[False], on_epochs=[True], prob_bars=self.choices) def on_epoch_start(self, _, pl_module): - self.make_logging( - pl_module, "on_epoch_start", on_steps=self.choices, on_epochs=[True], prob_bars=self.choices - ) + self.make_logging(pl_module, "on_epoch_start", on_steps=[False], on_epochs=[True], prob_bars=self.choices) def on_train_epoch_start(self, _, pl_module): self.make_logging( - pl_module, "on_train_epoch_start", on_steps=self.choices, on_epochs=[True], prob_bars=self.choices + pl_module, "on_train_epoch_start", on_steps=[False], on_epochs=[True], prob_bars=self.choices ) def on_batch_start(self, _, pl_module, *__): From 3668598196e0fbde25bf9d41ed4c50dea805692c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 9 Nov 2021 15:32:47 +0100 Subject: [PATCH 010/123] Fix deadlocks for distributed training for RichProgressBar (#10428) Co-authored-by: Kaushik Bokka --- CHANGELOG.md | 3 +- .../callbacks/progress/rich_progress.py | 57 
+++++++++++-------- tests/callbacks/test_rich_progress_bar.py | 6 +- 3 files changed, 37 insertions(+), 29 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5e40fdb8d95a1..28e609813e0f2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). -## [1.5.1] - 2021-MM-DD +## [1.5.1] - 2021-11-09 ### Fixed @@ -15,6 +15,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed issue with pickling `CSVLogger` after a call to `CSVLogger.save` ([#10388](https://github.com/PyTorchLightning/pytorch-lightning/pull/10388)) - Fixed an import error being caused by `PostLocalSGD` when `torch.distributed` not available ([#10359](https://github.com/PyTorchLightning/pytorch-lightning/pull/10359)) - Fixed the logging with `on_step=True` in epoch-level hooks causing unintended side-effects. Logging with `on_step=True` in epoch-level hooks will now correctly raise an error ([#10409](https://github.com/PyTorchLightning/pytorch-lightning/pull/10409)) +- Fixed deadlocks for distributed training with `RichProgressBar` ([#10428](https://github.com/PyTorchLightning/pytorch-lightning/pull/10428)) ## [1.5.0] - 2021-11-02 diff --git a/pytorch_lightning/callbacks/progress/rich_progress.py b/pytorch_lightning/callbacks/progress/rich_progress.py index f6f862704f599..ab771992a960b 100644 --- a/pytorch_lightning/callbacks/progress/rich_progress.py +++ b/pytorch_lightning/callbacks/progress/rich_progress.py @@ -129,13 +129,19 @@ def render(self, task) -> RenderableType: class MetricsTextColumn(ProgressColumn): """A column containing text.""" - def __init__(self, trainer, pl_module): + def __init__(self, trainer): self._trainer = trainer - self._pl_module = pl_module self._tasks = {} self._current_task_id = 0 + self._metrics = {} super().__init__() + def update(self, metrics): + # Called when metrics are 
ready to be rendered. + # This is to prevent render from causing deadlock issues by requesting metrics + # in separate threads. + self._metrics = metrics + def render(self, task) -> Text: from pytorch_lightning.trainer.states import TrainerFn @@ -149,14 +155,8 @@ def render(self, task) -> Text: if self._trainer.training and task.id != self._current_task_id: return self._tasks[task.id] _text = "" - # TODO(@daniellepintz): make this code cleaner - progress_bar_callback = getattr(self._trainer, "progress_bar_callback", None) - if progress_bar_callback: - metrics = self._trainer.progress_bar_callback.get_metrics(self._trainer, self._pl_module) - else: - metrics = self._trainer.progress_bar_metrics - - for k, v in metrics.items(): + + for k, v in self._metrics.items(): _text += f"{k}: {round(v, 3) if isinstance(v, float) else v} " return Text(_text, justify="left") @@ -220,9 +220,9 @@ def __init__( self.progress: Optional[Progress] = None self.val_sanity_progress_bar_id: Optional[int] = None self._reset_progress_bar_ids() + self._metric_component = None self._progress_stopped: bool = False self.theme = theme - self._console: Console = Console() @property def refresh_rate_per_second(self) -> float: @@ -263,12 +263,15 @@ def test_description(self) -> str: def predict_description(self) -> str: return "Predicting" - def _init_progress(self, trainer, pl_module): - if self.progress is None or self._progress_stopped: + def _init_progress(self, trainer): + if self.is_enabled and (self.progress is None or self._progress_stopped): self._reset_progress_bar_ids() + self._console: Console = Console() self._console.clear_live() + self._metric_component = MetricsTextColumn(trainer) self.progress = CustomProgress( - *self.configure_columns(trainer, pl_module), + *self.configure_columns(trainer), + self._metric_component, refresh_per_second=self.refresh_rate_per_second, disable=self.is_disabled, console=self._console, @@ -279,19 +282,19 @@ def _init_progress(self, trainer, pl_module): 
def on_train_start(self, trainer, pl_module): super().on_train_start(trainer, pl_module) - self._init_progress(trainer, pl_module) + self._init_progress(trainer) def on_predict_start(self, trainer, pl_module): super().on_predict_start(trainer, pl_module) - self._init_progress(trainer, pl_module) + self._init_progress(trainer) def on_test_start(self, trainer, pl_module): super().on_test_start(trainer, pl_module) - self._init_progress(trainer, pl_module) + self._init_progress(trainer) def on_validation_start(self, trainer, pl_module): super().on_validation_start(trainer, pl_module) - self._init_progress(trainer, pl_module) + self._init_progress(trainer) def __getstate__(self): # can't pickle the rich progress objects @@ -302,12 +305,11 @@ def __getstate__(self): def __setstate__(self, state): self.__dict__ = state - # reset console reference after loading progress - self._console = Console() + state["_console"] = Console() def on_sanity_check_start(self, trainer, pl_module): super().on_sanity_check_start(trainer, pl_module) - self._init_progress(trainer, pl_module) + self._init_progress(trainer) self.val_sanity_progress_bar_id = self._add_task(trainer.num_sanity_val_steps, self.sanity_check_description) def on_sanity_check_end(self, trainer, pl_module): @@ -328,10 +330,10 @@ def on_train_epoch_start(self, trainer, pl_module): train_description = self._get_train_description(trainer.current_epoch) if self.main_progress_bar_id is not None and self._leave: self._stop_progress() - self._init_progress(trainer, pl_module) + self._init_progress(trainer) if self.main_progress_bar_id is None: self.main_progress_bar_id = self._add_task(total_batches, train_description) - else: + elif self.progress is not None: self.progress.reset( self.main_progress_bar_id, total=total_batches, description=train_description, visible=True ) @@ -372,6 +374,7 @@ def on_predict_epoch_start(self, trainer, pl_module): def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx): 
super().on_train_batch_end(trainer, pl_module, outputs, batch, batch_idx) self._update(self.main_progress_bar_id) + self._update_metrics(trainer, pl_module) def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx): super().on_validation_batch_end(trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) @@ -414,6 +417,11 @@ def _reset_progress_bar_ids(self): self.test_progress_bar_id: Optional[int] = None self.predict_progress_bar_id: Optional[int] = None + def _update_metrics(self, trainer, pl_module) -> None: + metrics = self.get_metrics(trainer, pl_module) + if self._metric_component: + self._metric_component.update(metrics) + def teardown(self, trainer, pl_module, stage: Optional[str] = None) -> None: self._stop_progress() @@ -436,7 +444,7 @@ def main_progress_bar(self) -> Task: def test_progress_bar(self) -> Task: return self.progress.tasks[self.test_progress_bar_id] - def configure_columns(self, trainer, pl_module) -> list: + def configure_columns(self, trainer) -> list: return [ TextColumn("[progress.description]{task.description}"), CustomBarColumn( @@ -447,5 +455,4 @@ def configure_columns(self, trainer, pl_module) -> list: BatchesProcessedColumn(style=self.theme.batch_process), CustomTimeColumn(style=self.theme.time), ProcessingSpeedColumn(style=self.theme.processing_speed), - MetricsTextColumn(trainer, pl_module), ] diff --git a/tests/callbacks/test_rich_progress_bar.py b/tests/callbacks/test_rich_progress_bar.py index 6c0a201c794c3..31681754423a8 100644 --- a/tests/callbacks/test_rich_progress_bar.py +++ b/tests/callbacks/test_rich_progress_bar.py @@ -150,15 +150,15 @@ def test_rich_progress_bar_configure_columns(): custom_column = TextColumn("[progress.description]Testing Rich!") class CustomRichProgressBar(RichProgressBar): - def configure_columns(self, trainer, pl_module): + def configure_columns(self, trainer): return [custom_column] progress_bar = CustomRichProgressBar() - 
progress_bar._init_progress(Mock(), Mock()) + progress_bar._init_progress(Mock()) assert progress_bar.progress.columns[0] == custom_column - assert len(progress_bar.progress.columns) == 1 + assert len(progress_bar.progress.columns) == 2 @RunIf(rich=True) From 53451c522ed0a280b301b4f2ff21834547527a8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 9 Nov 2021 15:33:48 +0100 Subject: [PATCH 011/123] Fix converting only float type tensors in Lite (#10429) * fix * less code * add test case * add test cases * update input * add test cases * add type hint * add changelog note Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> --- CHANGELOG.md | 1 + pytorch_lightning/lite/wrappers.py | 9 +++++++-- tests/lite/test_wrappers.py | 11 ++++++++--- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 28e609813e0f2..22615f329dbfd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed an import error being caused by `PostLocalSGD` when `torch.distributed` not available ([#10359](https://github.com/PyTorchLightning/pytorch-lightning/pull/10359)) - Fixed the logging with `on_step=True` in epoch-level hooks causing unintended side-effects. 
Logging with `on_step=True` in epoch-level hooks will now correctly raise an error ([#10409](https://github.com/PyTorchLightning/pytorch-lightning/pull/10409)) - Fixed deadlocks for distributed training with `RichProgressBar` ([#10428](https://github.com/PyTorchLightning/pytorch-lightning/pull/10428)) +- Fixed an issue where the model wrapper in Lite converted non-floating point tensors to float ([#10429](https://github.com/PyTorchLightning/pytorch-lightning/pull/10429)) ## [1.5.0] - 2021-11-02 diff --git a/pytorch_lightning/lite/wrappers.py b/pytorch_lightning/lite/wrappers.py index 881a663fdb9e5..615f461055204 100644 --- a/pytorch_lightning/lite/wrappers.py +++ b/pytorch_lightning/lite/wrappers.py @@ -95,12 +95,17 @@ def forward(self, *args: Any, **kwargs: Any) -> Any: } # TODO (@awaelchli): let the precision plugin handle the conversion to_type = precision_to_type[precision] - args, kwargs = apply_to_collection([args, kwargs], function=lambda t: t.to(to_type), dtype=Tensor) + + def _convert_float_tensor(t: Tensor) -> Tensor: + return t.to(to_type) if torch.is_floating_point(t) else t + + args, kwargs = apply_to_collection([args, kwargs], function=_convert_float_tensor, dtype=Tensor) with self._precision_plugin.forward_context(): output = self.module(*args, **kwargs) - output = apply_to_collection(output, function=lambda t: t.to(torch.get_default_dtype()), dtype=Tensor) + to_type = torch.get_default_dtype() + output = apply_to_collection(output, function=_convert_float_tensor, dtype=Tensor) return output diff --git a/tests/lite/test_wrappers.py b/tests/lite/test_wrappers.py index 6741bf59b4dca..4993a10c8dbc2 100644 --- a/tests/lite/test_wrappers.py +++ b/tests/lite/test_wrappers.py @@ -40,8 +40,13 @@ def test_lite_module_wraps(): (32, torch.float16, torch.float32), (32, torch.float32, torch.float32), (32, torch.float64, torch.float32), + (32, torch.int, torch.int), (16, torch.float32, torch.float16), (16, torch.float64, torch.float16), + (16, torch.long, 
torch.long), + pytest.param("bf16", torch.float32, torch.bfloat16, marks=RunIf(min_torch="1.10")), + pytest.param("bf16", torch.float64, torch.bfloat16, marks=RunIf(min_torch="1.10")), + pytest.param("bf16", torch.bool, torch.bool, marks=RunIf(min_torch="1.10")), ], ) def test_lite_module_forward_conversion(precision, input_type, expected_type): @@ -53,11 +58,11 @@ def check_autocast(forward_input): assert precision != 16 or torch.is_autocast_enabled() return forward_input - module = Mock(wraps=torch.nn.Linear(1, 1), side_effect=check_autocast) + module = Mock(wraps=torch.nn.Identity(), side_effect=check_autocast) lite_module = _LiteModule(module, lite._precision_plugin).to(device) - out = lite_module(torch.rand(1, dtype=input_type, device=device)) + out = lite_module(torch.tensor([1, 2, 3], dtype=input_type, device=device)) assert module.call_args[0][0].dtype == expected_type - assert out.dtype == torch.get_default_dtype() + assert out.dtype == input_type or out.dtype == torch.get_default_dtype() def test_lite_dataloader_iterator(): From 69ea0b024e0ebd8ae95a342b8d158ff561584f84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 9 Nov 2021 16:35:47 +0100 Subject: [PATCH 012/123] [Fault Tolerance] Don't check the len of a dataset, but its instance. (#10432) Co-authored-by: Thomas Chaton --- CHANGELOG.md | 1 + pytorch_lightning/trainer/data_loading.py | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 22615f329dbfd..c96074a6f640f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed the logging with `on_step=True` in epoch-level hooks causing unintended side-effects. 
Logging with `on_step=True` in epoch-level hooks will now correctly raise an error ([#10409](https://github.com/PyTorchLightning/pytorch-lightning/pull/10409)) - Fixed deadlocks for distributed training with `RichProgressBar` ([#10428](https://github.com/PyTorchLightning/pytorch-lightning/pull/10428)) - Fixed an issue where the model wrapper in Lite converted non-floating point tensors to float ([#10429](https://github.com/PyTorchLightning/pytorch-lightning/pull/10429)) +- Fixed an issue with inferring the dataset type in fault-tolerant training ([#10432](https://github.com/PyTorchLightning/pytorch-lightning/pull/10432)) ## [1.5.0] - 2021-11-02 diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index c41d80b903d4e..0f1eccc3f7cbb 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -37,7 +37,7 @@ CaptureMapDataset, FastForwardSampler, ) -from pytorch_lightning.utilities.data import has_iterable_dataset, has_len_all_ranks +from pytorch_lightning.utilities.data import get_len, has_iterable_dataset, has_len_all_ranks from pytorch_lightning.utilities.enums import DistributedType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _fault_tolerant_training @@ -282,10 +282,11 @@ def _get_dataloader_init_kwargs( dl_kwargs["sampler"] = None if _fault_tolerant_training(): - if isinstance(dl_kwargs["dataset"], IterableDataset): + dataset = dl_kwargs["dataset"] + if isinstance(dataset, IterableDataset): # wrap the `IterableDataset` into a `CaptureIterableDataset` to record sampler states. 
dl_kwargs["dataset"] = CaptureIterableDataset(dataset=dl_kwargs["dataset"]) - elif len(dl_kwargs["dataset"]): + elif get_len(dataset) != float("inf"): dl_kwargs["dataset"] = CaptureMapDataset(dataset=dl_kwargs["dataset"]) else: raise MisconfigurationException( From 51cb0774e377837f644368e392d241fc3ff18326 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 9 Nov 2021 16:37:44 +0100 Subject: [PATCH 013/123] Resolve workers being forcibly deleted with `persistent_workers=True` (#10434) Co-authored-by: Thomas Chaton --- CHANGELOG.md | 1 + pytorch_lightning/utilities/fetching.py | 6 +++--- tests/loops/test_loops.py | 18 ++++++++++++------ 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c96074a6f640f..45f5efcc56216 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed deadlocks for distributed training with `RichProgressBar` ([#10428](https://github.com/PyTorchLightning/pytorch-lightning/pull/10428)) - Fixed an issue where the model wrapper in Lite converted non-floating point tensors to float ([#10429](https://github.com/PyTorchLightning/pytorch-lightning/pull/10429)) - Fixed an issue with inferring the dataset type in fault-tolerant training ([#10432](https://github.com/PyTorchLightning/pytorch-lightning/pull/10432)) +- Fixed dataloader workers with `persistent_workers` being deleted on every iteration ([#10434](https://github.com/PyTorchLightning/pytorch-lightning/pull/10434)) ## [1.5.0] - 2021-11-02 diff --git a/pytorch_lightning/utilities/fetching.py b/pytorch_lightning/utilities/fetching.py index fd9baf3e9c4f1..9b80d2f9874c7 100644 --- a/pytorch_lightning/utilities/fetching.py +++ b/pytorch_lightning/utilities/fetching.py @@ -206,15 +206,15 @@ def reset(self) -> None: self.batches: List = [] self.fetched: int = 0 self.done: bool = False + + def teardown(self) -> None: + self.reset() if 
isinstance(self.dataloader, CombinedLoader): self.dataloader.reset() if isinstance(self.dataloader, DataLoader): CombinedLoader._shutdown_workers_and_reset_iterator(self.dataloader) self.dataloader_iter = None - def teardown(self) -> None: - self.reset() - class DataFetcher(AbstractDataFetcher): diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index dd390ab4939d5..bad9a717d1629 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -912,21 +912,25 @@ def val_dataloader(self): @RunIf(min_torch="1.8.0") -@pytest.mark.parametrize("persistent_workers", (True, False)) +@pytest.mark.parametrize("persistent_workers", (False, True)) def test_workers_are_shutdown(tmpdir, persistent_workers): # `num_workers == 1` uses `_MultiProcessingDataLoaderIter` # `persistent_workers` makes sure `self._iterator` gets set on the `DataLoader` instance class _TestMultiProcessingDataLoaderIter(_MultiProcessingDataLoaderIter): - def __init__(self, *args, dataloader: DataLoader, **kwargs): + def __init__(self, *args, dataloader, **kwargs): super().__init__(*args, **kwargs) self.dataloader = dataloader def _shutdown_workers(self): - setattr(self.dataloader, "has_shutdown_workers", True) + self.dataloader.count_shutdown_workers += 1 super()._shutdown_workers() class TestDataLoader(DataLoader): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.count_shutdown_workers = 0 + def _get_iterator(self): if self.num_workers == 0: return super()._get_iterator() @@ -937,10 +941,12 @@ def _get_iterator(self): train_dataloader = TestDataLoader(RandomDataset(32, 64), num_workers=1, persistent_workers=persistent_workers) val_dataloader = TestDataLoader(RandomDataset(32, 64), num_workers=1, persistent_workers=persistent_workers) + max_epochs = 3 model = BoringModel() - trainer = Trainer(default_root_dir=tmpdir, limit_train_batches=2, limit_val_batches=2, max_epochs=2) + trainer = Trainer(default_root_dir=tmpdir, limit_train_batches=2, 
limit_val_batches=2, max_epochs=max_epochs) trainer.fit(model, train_dataloader, val_dataloader) - assert train_dataloader.has_shutdown_workers - assert val_dataloader.has_shutdown_workers + assert train_dataloader.count_shutdown_workers == (2 if persistent_workers else max_epochs) + # on sanity checking end, the workers are being deleted too. + assert val_dataloader.count_shutdown_workers == (2 if persistent_workers else max_epochs + 1) assert train_dataloader._iterator is None assert val_dataloader._iterator is None From 6d1ee3b2b38a6547c953a66c62f8b60f6ab8daf6 Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Tue, 9 Nov 2021 17:24:49 +0100 Subject: [PATCH 014/123] Update csv_logs.py --- pytorch_lightning/loggers/csv_logs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_lightning/loggers/csv_logs.py b/pytorch_lightning/loggers/csv_logs.py index 454a17905c529..fa463f3a2cb49 100644 --- a/pytorch_lightning/loggers/csv_logs.py +++ b/pytorch_lightning/loggers/csv_logs.py @@ -95,6 +95,8 @@ def save(self) -> None: metrics_keys = list(last_m.keys()) with open(self.metrics_file_path, "w", newline="") as f: + # Don't assign the writer to self. 
+ # Keeps an open reference and prevents pickling otherwise writer = csv.DictWriter(f, fieldnames=metrics_keys) writer.writeheader() writer.writerows(self.metrics) From 4755ae60b71a857a17f39dba1189c2c3245660f5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 9 Nov 2021 16:26:00 +0000 Subject: [PATCH 015/123] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pytorch_lightning/loggers/csv_logs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/loggers/csv_logs.py b/pytorch_lightning/loggers/csv_logs.py index fa463f3a2cb49..cd513bb30012b 100644 --- a/pytorch_lightning/loggers/csv_logs.py +++ b/pytorch_lightning/loggers/csv_logs.py @@ -95,7 +95,7 @@ def save(self) -> None: metrics_keys = list(last_m.keys()) with open(self.metrics_file_path, "w", newline="") as f: - # Don't assign the writer to self. + # Don't assign the writer to self. # Keeps an open reference and prevents pickling otherwise writer = csv.DictWriter(f, fieldnames=metrics_keys) writer.writeheader() From 43a50756a557078733274e3260523c1241eed244 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 10 Nov 2021 22:09:52 +0100 Subject: [PATCH 016/123] update version --- CHANGELOG.md | 7 +++++++ pytorch_lightning/__about__.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 45f5efcc56216..4357b58cb5c5b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
+## [1.5.2] - 2021-11-16 + +### Fixed + +- + + ## [1.5.1] - 2021-11-09 ### Fixed diff --git a/pytorch_lightning/__about__.py b/pytorch_lightning/__about__.py index e1bdee9b7320b..bc9aefd1d2dad 100644 --- a/pytorch_lightning/__about__.py +++ b/pytorch_lightning/__about__.py @@ -1,7 +1,7 @@ import time _this_year = time.strftime("%Y") -__version__ = "1.5.1" +__version__ = "1.5.2" __author__ = "William Falcon et al." __author_email__ = "waf2107@columbia.edu" __license__ = "Apache-2.0" From ab44b812631c5adb1aa7f09af0c980e3b80807b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 10 Nov 2021 22:18:29 +0100 Subject: [PATCH 017/123] Enable distributed training with CombinedDataLoader and max_size_cycle (#10374) Co-authored-by: Carlos Mocholi Co-authored-by: Thomas Chaton --- CHANGELOG.md | 3 +- pytorch_lightning/trainer/data_loading.py | 18 ++++++-- pytorch_lightning/trainer/supporters.py | 20 +++++++-- tests/trainer/test_supporters.py | 55 +++++++++++++++++++++++ 4 files changed, 89 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4357b58cb5c5b..c93d1618eb088 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Fixed -- +- Fixed `CombinedLoader` and `max_size_cycle` didn't receive a `DistributedSampler` ([#10374](https://github.com/PyTorchLightning/pytorch-lightning/issues/10374)) + ## [1.5.1] - 2021-11-09 diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 0f1eccc3f7cbb..37a234f32f711 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -28,7 +28,7 @@ from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper, UnrepeatedDistributedSampler from pytorch_lightning.trainer.connectors.accelerator_connector import AcceleratorConnector from pytorch_lightning.trainer.states import RunningStage -from pytorch_lightning.trainer.supporters import CombinedLoader +from pytorch_lightning.trainer.supporters import CombinedLoader, CycleIterator from pytorch_lightning.utilities import rank_zero_warn from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.auto_restart import ( @@ -136,14 +136,22 @@ def prepare_dataloader(self, dataloader: Any, shuffle: bool, mode: Optional[Runn if isinstance(dataloader, CombinedLoader): # apply `prepare_dataloader` on all the collection of loaders dataloader.loaders = apply_to_collection( - dataloader.loaders, DataLoader, self.prepare_dataloader, shuffle, mode=mode + dataloader.loaders, (DataLoader, CycleIterator), self.prepare_dataloader, shuffle, mode=mode ) + # the length need to recomputed across all dataloaders in case of special behavior. 
+ dataloader._apply_cycle_iterator_length() return dataloader # don't do anything if it's not a dataloader - if not isinstance(dataloader, DataLoader): + if not isinstance(dataloader, (DataLoader, CycleIterator)): return dataloader + cycle_iterator: Optional[CycleIterator] = None + + if isinstance(dataloader, CycleIterator): + cycle_iterator = dataloader + dataloader = dataloader.loader + if ( _fault_tolerant_training() # injects components to track the state or self._requires_distributed_sampler(dataloader) # sets the distributed sampler @@ -153,6 +161,10 @@ def prepare_dataloader(self, dataloader: Any, shuffle: bool, mode: Optional[Runn sampler = self._resolve_sampler(dataloader, shuffle=shuffle, mode=mode) dataloader = self._update_dataloader(dataloader, sampler, mode=mode) + if cycle_iterator is not None: + cycle_iterator.loader = dataloader + return cycle_iterator + return dataloader def _resolve_sampler(self, dataloader: DataLoader, shuffle: bool, mode: Optional[RunningStage] = None) -> Sampler: diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index 816f4da38f5b9..6e2e51e82bbf1 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -457,6 +457,19 @@ def _wrap_loaders_max_size_cycle(self) -> Any: ) state.reset() + def _apply_cycle_iterator_length(self) -> None: + """When the model is `max_size_cycle`, compute the length across all ``CycleIterator`` and re-assign it to + all dataloaders.""" + if self.mode != "max_size_cycle": + return + + def set_len(cycle_iterator: CycleIterator, length: int) -> None: + cycle_iterator.length = length + + all_lengths = apply_to_collection(self.loaders, CycleIterator, lambda c: get_len(c.loader)) + max_length = _nested_calc_num_data(all_lengths, max) + apply_to_collection(self.loaders, CycleIterator, set_len, length=max_length) + def __iter__(self) -> Any: """Create and return an iterator, `CombinedLoaderIterator`, for the combined 
loader.""" @@ -473,11 +486,12 @@ def __getstate__patch__(*_): return iterator @staticmethod - def _calc_num_batches(loaders: Any) -> Union[int, float]: + def _calc_num_batches(loaders: Any, mode="min_size") -> Union[int, float]: """Compute the length (aka the number of batches) of `CombinedLoader`. Args: loaders: a collections of loaders. + mode: Mode used by the CombinedDataloader Returns: length: the minimum length of loaders @@ -486,10 +500,10 @@ def _calc_num_batches(loaders: Any) -> Union[int, float]: if isinstance(all_lengths, (int, float)): return all_lengths - return _nested_calc_num_data(all_lengths, min) + return _nested_calc_num_data(all_lengths, max if mode == "max_size_cycle" else min) def __len__(self) -> int: - return self._calc_num_batches(self.loaders) + return self._calc_num_batches(self.loaders, mode=self.mode) @staticmethod def _shutdown_workers_and_reset_iterator(dataloader) -> None: diff --git a/tests/trainer/test_supporters.py b/tests/trainer/test_supporters.py index 204f3079f544b..e4598550c24fb 100644 --- a/tests/trainer/test_supporters.py +++ b/tests/trainer/test_supporters.py @@ -33,8 +33,10 @@ ) from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.auto_restart import CaptureMapDataset, FastForwardSampler +from pytorch_lightning.utilities.data import get_len from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_7 +from tests.helpers.boring_model import RandomDataset def test_tensor_running_accum_reset(): @@ -379,3 +381,56 @@ def _assert_dataset(loader): assert isinstance(d, CustomDataset) apply_to_collection(dataloader.loaders, DataLoader, _assert_dataset) + + +@pytest.mark.parametrize("replace_sampler_ddp", [False, True]) +def test_combined_data_loader_with_max_size_cycle_and_ddp(replace_sampler_ddp, tmpdir): + """This test makes sure distributed sampler has been properly injected in dataloaders 
when using CombinedLoader + with ddp and `max_size_cycle` mode.""" + trainer = Trainer(strategy="ddp", accelerator="auto", devices=2, replace_sampler_ddp=replace_sampler_ddp) + + dataloader = CombinedLoader( + {"a": DataLoader(RandomDataset(32, 8), batch_size=1), "b": DataLoader(RandomDataset(32, 8), batch_size=1)}, + ) + dataloader = trainer.prepare_dataloader(dataloader, shuffle=False) + assert len(dataloader) == 4 if replace_sampler_ddp else 8 + + for a_length in [6, 8, 10]: + dataloader = CombinedLoader( + { + "a": DataLoader(range(a_length), batch_size=1), + "b": DataLoader(range(8), batch_size=1), + }, + mode="max_size_cycle", + ) + + length = max(a_length, 8) + assert len(dataloader) == length + dataloader = trainer.prepare_dataloader(dataloader, shuffle=False) + assert len(dataloader) == length // 2 if replace_sampler_ddp else length + if replace_sampler_ddp: + last_batch = list(dataloader)[-1] + if a_length == 6: + assert last_batch == {"a": torch.tensor([0]), "b": torch.tensor([6])} + elif a_length == 8: + assert last_batch == {"a": torch.tensor([6]), "b": torch.tensor([6])} + elif a_length == 10: + assert last_batch == {"a": torch.tensor([8]), "b": torch.tensor([0])} + + class InfiniteDataset(IterableDataset): + def __iter__(self): + while True: + yield 1 + + dataloader = CombinedLoader( + { + "a": DataLoader(InfiniteDataset(), batch_size=1), + "b": DataLoader(range(8), batch_size=1), + }, + mode="max_size_cycle", + ) + assert get_len(dataloader) == float("inf") + assert len(dataloader.loaders["b"].loader) == 8 + dataloader = trainer.prepare_dataloader(dataloader, shuffle=False) + assert len(dataloader.loaders["b"].loader) == 4 if replace_sampler_ddp else 8 + assert get_len(dataloader) == float("inf") From 4bc6e959d0be798b00ea8ccac96e9c8e2f9616a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 10 Nov 2021 22:20:59 +0100 Subject: [PATCH 018/123] Fix support for dataclasses with ClassVar/InitVar in `apply_to_collection` (#9702) 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli Co-authored-by: Carlos Mocholi --- CHANGELOG.md | 2 +- .../connectors/logger_connector/result.py | 26 ++- pytorch_lightning/utilities/apply_func.py | 25 ++- tests/core/test_results.py | 2 +- tests/models/test_tpu.py | 2 +- tests/utilities/test_apply_func.py | 158 +++++++++++++----- 6 files changed, 159 insertions(+), 56 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c93d1618eb088..8b18ac8db873a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed - Fixed `CombinedLoader` and `max_size_cycle` didn't receive a `DistributedSampler` ([#10374](https://github.com/PyTorchLightning/pytorch-lightning/issues/10374)) - +- Fixed an issue where class or init-only variables of dataclasses were passed to the dataclass constructor in `utilities.apply_to_collection` ([#9702](https://github.com/PyTorchLightning/pytorch-lightning/issues/9702)) ## [1.5.1] - 2021-11-09 diff --git a/pytorch_lightning/trainer/connectors/logger_connector/result.py b/pytorch_lightning/trainer/connectors/logger_connector/result.py index f798cf3ee2b82..53034ac77db3f 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -51,8 +51,8 @@ class _Sync: fn: Optional[Callable] = None _should: bool = False rank_zero_only: bool = False - op: Optional[str] = None - group: Optional[Any] = None + _op: Optional[str] = None + _group: Optional[Any] = None def __post_init__(self) -> None: self._generate_sync_fn() @@ -67,6 +67,26 @@ def should(self, should: bool) -> None: # `self._fn` needs to be re-generated. 
self._generate_sync_fn() + @property + def op(self) -> Optional[str]: + return self._op + + @op.setter + def op(self, op: Optional[str]) -> None: + self._op = op + # `self._fn` needs to be re-generated. + self._generate_sync_fn() + + @property + def group(self) -> Optional[Any]: + return self._group + + @group.setter + def group(self, group: Optional[Any]) -> None: + self._group = group + # `self._fn` needs to be re-generated. + self._generate_sync_fn() + def _generate_sync_fn(self) -> None: """Used to compute the syncing function and cache it.""" fn = self.no_op if self.fn is None or not self.should or self.rank_zero_only else self.fn @@ -426,7 +446,7 @@ def log( dataloader_idx=dataloader_idx, metric_attribute=metric_attribute, ) - meta.sync = _Sync(_should=sync_dist, fn=sync_dist_fn, group=sync_dist_group, rank_zero_only=rank_zero_only) + meta.sync = _Sync(_should=sync_dist, fn=sync_dist_fn, _group=sync_dist_group, rank_zero_only=rank_zero_only) # register logged value if it doesn't exist if key not in self: diff --git a/pytorch_lightning/utilities/apply_func.py b/pytorch_lightning/utilities/apply_func.py index 1e981a0f543e7..5a76f402bcc02 100644 --- a/pytorch_lightning/utilities/apply_func.py +++ b/pytorch_lightning/utilities/apply_func.py @@ -16,7 +16,7 @@ from abc import ABC from collections import defaultdict, OrderedDict from collections.abc import Mapping, Sequence -from copy import copy +from copy import copy, deepcopy from functools import partial from typing import Any, Callable, List, Optional, Tuple, Union @@ -119,11 +119,21 @@ def apply_to_collection( return elem_type(*out) if is_namedtuple else elem_type(out) if _is_dataclass_instance(data): - out_dict = {} + # make a deepcopy of the data, + # but do not deepcopy mapped fields since the computation would + # be wasted on values that likely get immediately overwritten + fields = {} + memo = {} for field in dataclasses.fields(data): - if field.init: + field_value = getattr(data, field.name) + 
fields[field.name] = (field_value, field.init) + memo[id(field_value)] = field_value + result = deepcopy(data, memo=memo) + # apply function to each field + for field_name, (field_value, field_init) in fields.items(): + if field_init: v = apply_to_collection( - getattr(data, field.name), + field_value, dtype, function, *args, @@ -131,9 +141,10 @@ def apply_to_collection( include_none=include_none, **kwargs, ) - if include_none or v is not None: - out_dict[field.name] = v - return elem_type(**out_dict) + if not field_init or (not include_none and v is None): # retain old value + v = getattr(data, field_name) + setattr(result, field_name, v) + return result # data is neither of dtype, nor a collection return data diff --git a/tests/core/test_results.py b/tests/core/test_results.py index 0e62441b1d40e..a39ce51788ff9 100644 --- a/tests/core/test_results.py +++ b/tests/core/test_results.py @@ -33,7 +33,7 @@ def _setup_ddp(rank, worldsize): def _ddp_test_fn(rank, worldsize): _setup_ddp(rank, worldsize) tensor = torch.tensor([1.0]) - sync = _Sync(sync_ddp_if_available, _should=True, op="SUM") + sync = _Sync(sync_ddp_if_available, _should=True, _op="SUM") actual = sync(tensor) assert actual.item() == dist.get_world_size(), "Result-Log does not work properly with DDP and Tensors" diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index d8ceb4106fd07..31ebd3968ff3e 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -407,7 +407,7 @@ def test_tpu_sync_dist(): """Test tpu spawn sync dist operation.""" def test_sync_dist(_): - sync = _Sync(TPUSpawnPlugin().reduce, should=True, op=torch.distributed.ReduceOp.SUM) + sync = _Sync(TPUSpawnPlugin().reduce, should=True, _op=torch.distributed.ReduceOp.SUM) value = torch.tensor([1.0]) value = (sync(value),) assert value.item() == 8 diff --git a/tests/utilities/test_apply_func.py b/tests/utilities/test_apply_func.py index da309f7d22b50..9b0fcbd643744 100644 --- a/tests/utilities/test_apply_func.py +++ 
b/tests/utilities/test_apply_func.py @@ -14,7 +14,8 @@ import dataclasses import numbers from collections import defaultdict, namedtuple, OrderedDict -from typing import List +from dataclasses import InitVar +from typing import Any, ClassVar, List, Optional import numpy as np import pytest @@ -31,6 +32,12 @@ class Feature: input_ids: torch.Tensor segment_ids: np.ndarray + def __eq__(self, o: object) -> bool: + if not isinstance(o, Feature): + return NotImplemented + else: + return torch.equal(self.input_ids, o.input_ids) and np.equal(self.segment_ids, o.segment_ids).all() + @dataclasses.dataclass class ModelExample: example_ids: List[str] @@ -41,6 +48,71 @@ class ModelExample: def __post_init__(self): self.some_constant = 7 + def __eq__(self, o: object) -> bool: + if not isinstance(o, ModelExample): + return NotImplemented + else: + return ( + self.example_ids == o.example_ids + and self.feature == o.feature + and torch.equal(self.label, o.label) + and self.some_constant == o.some_constant + ) + + @dataclasses.dataclass + class WithClassVar: + class_var: ClassVar[int] = 0 + dummy: Any + + def __eq__(self, o: object) -> bool: + if not isinstance(o, WithClassVar): + return NotImplemented + elif isinstance(self.dummy, torch.Tensor): + return torch.equal(self.dummy, o.dummy) + else: + return self.dummy == o.dummy + + @dataclasses.dataclass + class WithInitVar: + dummy: Any + override: InitVar[Optional[Any]] = None + + def __post_init__(self, override: Optional[Any]): + if override is not None: + self.dummy = override + + def __eq__(self, o: object) -> bool: + if not isinstance(o, WithInitVar): + return NotImplemented + elif isinstance(self.dummy, torch.Tensor): + return torch.equal(self.dummy, o.dummy) + else: + return self.dummy == o.dummy + + @dataclasses.dataclass + class WithClassAndInitVar: + class_var: ClassVar[torch.Tensor] = torch.tensor(0) + dummy: Any + override: InitVar[Optional[Any]] = torch.tensor(1) + + def __post_init__(self, override: Optional[Any]): + 
if override is not None: + self.dummy = override + + def __eq__(self, o: object) -> bool: + if not isinstance(o, WithClassAndInitVar): + return NotImplemented + elif isinstance(self.dummy, torch.Tensor): + return torch.equal(self.dummy, o.dummy) + else: + return self.dummy == o.dummy + + model_example = ModelExample( + example_ids=["i-1", "i-2", "i-3"], + feature=Feature(input_ids=torch.tensor([1.0, 2.0, 3.0]), segment_ids=np.array([4.0, 5.0, 6.0])), + label=torch.tensor([7.0, 8.0, 9.0]), + ) + to_reduce = { "a": torch.tensor([1.0]), # Tensor "b": [torch.tensor([2.0])], # list @@ -50,13 +122,18 @@ def __post_init__(self): "f": "this_is_a_dummy_str", # string "g": 12.0, # number "h": Feature(input_ids=torch.tensor([1.0, 2.0, 3.0]), segment_ids=np.array([4.0, 5.0, 6.0])), # dataclass - "i": ModelExample( - example_ids=["i-1", "i-2", "i-3"], - feature=Feature(input_ids=torch.tensor([1.0, 2.0, 3.0]), segment_ids=np.array([4.0, 5.0, 6.0])), - label=torch.tensor([7.0, 8.0, 9.0]), - ), # nested dataclass + "i": model_example, # nested dataclass + "j": WithClassVar(torch.arange(3)), # dataclass with class variable + "k": WithInitVar("this_gets_overridden", torch.tensor([2.0])), # dataclass with init-only variable + "l": WithClassAndInitVar(model_example, None), # nested dataclass with class and init-only variables } + model_example_result = ModelExample( + example_ids=["i-1", "i-2", "i-3"], + feature=Feature(input_ids=torch.tensor([2.0, 4.0, 6.0]), segment_ids=np.array([8.0, 10.0, 12.0])), + label=torch.tensor([14.0, 16.0, 18.0]), + ) + expected_result = { "a": torch.tensor([2.0]), "b": [torch.tensor([4.0])], @@ -66,32 +143,31 @@ def __post_init__(self): "f": "this_is_a_dummy_str", "g": 24.0, "h": Feature(input_ids=torch.tensor([2.0, 4.0, 6.0]), segment_ids=np.array([8.0, 10.0, 12.0])), - "i": ModelExample( - example_ids=["i-1", "i-2", "i-3"], - feature=Feature(input_ids=torch.tensor([2.0, 4.0, 6.0]), segment_ids=np.array([8.0, 10.0, 12.0])), - label=torch.tensor([14.0, 
16.0, 18.0]), - ), + "i": model_example_result, + "j": WithClassVar(torch.arange(0, 6, 2)), + "k": WithInitVar(torch.tensor([4.0])), + "l": WithClassAndInitVar(model_example_result, None), } reduced = apply_to_collection(to_reduce, (torch.Tensor, numbers.Number, np.ndarray), lambda x: x * 2) - assert isinstance(reduced, dict), " Type Consistency of dict not preserved" + assert isinstance(reduced, dict), "Type Consistency of dict not preserved" assert all(x in reduced for x in to_reduce), "Not all entries of the dict were preserved" assert all( isinstance(reduced[k], type(expected_result[k])) for k in to_reduce ), "At least one type was not correctly preserved" assert isinstance(reduced["a"], torch.Tensor), "Reduction Result of a Tensor should be a Tensor" - assert torch.allclose(expected_result["a"], reduced["a"]), "Reduction of a tensor does not yield the expected value" + assert torch.equal(expected_result["a"], reduced["a"]), "Reduction of a tensor does not yield the expected value" assert isinstance(reduced["b"], list), "Reduction Result of a list should be a list" assert all( - torch.allclose(x, y) for x, y in zip(reduced["b"], expected_result["b"]) + torch.equal(x, y) for x, y in zip(reduced["b"], expected_result["b"]) ), "At least one value of list reduction did not come out as expected" assert isinstance(reduced["c"], tuple), "Reduction Result of a tuple should be a tuple" assert all( - torch.allclose(x, y) for x, y in zip(reduced["c"], expected_result["c"]) + torch.equal(x, y) for x, y in zip(reduced["c"], expected_result["c"]) ), "At least one value of tuple reduction did not come out as expected" assert isinstance(reduced["d"], ntc), "Type Consistency for named tuple not given" @@ -109,34 +185,30 @@ def __post_init__(self): assert isinstance(reduced["g"], numbers.Number), "Reduction of a number should result in a number" assert reduced["g"] == expected_result["g"], "Reduction of a number did not yield the desired result" - assert 
dataclasses.is_dataclass(reduced["h"]) and not isinstance( - reduced["h"], type - ), "Reduction of a dataclass should result in a dataclass" - assert torch.allclose( - reduced["h"].input_ids, expected_result["h"].input_ids - ), "Reduction of a dataclass did not yield the desired result" - assert np.allclose( - reduced["h"].segment_ids, expected_result["h"].segment_ids - ), "Reduction of a dataclass did not yield the desired result" - - assert dataclasses.is_dataclass(reduced["i"]) and not isinstance( - reduced["i"], type - ), "Reduction of a dataclass should result in a dataclass" - assert dataclasses.is_dataclass(reduced["i"].feature) and not isinstance( - reduced["i"].feature, type - ), "Reduction of a nested dataclass should result in a nested dataclass" - assert ( - reduced["i"].example_ids == expected_result["i"].example_ids - ), "Reduction of a nested dataclass did not yield the desired result" - assert torch.allclose( - reduced["i"].label, expected_result["i"].label - ), "Reduction of a nested dataclass did not yield the desired result" - assert torch.allclose( - reduced["i"].feature.input_ids, expected_result["i"].feature.input_ids - ), "Reduction of a nested dataclass did not yield the desired result" - assert np.allclose( - reduced["i"].feature.segment_ids, expected_result["i"].feature.segment_ids - ), "Reduction of a nested dataclass did not yield the desired result" + def _assert_dataclass_reduction(actual, expected, dataclass_type: str = ""): + assert dataclasses.is_dataclass(actual) and not isinstance( + actual, type + ), f"Reduction of a {dataclass_type} dataclass should result in a dataclass" + for field in dataclasses.fields(actual): + if dataclasses.is_dataclass(field.type): + _assert_dataclass_reduction(getattr(actual, field.name), getattr(expected, field.name), "nested") + assert actual == expected, f"Reduction of a {dataclass_type} dataclass did not yield the desired result" + + _assert_dataclass_reduction(reduced["h"], expected_result["h"]) + 
+ _assert_dataclass_reduction(reduced["i"], expected_result["i"]) + + dataclass_type = "ClassVar-containing" + _assert_dataclass_reduction(reduced["j"], expected_result["j"], dataclass_type) + assert WithClassVar.class_var == 0, f"Reduction of a {dataclass_type} dataclass should not change the class var" + + _assert_dataclass_reduction(reduced["k"], expected_result["k"], "InitVar-containing") + + dataclass_type = "Class-and-InitVar-containing" + _assert_dataclass_reduction(reduced["l"], expected_result["l"], dataclass_type) + assert torch.equal( + WithClassAndInitVar.class_var, torch.tensor(0) + ), f"Reduction of a {dataclass_type} dataclass should not change the class var" # mapping support reduced = apply_to_collection({"a": 1, "b": 2}, int, lambda x: str(x)) From 6baa5ccec311e968800f671223dcd37a1fd35b9b Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Tue, 16 Nov 2021 04:01:45 +0530 Subject: [PATCH 019/123] fix overfit_batch sampler replacement logic (#10486) Co-authored-by: thomas chaton --- CHANGELOG.md | 20 +++++++ pytorch_lightning/trainer/data_loading.py | 18 +++--- tests/trainer/flags/test_overfit_batches.py | 63 +++++++++++++++++---- 3 files changed, 82 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8b18ac8db873a..b31453147e617 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,26 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed `CombinedLoader` and `max_size_cycle` didn't receive a `DistributedSampler` ([#10374](https://github.com/PyTorchLightning/pytorch-lightning/issues/10374)) - Fixed an issue where class or init-only variables of dataclasses were passed to the dataclass constructor in `utilities.apply_to_collection` ([#9702](https://github.com/PyTorchLightning/pytorch-lightning/issues/9702)) +- Fixed `to_torchscript()` causing false positive deprecation warnings ([#10470](https://github.com/PyTorchLightning/pytorch-lightning/issues/10470)) + + +- Fixed `isinstance` not working with `init_meta_context`, materialized model not being moved to the device ([#10493](https://github.com/PyTorchLightning/metrics/pull/10493)) + + +- Fixed an issue that prevented the Trainer to shutdown workers when execution is interrupted due to failure([#10463](https://github.com/PyTorchLightning/pytorch-lightning/issues/10463)) + + +- Squeeze the early stopping monitor to remove empty tensor dimensions ([#10461](https://github.com/PyTorchLightning/pytorch-lightning/issues/10461)) + + +- Fixed sampler replacement logic with `overfit_batches` to only replace the sample when `SequentialSampler` is not used ([#10486](https://github.com/PyTorchLightning/pytorch-lightning/issues/10486)) + + +- + + +- + ## [1.5.1] - 2021-11-09 diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 37a234f32f711..9c40e728391c1 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -438,8 +438,7 @@ def _reset_eval_dataloader( for loader_i in range(len(dataloaders)): loader = dataloaders[loader_i] - if hasattr(loader, "sampler") and isinstance(loader.sampler, RandomSampler): - + if hasattr(loader, "sampler") and not isinstance(loader.sampler, SequentialSampler): # when overfitting, the dataloader should not have sampler if self.overfit_batches > 0 and mode.evaluating: rank_zero_warn( @@ -591,16 +590,17 @@ def 
_add_sampler_metadata_collate(dataloader: DataLoader) -> None: @staticmethod def _resolve_overfit_batches(dataloader: Collection[DataLoader]) -> Collection[DataLoader]: - has_random_sampler = False + all_have_sequential_sampler = True - def resolve_had_random_sampler(dataloader: DataLoader): - nonlocal has_random_sampler - if not has_random_sampler: - has_random_sampler = isinstance(dataloader.sampler, RandomSampler) + def resolve_has_no_sequential_sampler(dataloader: DataLoader): + nonlocal all_have_sequential_sampler + all_have_sequential_sampler = all_have_sequential_sampler & isinstance( + dataloader.sampler, SequentialSampler + ) - apply_to_collection(dataloader, DataLoader, resolve_had_random_sampler) + apply_to_collection(dataloader, DataLoader, resolve_has_no_sequential_sampler) - if has_random_sampler: + if not all_have_sequential_sampler: rank_zero_warn( "You requested to overfit but enabled training dataloader shuffling." " We are turning off the training dataloader shuffling for you." diff --git a/tests/trainer/flags/test_overfit_batches.py b/tests/trainer/flags/test_overfit_batches.py index 76c8b37405b47..3860d85ec9836 100644 --- a/tests/trainer/flags/test_overfit_batches.py +++ b/tests/trainer/flags/test_overfit_batches.py @@ -13,13 +13,16 @@ # limitations under the License. 
import pytest import torch +from torch.utils.data.sampler import Sampler, SequentialSampler from pytorch_lightning import Trainer from tests.helpers.boring_model import BoringModel, RandomDataset def test_overfit_multiple_val_loaders(tmpdir): - """Tests that only training_step can be used.""" + """Tests that overfit batches works with multiple val dataloaders.""" + val_dl_count = 2 + overfit_batches = 3 class TestModel(BoringModel): def validation_step(self, batch, batch_idx, dataloader_idx): @@ -31,25 +34,65 @@ def validation_epoch_end(self, outputs) -> None: pass def val_dataloader(self): - dl1 = torch.utils.data.DataLoader(RandomDataset(32, 64)) - dl2 = torch.utils.data.DataLoader(RandomDataset(32, 64)) - return [dl1, dl2] + dls = [torch.utils.data.DataLoader(RandomDataset(32, 64)) for _ in range(val_dl_count)] + return dls model = TestModel() trainer = Trainer( - default_root_dir=tmpdir, max_epochs=2, overfit_batches=1, log_every_n_steps=1, enable_model_summary=False + default_root_dir=tmpdir, + max_epochs=2, + overfit_batches=overfit_batches, + log_every_n_steps=1, + enable_model_summary=False, ) trainer.fit(model) + assert trainer.num_training_batches == overfit_batches + assert len(trainer.num_val_batches) == val_dl_count + assert all(nbatches == overfit_batches for nbatches in trainer.num_val_batches) -@pytest.mark.parametrize("overfit", [1, 2, 0.1, 0.25, 1.0]) -def test_overfit_basic(tmpdir, overfit): - """Tests that only training_step can be used.""" +@pytest.mark.parametrize("overfit_batches", [1, 2, 0.1, 0.25, 1.0]) +def test_overfit_basic(tmpdir, overfit_batches): + """Tests that only training_step can be used when overfitting.""" model = BoringModel() + model.validation_step = None + total_train_samples = len(BoringModel().train_dataloader()) - trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, overfit_batches=overfit, enable_model_summary=False) - + trainer = Trainer( + default_root_dir=tmpdir, max_epochs=1, overfit_batches=overfit_batches, 
enable_model_summary=False + ) trainer.fit(model) + + assert trainer.num_val_batches == [] + assert trainer.num_training_batches == int( + overfit_batches * (1 if isinstance(overfit_batches, int) else total_train_samples) + ) + + +def test_overfit_batches_raises_warning_in_case_of_sequential_sampler(tmpdir): + class NonSequentialSampler(Sampler): + def __init__(self, data_source): + self.data_source = data_source + + def __iter__(self): + return iter(range(len(self.data_source))) + + def __len__(self): + return len(self.data_source) + + class TestModel(BoringModel): + def train_dataloader(self): + dataset = RandomDataset(32, 64) + sampler = NonSequentialSampler(dataset) + return torch.utils.data.DataLoader(dataset, sampler=sampler) + + model = TestModel() + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, overfit_batches=2) + + with pytest.warns(UserWarning, match="requested to overfit but enabled training dataloader shuffling"): + trainer.fit(model) + + assert isinstance(trainer.train_dataloader.loaders.sampler, SequentialSampler) From 391e0d6e3bb569653f7da54a36d91f7140404474 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 15 Nov 2021 10:03:46 +0000 Subject: [PATCH 020/123] shutdown workers on failure (#10463) --- CHANGELOG.md | 3 +++ pytorch_lightning/trainer/trainer.py | 2 ++ tests/loops/test_loops.py | 34 ++++++++++++++++++++++------ 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b31453147e617..3efc079ce6e4a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed sampler replacement logic with `overfit_batches` to only replace the sample when `SequentialSampler` is not used ([#10486](https://github.com/PyTorchLightning/pytorch-lightning/issues/10486)) +- Fixed an issue that prevented the Trainer to shutdown workers when execution is interrupted due to failure([#10463](https://github.com/PyTorchLightning/pytorch-lightning/issues/10463)) + + - diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 7e5d21e18dc26..567f3cf99cee0 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -697,6 +697,8 @@ def _call_and_handle_interrupt(self, trainer_fn: Callable, *args: Any, **kwargs: # reset bookkeeping self.state.stage = None self.on_exception(exception) + # shutdown workers + self._data_connector.teardown() raise def fit( diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index bad9a717d1629..28ac1f3f2aefc 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -24,7 +24,7 @@ from pl_examples.bug_report_model import RandomDataset from pytorch_lightning import LightningModule, Trainer -from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.callbacks import Callback, ModelCheckpoint from pytorch_lightning.loops import Loop, TrainingBatchLoop from pytorch_lightning.trainer.progress import BaseProgress from tests.helpers import BoringModel @@ -912,8 +912,10 @@ def val_dataloader(self): @RunIf(min_torch="1.8.0") -@pytest.mark.parametrize("persistent_workers", (False, True)) -def test_workers_are_shutdown(tmpdir, persistent_workers): +@pytest.mark.parametrize("should_fail", [False, True]) +# False is de-activated due to slowness +@pytest.mark.parametrize("persistent_workers", [True]) +def test_workers_are_shutdown(tmpdir, should_fail, persistent_workers): # `num_workers == 1` uses `_MultiProcessingDataLoaderIter` # `persistent_workers` makes sure `self._iterator` gets set on the `DataLoader` 
instance @@ -941,12 +943,30 @@ def _get_iterator(self): train_dataloader = TestDataLoader(RandomDataset(32, 64), num_workers=1, persistent_workers=persistent_workers) val_dataloader = TestDataLoader(RandomDataset(32, 64), num_workers=1, persistent_workers=persistent_workers) + class TestCallback(Callback): + def on_train_epoch_end(self, trainer, *_): + if trainer.current_epoch == 1: + raise CustomException + max_epochs = 3 + model = BoringModel() - trainer = Trainer(default_root_dir=tmpdir, limit_train_batches=2, limit_val_batches=2, max_epochs=max_epochs) - trainer.fit(model, train_dataloader, val_dataloader) - assert train_dataloader.count_shutdown_workers == (2 if persistent_workers else max_epochs) + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=2, + limit_val_batches=2, + max_epochs=max_epochs, + callbacks=TestCallback() if should_fail else None, + ) + + if should_fail: + with pytest.raises(CustomException): + trainer.fit(model, train_dataloader, val_dataloader) + else: + trainer.fit(model, train_dataloader, val_dataloader) + + assert train_dataloader.count_shutdown_workers == 2 if should_fail else (2 if persistent_workers else max_epochs) # on sanity checking end, the workers are being deleted too. 
- assert val_dataloader.count_shutdown_workers == (2 if persistent_workers else max_epochs + 1) + assert val_dataloader.count_shutdown_workers == 2 if persistent_workers else (3 if should_fail else max_epochs + 1) assert train_dataloader._iterator is None assert val_dataloader._iterator is None From 5e6db79f65890c700d1b45663995aa0c888e731d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 12 Nov 2021 19:03:47 +0100 Subject: [PATCH 021/123] Squeeze the early stopping monitor (#10461) --- CHANGELOG.md | 2 +- pytorch_lightning/callbacks/early_stopping.py | 2 +- tests/callbacks/test_early_stopping.py | 13 +++++++++++++ 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3efc079ce6e4a..8212ecd8eb6d9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,7 +30,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed an issue that prevented the Trainer to shutdown workers when execution is interrupted due to failure([#10463](https://github.com/PyTorchLightning/pytorch-lightning/issues/10463)) -- +- Squeeze the early stopping monitor to remove empty tensor dimensions ([#10461](https://github.com/PyTorchLightning/pytorch-lightning/issues/10461)) - diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py index b5118846875db..096cb4849cc39 100644 --- a/pytorch_lightning/callbacks/early_stopping.py +++ b/pytorch_lightning/callbacks/early_stopping.py @@ -202,7 +202,7 @@ def _run_early_stopping_check(self, trainer: "pl.Trainer") -> None: ): # short circuit if metric not present return - current = logs.get(self.monitor) + current = logs[self.monitor].squeeze() should_stop, reason = self._evaluate_stopping_criteria(current) # stop every ddp process if any world process decides to stop diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py index 2b4fe9f05eb87..c5a9e85d40f49 100644 --- 
a/tests/callbacks/test_early_stopping.py +++ b/tests/callbacks/test_early_stopping.py @@ -469,3 +469,16 @@ def validation_step(self, batch, batch_idx): assert trainer.global_step == len(side_effect) * int(trainer.limit_train_batches * trainer.val_check_interval) else: assert trainer.current_epoch == len(side_effect) * trainer.check_val_every_n_epoch - 1 + + +def test_early_stopping_squeezes(): + early_stopping = EarlyStopping(monitor="foo") + trainer = Trainer() + trainer.callback_metrics["foo"] = torch.tensor([[[0]]]) + + with mock.patch( + "pytorch_lightning.callbacks.EarlyStopping._evaluate_stopping_criteria", return_value=(False, "") + ) as es_mock: + early_stopping._run_early_stopping_check(trainer) + + es_mock.assert_called_once_with(torch.tensor(0)) From 53ff8402b9cfb20af191ca50c5aa0ff7bb6b33a2 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 15 Nov 2021 19:13:01 +0000 Subject: [PATCH 022/123] Resolve instantiation problem with init_meta_context (#10493) --- CHANGELOG.md | 3 ++ .../core/mixins/device_dtype_mixin.py | 6 ++- pytorch_lightning/trainer/trainer.py | 15 +++++- pytorch_lightning/utilities/meta.py | 46 ++++++++++++------- tests/utilities/test_meta.py | 9 +++- 5 files changed, 58 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8212ecd8eb6d9..03b8650b6d401 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed sampler replacement logic with `overfit_batches` to only replace the sample when `SequentialSampler` is not used ([#10486](https://github.com/PyTorchLightning/pytorch-lightning/issues/10486)) +- Fixed `isinstance` not working with `init_meta_context`, materialized model not being moved to the device ([#10493](https://github.com/PyTorchLightning/metrics/pull/10493)) + + - Fixed an issue that prevented the Trainer to shutdown workers when execution is interrupted due to failure([#10463](https://github.com/PyTorchLightning/pytorch-lightning/issues/10463)) diff --git a/pytorch_lightning/core/mixins/device_dtype_mixin.py b/pytorch_lightning/core/mixins/device_dtype_mixin.py index e02790edddd1e..e8b122989cd9c 100644 --- a/pytorch_lightning/core/mixins/device_dtype_mixin.py +++ b/pytorch_lightning/core/mixins/device_dtype_mixin.py @@ -17,6 +17,8 @@ import torch from torch.nn import Module +import pytorch_lightning as pl + class DeviceDtypeModuleMixin(Module): __jit_unused_properties__ = ["device", "dtype"] @@ -177,7 +179,9 @@ def __update_properties( self, device: Optional[torch.device] = None, dtype: Optional[Union[str, torch.dtype]] = None ) -> None: def apply_fn(module: Union["DeviceDtypeModuleMixin", Module]) -> None: - if not isinstance(module, DeviceDtypeModuleMixin): + # TODO: Find why `isinstance(module, DeviceDtypeModuleMixin)` doesn't + # work when using `init_meta_context`. 
+ if not isinstance(module, (DeviceDtypeModuleMixin, pl.LightningModule)): return if device is not None: module._device = device diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 567f3cf99cee0..b4a3f97025701 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -86,7 +86,7 @@ from pytorch_lightning.utilities.distributed import distributed_available from pytorch_lightning.utilities.exceptions import ExitGracefullyException, MisconfigurationException from pytorch_lightning.utilities.imports import _fault_tolerant_training -from pytorch_lightning.utilities.meta import materialize_module +from pytorch_lightning.utilities.meta import is_on_meta_device, materialize_module from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.seed import reset_seed from pytorch_lightning.utilities.types import ( @@ -1437,10 +1437,21 @@ def _call_setup_hook(self) -> None: def _call_configure_sharded_model(self) -> None: with self.accelerator.model_sharded_context(): - materialize_module(self.lightning_module) + self._handle_meta_model() self.call_hook("configure_sharded_model") self.call_hook("on_configure_sharded_model") + def _handle_meta_model(self) -> None: + if not is_on_meta_device(self.lightning_module): + return + + if isinstance(self.training_type_plugin, DDPSpawnPlugin): + raise MisconfigurationException("LightningModule on meta device isn't supported with spawn.") + + materialize_module(self.lightning_module) + # the trainer reference is lost during materialization + self.lightning_module.trainer = proxy(self) + def _call_teardown_hook(self) -> None: fn = self.state.fn._setup_fn diff --git a/pytorch_lightning/utilities/meta.py b/pytorch_lightning/utilities/meta.py index 60e6cc791b7ae..6d3c1d6b5f11b 100644 --- a/pytorch_lightning/utilities/meta.py +++ b/pytorch_lightning/utilities/meta.py @@ -18,13 +18,14 @@ from functools import partial from 
itertools import chain from types import ModuleType -from typing import Callable, Dict, Generator, Iterator, List, Optional, Set, Type +from typing import Any, Callable, Dict, Generator, Iterator, List, Optional, Set, Type import torch from torch import nn, Tensor from torch.nn import Module from torch.nn.modules.container import ModuleDict, ModuleList, Sequential +import pytorch_lightning as pl from pytorch_lightning.utilities import rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_10 @@ -191,7 +192,6 @@ def materialize_module(root_module: nn.Module) -> nn.Module: # cache subclasses to optimize the search when resetting the meta device later on. __STORAGE_META__ = {} - __CREATED_MODULES__ = set() @@ -237,45 +237,52 @@ def _set_meta_device() -> None: for subclass in get_all_subclasses(torch.nn.modules.module.Module): - if isinstance(subclass, (Sequential, ModuleList, ModuleDict)): + if subclass in (Sequential, ModuleList, ModuleDict, pl.LightningModule): continue # if a subclass has already been stored, we should use the cache if str(subclass) in __STORAGE_META__: - # reset the class import package to its rightfull state. + # reset the class import package to its rightful state. mods, subclass, meta_class = __STORAGE_META__[subclass] for mod in mods: setattr(mod, subclass.__name__, meta_class) continue + class _IsinstanceMetaclass(type(subclass)): + def __instancecheck__(self, instance: Any) -> bool: + """Overrides the ``isinstance`` check on ``_MaterializerModule`` objects.""" + return isinstance(instance, self.__bases__[0]) + # Create a class subclassing current `subclass` overriding its new method. 
# this will enable use to use `torch.distributed.nn.utils.init_meta` to create a `meta` # version of the current subclass module - class _MetaClass(subclass): + class _MaterializerModule(subclass, metaclass=_IsinstanceMetaclass): @classmethod @contextmanager - def instantiation_context(cls, materialize: bool): + def instantiation_context(cls): _unset_meta_device(from_created=True) yield _set_meta_device_populated(from_created=True) @classmethod def materialize(cls, materialize_fn: Callable): - with cls.instantiation_context(materialize=True): + with cls.instantiation_context(): obj = materialize_fn() return obj @staticmethod def add_subclasses(subclass): - """This is used to unrol the instantion tree while creating the modules.""" - __CREATED_MODULES__.add(subclass) + """This is used to unroll the instantiation tree while creating the modules.""" + # Don't store the LightningModule as skipped from the Meta process. + if subclass != pl.LightningModule: + __CREATED_MODULES__.add(subclass) if subclass.__bases__[0] != torch.nn.modules.module.Module: - _MetaClass.add_subclasses(subclass.__bases__[0]) + _MaterializerModule.add_subclasses(subclass.__bases__[0]) def __new__(cls, *args, **kwargs): subclass = cls.__bases__[0] cls.add_subclasses(subclass) - with cls.instantiation_context(materialize=False): + with cls.instantiation_context(): obj = init_meta(subclass, *args, **kwargs) obj.materialize = partial(cls.materialize, materialize_fn=obj.materialize) @@ -294,9 +301,8 @@ def search(mod: ModuleType) -> List[ModuleType]: # nn.Module class can be imported at different level and they all need to be mocked. 
# Example: torch.nn.Linear is actually torch.nn.modules.linear.Linear # Therefore, torch.nn.Linear, torch.nn.modules.Linear, torch.nn.modules.linear.Linear - # needs to be replaced by the torch.nn.linear.modules.Linear _MetaClass - out = [] - out.append(search(mod)) + # needs to be replaced by the torch.nn.linear.modules.Linear _MaterializerModule + out = [search(mod)] for name in submodules[1:]: mod = getattr(mod, name) out.append(search(mod)) @@ -305,11 +311,11 @@ def search(mod: ModuleType) -> List[ModuleType]: mods = [mod for mod in chain(*out) if mod] # store the modules search so it doesn't have to be performed again for this class - __STORAGE_META__[subclass] = (mods, subclass, _MetaClass) + __STORAGE_META__[subclass] = (mods, subclass, _MaterializerModule) # replace all subclass by its meta form for mod in mods: - setattr(mod, subclass.__name__, _MetaClass) + setattr(mod, subclass.__name__, _MaterializerModule) @contextmanager @@ -321,3 +327,11 @@ def init_meta_context() -> Generator: _set_meta_device() yield _unset_meta_device() + + +def is_on_meta_device(module: nn.Module) -> bool: + try: + param = next(module.parameters()) + return param.device.type == "meta" + except StopIteration: + return False diff --git a/tests/utilities/test_meta.py b/tests/utilities/test_meta.py index 8e36a86c3beef..581b949d9167f 100644 --- a/tests/utilities/test_meta.py +++ b/tests/utilities/test_meta.py @@ -14,7 +14,7 @@ from torch import nn from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.utilities.meta import init_meta_context, materialize_module +from pytorch_lightning.utilities.meta import init_meta_context, is_on_meta_device, materialize_module from tests.helpers.runif import RunIf @@ -31,18 +31,23 @@ def __init__(self, num_layers: int): self.layer = nn.Sequential(*[nn.Linear(1, 1) for _ in range(self.hparams.num_layers)]) -@RunIf(min_torch="1.10.0") +@RunIf(special=True, min_torch="1.10.0") def test_init_meta_context(): with 
init_meta_context(): m = nn.Linear(in_features=1, out_features=1) + assert isinstance(m, nn.Linear) assert m.weight.device.type == "meta" + assert is_on_meta_device(m) mlp = MLP(4) assert mlp.layer[0].weight.device.type == "meta" mlp = materialize_module(mlp) assert mlp.layer[0].weight.device.type == "cpu" + assert not is_on_meta_device(mlp) + assert not is_on_meta_device(nn.Module()) + model = BoringModel(4) assert model.layer[0].weight.device.type == "meta" materialize_module(model) From 5f4a5feaa98bab97dc3f497c778d24955c2db852 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Mon, 15 Nov 2021 23:12:55 +0100 Subject: [PATCH 023/123] Fix `to_torchscript()` causing false positive deprecation warnings (#10470) --- CHANGELOG.md | 3 +++ pytorch_lightning/core/lightning.py | 17 ++++++++++++----- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 03b8650b6d401..102f74894d1dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed sampler replacement logic with `overfit_batches` to only replace the sample when `SequentialSampler` is not used ([#10486](https://github.com/PyTorchLightning/pytorch-lightning/issues/10486)) +- Fixed `to_torchscript()` causing false positive deprecation warnings ([#10470](https://github.com/PyTorchLightning/pytorch-lightning/issues/10470)) + + - Fixed `isinstance` not working with `init_meta_context`, materialized model not being moved to the device ([#10493](https://github.com/PyTorchLightning/metrics/pull/10493)) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index c59193859b171..bc89cc2b18e93 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -116,6 +116,8 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: self._param_requires_grad_state = {} self._metric_attributes: Optional[Dict[int, str]] = None self._should_prevent_trainer_and_dataloaders_deepcopy: bool = False + # TODO: remove after the 1.6 release + self._running_torchscript = False self._register_sharded_tensor_state_dict_hooks_if_available() @@ -1962,6 +1964,8 @@ def to_torchscript( """ mode = self.training + self._running_torchscript = True + if method == "script": torchscript_module = torch.jit.script(self.eval(), **kwargs) elif method == "trace": @@ -1987,6 +1991,8 @@ def to_torchscript( with fs.open(file_path, "wb") as f: torch.jit.save(torchscript_module, f) + self._running_torchscript = False + return torchscript_module @property @@ -1996,11 +2002,12 @@ def model_size(self) -> float: Note: This property will not return correct value for Deepspeed (stage 3) and fully-sharded training. """ - rank_zero_deprecation( - "The `LightningModule.model_size` property was deprecated in v1.5 and will be removed in v1.7." 
- " Please use the `pytorch_lightning.utilities.memory.get_model_size_mb`.", - stacklevel=5, - ) + if not self._running_torchscript: # remove with the deprecation removal + rank_zero_deprecation( + "The `LightningModule.model_size` property was deprecated in v1.5 and will be removed in v1.7." + " Please use the `pytorch_lightning.utilities.memory.get_model_size_mb`.", + stacklevel=5, + ) return get_model_size_mb(self) def add_to_queue(self, queue: torch.multiprocessing.SimpleQueue) -> None: From 122e503be91843ff1473c4e7465585b25eb5373f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Tue, 16 Nov 2021 05:36:47 +0100 Subject: [PATCH 024/123] Skip strategy=ddp_spawn, accelerator=cpu, python>=3.9 tests (#10550) --- tests/callbacks/test_early_stopping.py | 10 +++++----- tests/callbacks/test_pruning.py | 2 +- tests/callbacks/test_stochastic_weight_avg.py | 2 +- tests/checkpointing/test_model_checkpoint.py | 2 +- tests/checkpointing/test_torch_saving.py | 2 +- tests/deprecated_api/test_remove_1-7.py | 2 +- tests/helpers/runif.py | 11 +++++++++++ tests/loggers/test_all.py | 2 +- tests/models/test_cpu.py | 2 +- tests/models/test_horovod.py | 6 +++--- tests/plugins/test_ddp_spawn_plugin.py | 6 +++--- tests/profiler/test_profiler.py | 3 ++- tests/trainer/logging_/test_distributed_logging.py | 2 +- tests/trainer/logging_/test_train_loop_logging.py | 2 +- tests/trainer/properties/test_get_model.py | 2 +- tests/trainer/test_data_loading.py | 2 +- tests/trainer/test_trainer.py | 4 ++-- tests/utilities/test_all_gather_grad.py | 4 ++-- 18 files changed, 39 insertions(+), 27 deletions(-) diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py index c5a9e85d40f49..1540cbeba5189 100644 --- a/tests/callbacks/test_early_stopping.py +++ b/tests/callbacks/test_early_stopping.py @@ -381,7 +381,7 @@ def on_train_end(self) -> None: _ES_CHECK = dict(check_on_train_epoch_end=True) _ES_CHECK_P3 = dict(patience=3, 
check_on_train_epoch_end=True) -_NO_WIN = dict(marks=RunIf(skip_windows=True)) +_SPAWN_MARK = dict(marks=RunIf(skip_windows=True, skip_49370=True)) @pytest.mark.parametrize( @@ -389,8 +389,8 @@ def on_train_end(self) -> None: [ ([EarlyStopping("abc"), EarlyStopping("cba", patience=3)], 3, False, None, 1), ([EarlyStopping("cba", patience=3), EarlyStopping("abc")], 3, False, None, 1), - pytest.param([EarlyStopping("abc"), EarlyStopping("cba", patience=3)], 3, False, "ddp_spawn", 2, **_NO_WIN), - pytest.param([EarlyStopping("cba", patience=3), EarlyStopping("abc")], 3, False, "ddp_spawn", 2, **_NO_WIN), + pytest.param([EarlyStopping("abc"), EarlyStopping("cba", patience=3)], 3, False, "ddp_spawn", 2, **_SPAWN_MARK), + pytest.param([EarlyStopping("cba", patience=3), EarlyStopping("abc")], 3, False, "ddp_spawn", 2, **_SPAWN_MARK), ([EarlyStopping("abc", **_ES_CHECK), EarlyStopping("cba", **_ES_CHECK_P3)], 3, True, None, 1), ([EarlyStopping("cba", **_ES_CHECK_P3), EarlyStopping("abc", **_ES_CHECK)], 3, True, None, 1), pytest.param( @@ -399,7 +399,7 @@ def on_train_end(self) -> None: True, "ddp_spawn", 2, - **_NO_WIN, + **_SPAWN_MARK, ), pytest.param( [EarlyStopping("cba", **_ES_CHECK_P3), EarlyStopping("abc", **_ES_CHECK)], @@ -407,7 +407,7 @@ def on_train_end(self) -> None: True, "ddp_spawn", 2, - **_NO_WIN, + **_SPAWN_MARK, ), ], ) diff --git a/tests/callbacks/test_pruning.py b/tests/callbacks/test_pruning.py index 1c1f84b5b95a0..c813ed2b02e28 100644 --- a/tests/callbacks/test_pruning.py +++ b/tests/callbacks/test_pruning.py @@ -187,7 +187,7 @@ def test_pruning_callback_ddp_spawn(tmpdir): train_with_pruning_callback(tmpdir, use_global_unstructured=True, strategy="ddp_spawn", gpus=2) -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_pruning_callback_ddp_cpu(tmpdir): train_with_pruning_callback(tmpdir, parameters_to_prune=True, strategy="ddp_spawn", num_processes=2) diff --git a/tests/callbacks/test_stochastic_weight_avg.py 
b/tests/callbacks/test_stochastic_weight_avg.py index e10f99d33d564..4a0f154928adb 100644 --- a/tests/callbacks/test_stochastic_weight_avg.py +++ b/tests/callbacks/test_stochastic_weight_avg.py @@ -148,7 +148,7 @@ def test_swa_callback_ddp_spawn(tmpdir): train_with_swa(tmpdir, strategy="ddp_spawn", gpus=2) -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_swa_callback_ddp_cpu(tmpdir): train_with_swa(tmpdir, strategy="ddp_spawn", num_processes=2) diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index 518d67cf251f5..04255d51ad069 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -385,7 +385,7 @@ def on_train_end(self, trainer, pl_module): assert torch.save.call_count == 0 -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_model_checkpoint_no_extraneous_invocations(tmpdir): """Test to ensure that the model callback saves the checkpoints only once in distributed mode.""" model = LogInTwoMethods() diff --git a/tests/checkpointing/test_torch_saving.py b/tests/checkpointing/test_torch_saving.py index 8b0f0e457bff9..f9634a9dadb2a 100644 --- a/tests/checkpointing/test_torch_saving.py +++ b/tests/checkpointing/test_torch_saving.py @@ -34,7 +34,7 @@ def test_model_torch_save(tmpdir): trainer = torch.load(temp_path) -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_model_torch_save_ddp_cpu(tmpdir): """Test to ensure torch save does not fail for model and trainer using cpu ddp.""" model = BoringModel() diff --git a/tests/deprecated_api/test_remove_1-7.py b/tests/deprecated_api/test_remove_1-7.py index 16c511b6effd9..47d72814cd2b6 100644 --- a/tests/deprecated_api/test_remove_1-7.py +++ b/tests/deprecated_api/test_remove_1-7.py @@ -242,7 +242,7 @@ def get_from_queue(self, queue: torch.multiprocessing.SimpleQueue) -> None: return super().get_from_queue(queue) 
-@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_v1_7_0_deprecate_add_get_queue(tmpdir): model = BoringCallbackDDPSpawnModel() trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, num_processes=2, strategy="ddp_spawn") diff --git a/tests/helpers/runif.py b/tests/helpers/runif.py index 490e023662f79..e53d3811f6b34 100644 --- a/tests/helpers/runif.py +++ b/tests/helpers/runif.py @@ -70,6 +70,7 @@ def __new__( fairscale_fully_sharded: bool = False, deepspeed: bool = False, rich: bool = False, + skip_49370: bool = False, **kwargs, ): """ @@ -91,6 +92,7 @@ def __new__( fairscale_fully_sharded: if `fairscale` fully sharded module is required to run the test deepspeed: if `deepspeed` module is required to run the test rich: if `rich` module is required to run the test + skip_49370: Skip the test as it's impacted by https://github.com/pytorch/pytorch/issues/49370. kwargs: native pytest.mark.skipif keyword arguments """ conditions = [] @@ -165,6 +167,15 @@ def __new__( conditions.append(not _RICH_AVAILABLE) reasons.append("Rich") + if skip_49370: + # strategy=ddp_spawn, accelerator=cpu, python>=3.9, torch<1.8 does not work + py_version = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}" + ge_3_9 = Version(py_version) >= Version("3.9") + torch_version = get_distribution("torch").version + old_torch = Version(torch_version) < Version("1.8") + conditions.append(ge_3_9 and old_torch) + reasons.append("Impacted by https://github.com/pytorch/pytorch/issues/49370") + reasons = [rs for cond, rs in zip(conditions, reasons) if cond] return pytest.mark.skipif( *args, condition=any(conditions), reason=f"Requires: [{' + '.join(reasons)}]", **kwargs diff --git a/tests/loggers/test_all.py b/tests/loggers/test_all.py index 271ffce811fe5..370b24431b088 100644 --- a/tests/loggers/test_all.py +++ b/tests/loggers/test_all.py @@ -321,8 +321,8 @@ def on_train_batch_start(self, trainer, pl_module, batch, batch_idx): assert 
pl_module.logger.experiment.something(foo="bar") is None +@RunIf(skip_windows=True, skip_49370=True) @pytest.mark.parametrize("logger_class", [CometLogger, CSVLogger, MLFlowLogger, TensorBoardLogger, TestTubeLogger]) -@RunIf(skip_windows=True) def test_logger_created_on_rank_zero_only(tmpdir, monkeypatch, logger_class): """Test that loggers get replaced by dummy loggers on global rank > 0.""" _patch_comet_atexit(monkeypatch) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index 2fb537b1d2861..c110f3a83d815 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -122,7 +122,7 @@ def validation_step(self, *args, **kwargs): model.unfreeze() -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_multi_cpu_model_ddp(tmpdir): """Make sure DDP works.""" tutils.set_random_main_port() diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index abf5a34757424..59a22cf1656d1 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -66,7 +66,7 @@ def _run_horovod(trainer_options, on_gpu=False): assert exit_code == 0 -@RunIf(skip_windows=True, horovod=True) +@RunIf(skip_windows=True, horovod=True, skip_49370=True) def test_horovod_cpu(tmpdir): """Test Horovod running multi-process on CPU.""" trainer_options = dict( @@ -82,7 +82,7 @@ def test_horovod_cpu(tmpdir): _run_horovod(trainer_options) -@RunIf(skip_windows=True, horovod=True) +@RunIf(skip_windows=True, horovod=True, skip_49370=True) def test_horovod_cpu_clip_grad_by_value(tmpdir): """Test Horovod running multi-process on CPU.""" trainer_options = dict( @@ -99,7 +99,7 @@ def test_horovod_cpu_clip_grad_by_value(tmpdir): _run_horovod(trainer_options) -@RunIf(skip_windows=True, horovod=True) +@RunIf(skip_windows=True, horovod=True, skip_49370=True) def test_horovod_cpu_implicit(tmpdir): """Test Horovod without specifying a backend, inferring from env set by `horovodrun`.""" trainer_options = dict( diff --git 
a/tests/plugins/test_ddp_spawn_plugin.py b/tests/plugins/test_ddp_spawn_plugin.py index c389cf9290c78..c5e5f7ccda748 100644 --- a/tests/plugins/test_ddp_spawn_plugin.py +++ b/tests/plugins/test_ddp_spawn_plugin.py @@ -46,7 +46,7 @@ def get_from_queue(self, queue: torch.multiprocessing.SimpleQueue) -> None: return super().get_from_queue(queue) -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_ddp_cpu(): """Tests if device is set correctly when training for DDPSpawnPlugin.""" trainer = Trainer(num_processes=2, fast_dev_run=True) @@ -91,7 +91,7 @@ def get_from_queue(self, trainer: Trainer, queue: torch.multiprocessing.SimpleQu return super().get_from_queue(trainer, queue) -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_ddp_spawn_add_get_queue(tmpdir): """Tests add_to_queue/get_from_queue with DDPSpawnPlugin.""" @@ -128,7 +128,7 @@ def on_predict_start(self) -> None: assert isinstance(self.trainer.model, LightningModule) -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_ddp_spawn_configure_ddp(tmpdir): """Tests with ddp spawn plugin.""" trainer = Trainer(default_root_dir=tmpdir, num_processes=2, strategy="ddp_spawn", fast_dev_run=True) diff --git a/tests/profiler/test_profiler.py b/tests/profiler/test_profiler.py index 7369ab9a4a140..f9a4727334a4f 100644 --- a/tests/profiler/test_profiler.py +++ b/tests/profiler/test_profiler.py @@ -162,7 +162,7 @@ def test_simple_profiler_with_nonexisting_dirpath(tmpdir): assert nonexisting_tmpdir.join("fit-profiler.txt").exists() -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_simple_profiler_distributed_files(tmpdir): """Ensure the proper files are saved in distributed.""" profiler = SimpleProfiler(dirpath=tmpdir, filename="profiler") @@ -227,6 +227,7 @@ def test_advanced_profiler_iterable_durations(advanced_profiler, action: str, ex np.testing.assert_allclose(recored_total_duration, expected_total_duration, 
rtol=0.2) +@pytest.mark.flaky(reruns=3) def test_advanced_profiler_overhead(advanced_profiler, n_iter=5): """ensure that the profiler doesn't introduce too much overhead during training.""" for _ in range(n_iter): diff --git a/tests/trainer/logging_/test_distributed_logging.py b/tests/trainer/logging_/test_distributed_logging.py index 487b7f38e4e19..d4ba4f242294a 100644 --- a/tests/trainer/logging_/test_distributed_logging.py +++ b/tests/trainer/logging_/test_distributed_logging.py @@ -59,7 +59,7 @@ def on_train_end(self): assert self.log_name.format(rank=self.local_rank) in self.logger.logs, "Expected rank to be logged" -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_all_rank_logging_ddp_cpu(tmpdir): """Check that all ranks can be logged from.""" model = TestModel() diff --git a/tests/trainer/logging_/test_train_loop_logging.py b/tests/trainer/logging_/test_train_loop_logging.py index 5b775b9968d99..22a1a2c90d756 100644 --- a/tests/trainer/logging_/test_train_loop_logging.py +++ b/tests/trainer/logging_/test_train_loop_logging.py @@ -395,7 +395,7 @@ def validation_step(self, batch, batch_idx): return super().validation_step(batch, batch_idx) -@pytest.mark.parametrize("devices", [1, pytest.param(2, marks=RunIf(skip_windows=True))]) +@pytest.mark.parametrize("devices", [1, pytest.param(2, marks=RunIf(skip_windows=True, skip_49370=True))]) def test_logging_sync_dist_true(tmpdir, devices): """Tests to ensure that the sync_dist flag works (should just return the original value)""" fake_result = 1 diff --git a/tests/trainer/properties/test_get_model.py b/tests/trainer/properties/test_get_model.py index 6e405739e83fe..ed81b90a2d142 100644 --- a/tests/trainer/properties/test_get_model.py +++ b/tests/trainer/properties/test_get_model.py @@ -37,7 +37,7 @@ def test_get_model(tmpdir): trainer.fit(model) -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_get_model_ddp_cpu(tmpdir): """Tests that 
`trainer.lightning_module` extracts the model correctly when using ddp on cpu.""" diff --git a/tests/trainer/test_data_loading.py b/tests/trainer/test_data_loading.py index 35f9838f0b04a..c04c6f0f6ea41 100644 --- a/tests/trainer/test_data_loading.py +++ b/tests/trainer/test_data_loading.py @@ -133,7 +133,7 @@ def _get_warning_msg(): assert warn_str in msg -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) @pytest.mark.parametrize("num_workers", [0, 1]) def test_dataloader_warnings(tmpdir, num_workers): trainer = Trainer(default_root_dir=tmpdir, strategy="ddp_spawn", num_processes=2, fast_dev_run=4) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 0a3eacb23863f..c4a8884a23ccd 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1809,7 +1809,7 @@ def on_predict_start(self) -> None: @pytest.mark.parametrize( - "strategy,num_processes", [(None, 1), pytest.param("ddp_spawn", 2, marks=RunIf(skip_windows=True))] + "strategy,num_processes", [(None, 1), pytest.param("ddp_spawn", 2, marks=RunIf(skip_windows=True, skip_49370=True))] ) def test_model_in_correct_mode_during_stages(tmpdir, strategy, num_processes): model = TrainerStagesModel() @@ -1830,7 +1830,7 @@ def validation_epoch_end(self, outputs) -> None: pass -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, skip_49370=True) def test_fit_test_synchronization(tmpdir): """Test that the trainer synchronizes processes before returning control back to the caller.""" tutils.set_random_main_port() diff --git a/tests/utilities/test_all_gather_grad.py b/tests/utilities/test_all_gather_grad.py index 073468fc4cb28..2ed42b0b0f21a 100644 --- a/tests/utilities/test_all_gather_grad.py +++ b/tests/utilities/test_all_gather_grad.py @@ -41,8 +41,8 @@ def _test_all_gather_ddp(rank, world_size): assert torch.allclose(grad2, tensor2.grad) -@RunIf(skip_windows=True) -def test_all_gather_ddp(): +@RunIf(skip_windows=True, skip_49370=True) +def 
test_all_gather_ddp_spawn(): world_size = 3 torch.multiprocessing.spawn(_test_all_gather_ddp, args=(world_size,), nprocs=world_size) From 9e45024473d89a89541603e73a9ceb18afe4a4c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Tue, 16 Nov 2021 16:52:09 +0100 Subject: [PATCH 025/123] Fix scripting causing false positive deprecation warnings (#10555) Co-authored-by: Rohit Gupta --- CHANGELOG.md | 2 +- pytorch_lightning/loggers/tensorboard.py | 2 ++ .../plugins/training_type/ipu.py | 20 +++++++++++-------- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 102f74894d1dc..33b8b7928061c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,7 +27,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed sampler replacement logic with `overfit_batches` to only replace the sample when `SequentialSampler` is not used ([#10486](https://github.com/PyTorchLightning/pytorch-lightning/issues/10486)) -- Fixed `to_torchscript()` causing false positive deprecation warnings ([#10470](https://github.com/PyTorchLightning/pytorch-lightning/issues/10470)) +- Fixed scripting causing false positive deprecation warnings ([#10470](https://github.com/PyTorchLightning/pytorch-lightning/pull/10470), [#10555](https://github.com/PyTorchLightning/pytorch-lightning/pull/10555)) - Fixed `isinstance` not working with `init_meta_context`, materialized model not being moved to the device ([#10493](https://github.com/PyTorchLightning/metrics/pull/10493)) diff --git a/pytorch_lightning/loggers/tensorboard.py b/pytorch_lightning/loggers/tensorboard.py index f26fc75ac58db..1ceadb8658a3d 100644 --- a/pytorch_lightning/loggers/tensorboard.py +++ b/pytorch_lightning/loggers/tensorboard.py @@ -240,7 +240,9 @@ def log_graph(self, model: "pl.LightningModule", input_array=None): if input_array is not None: input_array = model._apply_batch_transfer_handler(input_array) + model._running_torchscript = True 
self.experiment.add_graph(model, input_array) + model._running_torchscript = False else: rank_zero_warn( "Could not log computational graph since the" diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py index 4d9f937c58467..898e62791d6ee 100644 --- a/pytorch_lightning/plugins/training_type/ipu.py +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -237,21 +237,25 @@ def to_tensor(x): args = apply_to_collection(args, dtype=(int, float), function=to_tensor) return args - def training_step(self, *args, **kwargs): + def _step(self, stage: RunningStage, *args: Any, **kwargs: Any): args = self._prepare_input(args) - return self.poptorch_models[RunningStage.TRAINING](*args, **kwargs) + poptorch_model = self.poptorch_models[stage] + self.lightning_module._running_torchscript = True + out = poptorch_model(*args, **kwargs) + self.lightning_module._running_torchscript = False + return out + + def training_step(self, *args, **kwargs): + return self._step(RunningStage.TRAINING, *args, **kwargs) def validation_step(self, *args, **kwargs): - args = self._prepare_input(args) - return self.poptorch_models[RunningStage.VALIDATING](*args, **kwargs) + return self._step(RunningStage.VALIDATING, *args, **kwargs) def test_step(self, *args, **kwargs): - args = self._prepare_input(args) - return self.poptorch_models[RunningStage.TESTING](*args, **kwargs) + return self._step(RunningStage.TESTING, *args, **kwargs) def predict_step(self, *args, **kwargs): - args = self._prepare_input(args) - return self.poptorch_models[RunningStage.PREDICTING](*args, **kwargs) + return self._step(RunningStage.PREDICTING, *args, **kwargs) def teardown(self) -> None: # undo dataloader patching From 1ecb962495db5304e453bc1ca2b75fb3ccd46071 Mon Sep 17 00:00:00 2001 From: Raahul Singh Date: Fri, 12 Nov 2021 01:23:40 +0530 Subject: [PATCH 026/123] Change attributes of `RichProgressBarTheme` dataclass (#10454) Co-authored-by: Kaushik B 
<45285388+kaushikb11@users.noreply.github.com> --- .../callbacks/progress/rich_progress.py | 34 +++++++++++++------ tests/callbacks/test_rich_progress_bar.py | 4 +-- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/pytorch_lightning/callbacks/progress/rich_progress.py b/pytorch_lightning/callbacks/progress/rich_progress.py index ab771992a960b..3c70bfe735f95 100644 --- a/pytorch_lightning/callbacks/progress/rich_progress.py +++ b/pytorch_lightning/callbacks/progress/rich_progress.py @@ -129,11 +129,12 @@ def render(self, task) -> RenderableType: class MetricsTextColumn(ProgressColumn): """A column containing text.""" - def __init__(self, trainer): + def __init__(self, trainer, style): self._trainer = trainer self._tasks = {} self._current_task_id = 0 self._metrics = {} + self._style = style super().__init__() def update(self, metrics): @@ -158,23 +159,34 @@ def render(self, task) -> Text: for k, v in self._metrics.items(): _text += f"{k}: {round(v, 3) if isinstance(v, float) else v} " - return Text(_text, justify="left") + return Text(_text, justify="left", style=self._style) @dataclass class RichProgressBarTheme: """Styles to associate to different base components. + Args: + description: Style for the progress bar description. For eg., Epoch x, Testing, etc. + progress_bar: Style for the bar in progress. + progress_bar_finished: Style for the finished progress bar. + progress_bar_pulse: Style for the progress bar when `IterableDataset` is being processed. + batch_progress: Style for the progress tracker (i.e 10/50 batches completed). + time: Style for the processed time and estimate time remaining. + processing_speed: Style for the speed of the batches being processed. 
+ metrics: Style for the metrics + https://rich.readthedocs.io/en/stable/style.html """ - text_color: str = "white" - progress_bar_complete: Union[str, Style] = "#6206E0" + description: Union[str, Style] = "white" + progress_bar: Union[str, Style] = "#6206E0" progress_bar_finished: Union[str, Style] = "#6206E0" progress_bar_pulse: Union[str, Style] = "#6206E0" - batch_process: str = "white" - time: str = "grey54" - processing_speed: str = "grey70" + batch_progress: Union[str, Style] = "white" + time: Union[str, Style] = "grey54" + processing_speed: Union[str, Style] = "grey70" + metrics: Union[str, Style] = "white" class RichProgressBar(ProgressBarBase): @@ -268,7 +280,7 @@ def _init_progress(self, trainer): self._reset_progress_bar_ids() self._console: Console = Console() self._console.clear_live() - self._metric_component = MetricsTextColumn(trainer) + self._metric_component = MetricsTextColumn(trainer, self.theme.metrics) self.progress = CustomProgress( *self.configure_columns(trainer), self._metric_component, @@ -351,7 +363,7 @@ def on_validation_epoch_start(self, trainer, pl_module): def _add_task(self, total_batches: int, description: str, visible: bool = True) -> Optional[int]: if self.progress is not None: return self.progress.add_task( - f"[{self.theme.text_color}]{description}", total=total_batches, visible=visible + f"[{self.theme.description}]{description}", total=total_batches, visible=visible ) def _update(self, progress_bar_id: int, visible: bool = True) -> None: @@ -448,11 +460,11 @@ def configure_columns(self, trainer) -> list: return [ TextColumn("[progress.description]{task.description}"), CustomBarColumn( - complete_style=self.theme.progress_bar_complete, + complete_style=self.theme.progress_bar, finished_style=self.theme.progress_bar_finished, pulse_style=self.theme.progress_bar_pulse, ), - BatchesProcessedColumn(style=self.theme.batch_process), + BatchesProcessedColumn(style=self.theme.batch_progress), CustomTimeColumn(style=self.theme.time), 
ProcessingSpeedColumn(style=self.theme.processing_speed), ] diff --git a/tests/callbacks/test_rich_progress_bar.py b/tests/callbacks/test_rich_progress_bar.py index 31681754423a8..8f3f20630b5c0 100644 --- a/tests/callbacks/test_rich_progress_bar.py +++ b/tests/callbacks/test_rich_progress_bar.py @@ -106,11 +106,11 @@ def test_rich_progress_bar_custom_theme(tmpdir): assert progress_bar.theme == theme args, kwargs = mocks["CustomBarColumn"].call_args - assert kwargs["complete_style"] == theme.progress_bar_complete + assert kwargs["complete_style"] == theme.progress_bar assert kwargs["finished_style"] == theme.progress_bar_finished args, kwargs = mocks["BatchesProcessedColumn"].call_args - assert kwargs["style"] == theme.batch_process + assert kwargs["style"] == theme.batch_progress args, kwargs = mocks["CustomTimeColumn"].call_args assert kwargs["style"] == theme.time From ae6da920bbf1e0b5371ec6a82e6af525ea836b67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 16 Nov 2021 17:42:40 +0100 Subject: [PATCH 027/123] 1.5.2 release --- CHANGELOG.md | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 33b8b7928061c..cfb5c3e52e815 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,37 +11,13 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed `CombinedLoader` and `max_size_cycle` didn't receive a `DistributedSampler` ([#10374](https://github.com/PyTorchLightning/pytorch-lightning/issues/10374)) - Fixed an issue where class or init-only variables of dataclasses were passed to the dataclass constructor in `utilities.apply_to_collection` ([#9702](https://github.com/PyTorchLightning/pytorch-lightning/issues/9702)) - -- Fixed `to_torchscript()` causing false positive deprecation warnings ([#10470](https://github.com/PyTorchLightning/pytorch-lightning/issues/10470)) - - - Fixed `isinstance` not working with `init_meta_context`, materialized model not being moved to the device ([#10493](https://github.com/PyTorchLightning/metrics/pull/10493)) - - - Fixed an issue that prevented the Trainer to shutdown workers when execution is interrupted due to failure([#10463](https://github.com/PyTorchLightning/pytorch-lightning/issues/10463)) - - - Squeeze the early stopping monitor to remove empty tensor dimensions ([#10461](https://github.com/PyTorchLightning/pytorch-lightning/issues/10461)) - - - Fixed sampler replacement logic with `overfit_batches` to only replace the sample when `SequentialSampler` is not used ([#10486](https://github.com/PyTorchLightning/pytorch-lightning/issues/10486)) - - - Fixed scripting causing false positive deprecation warnings ([#10470](https://github.com/PyTorchLightning/pytorch-lightning/pull/10470), [#10555](https://github.com/PyTorchLightning/pytorch-lightning/pull/10555)) -- Fixed `isinstance` not working with `init_meta_context`, materialized model not being moved to the device ([#10493](https://github.com/PyTorchLightning/metrics/pull/10493)) - - -- Fixed an issue that prevented the Trainer to shutdown workers when execution is interrupted due to failure([#10463](https://github.com/PyTorchLightning/pytorch-lightning/issues/10463)) - - -- Squeeze the early stopping monitor to remove empty tensor dimensions 
([#10461](https://github.com/PyTorchLightning/pytorch-lightning/issues/10461)) - - -- - - ## [1.5.1] - 2021-11-09 ### Fixed From a7074381c74390eecd462d34cb61d995c95aa47e Mon Sep 17 00:00:00 2001 From: Sean Naren Date: Tue, 16 Nov 2021 11:42:25 +0000 Subject: [PATCH 028/123] [DeepSpeed] Do not fail if batch size could not be inferred for logging (#10438) (cherry picked from commit e98ace3adc2ec468d5376e39241d5c97d52dbbf1) --- CHANGELOG.md | 4 +++ .../plugins/training_type/deepspeed.py | 21 ++++++++++------ tests/plugins/test_deepspeed_plugin.py | 25 +++++-------------- 3 files changed, 23 insertions(+), 27 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cfb5c3e52e815..149ecbfd54087 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed sampler replacement logic with `overfit_batches` to only replace the sample when `SequentialSampler` is not used ([#10486](https://github.com/PyTorchLightning/pytorch-lightning/issues/10486)) - Fixed scripting causing false positive deprecation warnings ([#10470](https://github.com/PyTorchLightning/pytorch-lightning/pull/10470), [#10555](https://github.com/PyTorchLightning/pytorch-lightning/pull/10555)) +### Changed + +- Do not fail if batch size could not be inferred for logging when using DeepSpeed ([#10438](https://github.com/PyTorchLightning/pytorch-lightning/issues/10438)) + ## [1.5.1] - 2021-11-09 diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 8dc42c2f36b88..13f518d53ec00 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -622,11 +622,6 @@ def _format_batch_size_and_grad_accum_config(self): ) self.config["gradient_accumulation_steps"] = self.lightning_module.trainer.accumulate_grad_batches if "train_micro_batch_size_per_gpu" not in self.config: - rank_zero_warn( - 
"Inferring the batch size for internal deepspeed logging from the `train_dataloader()`. " - "If you require skipping this, please pass " - "`Trainer(strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size))`" - ) batch_size = self._auto_select_batch_size() self.config["train_micro_batch_size_per_gpu"] = batch_size if "gradient_clipping" not in self.config: @@ -638,9 +633,19 @@ def _auto_select_batch_size(self): batch_size = 1 train_dl_source = self.lightning_module.trainer._data_connector._train_dataloader_source if train_dl_source.is_defined(): - train_dataloader = train_dl_source.dataloader() - if hasattr(train_dataloader, "batch_sampler"): - batch_size = train_dataloader.batch_sampler.batch_size + try: + train_dataloader = train_dl_source.dataloader() + if hasattr(train_dataloader, "batch_sampler"): + batch_size = train_dataloader.batch_sampler.batch_size + # broad exception on purpose as `source.dataloader()` will fail if the dataloader requires `setup` + # to have been called before + except Exception: + if self.global_rank == 0: + deepspeed.utils.logging.logger.warning( + "Tried to infer the batch size for internal deepspeed logging from the `train_dataloader()`. " + "To ensure DeepSpeed logging remains correct, please manually pass the plugin with the " + "batch size, `Trainer(strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size))`." 
+ ) return batch_size def _format_precision_config(self): diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 981d0d5db8cf6..c5b71e908795b 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -1,5 +1,6 @@ import contextlib import json +import logging import os from typing import Any, Dict, Optional from unittest import mock @@ -872,24 +873,9 @@ def training_step(self, batch, batch_idx): trainer.fit(model) -@RunIf(min_gpus=1, deepspeed=True, special=True) -def test_deepspeed_warn_train_dataloader_called(tmpdir): - """Test DeepSpeed warns when it calls ``lightning_module.train_dataloader`` internally for logging batch - size.""" - model = BoringModel() - trainer = Trainer( - default_root_dir=tmpdir, - strategy=DeepSpeedPlugin(), - gpus=1, - fast_dev_run=True, - ) - with pytest.warns(UserWarning, match="Inferring the batch size for internal deepspeed logging"): - trainer.fit(model) - - @RunIf(min_gpus=1, deepspeed=True, special=True) def test_deepspeed_setup_train_dataloader(tmpdir): - """Test DeepSpeed works when setup is required to call, and the user passes the batch size manually.""" + """Test DeepSpeed works when setup is required to call in the DataModule.""" class TestSetupIsCalledDataModule(LightningDataModule): def __init__(self): @@ -914,13 +900,14 @@ def test_dataloader(self): model = BoringModel() trainer = Trainer( default_root_dir=tmpdir, - strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=32), + strategy=DeepSpeedPlugin(logging_level=logging.INFO), gpus=1, fast_dev_run=True, ) dm = TestSetupIsCalledDataModule() - trainer.fit(model, datamodule=dm) - trainer.test(model, datamodule=dm) + with mock.patch("deepspeed.utils.logging.logger.warning", autospec=True) as mock_object: + trainer.fit(model, datamodule=dm) + assert any("Tried to infer the batch size" in str(arg) for arg in mock_object.call_args_list) @mock.patch("torch.optim.lr_scheduler.StepLR.step", 
autospec=True) From 0865ad1d66e903d0c89aafa4f6f2a6651f840a18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 16 Nov 2021 19:13:37 +0100 Subject: [PATCH 029/123] Fix propagation of device and dtype properties in Lite modules (#10559) --- CHANGELOG.md | 4 +--- pytorch_lightning/lite/wrappers.py | 3 ++- tests/lite/test_wrappers.py | 22 ++++++++++++++++++++++ 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 149ecbfd54087..1d23dc150648b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,10 +16,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Squeeze the early stopping monitor to remove empty tensor dimensions ([#10461](https://github.com/PyTorchLightning/pytorch-lightning/issues/10461)) - Fixed sampler replacement logic with `overfit_batches` to only replace the sample when `SequentialSampler` is not used ([#10486](https://github.com/PyTorchLightning/pytorch-lightning/issues/10486)) - Fixed scripting causing false positive deprecation warnings ([#10470](https://github.com/PyTorchLightning/pytorch-lightning/pull/10470), [#10555](https://github.com/PyTorchLightning/pytorch-lightning/pull/10555)) - -### Changed - - Do not fail if batch size could not be inferred for logging when using DeepSpeed ([#10438](https://github.com/PyTorchLightning/pytorch-lightning/issues/10438)) +- Fixed propagation of device and dtype information to submodules of LightningLite when they inherit from `DeviceDtypeModuleMixin` ([#10559](https://github.com/PyTorchLightning/pytorch-lightning/issues/10559)) ## [1.5.1] - 2021-11-09 diff --git a/pytorch_lightning/lite/wrappers.py b/pytorch_lightning/lite/wrappers.py index 615f461055204..ff95e89d1d2cf 100644 --- a/pytorch_lightning/lite/wrappers.py +++ b/pytorch_lightning/lite/wrappers.py @@ -24,6 +24,7 @@ from torch.utils.data import DataLoader from pytorch_lightning.accelerators import Accelerator +from pytorch_lightning.core.mixins import 
DeviceDtypeModuleMixin from pytorch_lightning.plugins import PrecisionPlugin from pytorch_lightning.utilities.apply_func import apply_to_collection, move_data_to_device @@ -64,7 +65,7 @@ def step(self, closure: Optional[Callable] = None) -> None: ) -class _LiteModule(nn.Module): +class _LiteModule(DeviceDtypeModuleMixin): def __init__(self, module: nn.Module, precision_plugin: PrecisionPlugin) -> None: """The LiteModule is a thin wrapper around the :class:`torch.nn.Module` and handles precision / autocast automatically for the forward pass. diff --git a/tests/lite/test_wrappers.py b/tests/lite/test_wrappers.py index 4993a10c8dbc2..c271d3b3163ed 100644 --- a/tests/lite/test_wrappers.py +++ b/tests/lite/test_wrappers.py @@ -17,6 +17,7 @@ import torch from torch.utils.data.dataloader import DataLoader +from pytorch_lightning.core.mixins import DeviceDtypeModuleMixin from pytorch_lightning.lite import LightningLite from pytorch_lightning.lite.wrappers import _LiteDataLoader, _LiteModule, _LiteOptimizer from tests.helpers.runif import RunIf @@ -65,6 +66,27 @@ def check_autocast(forward_input): assert out.dtype == input_type or out.dtype == torch.get_default_dtype() +@pytest.mark.parametrize( + "device", [torch.device("cpu"), pytest.param(torch.device("cuda", 0), marks=RunIf(min_gpus=1))] +) +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16]) +def test_lite_module_device_dtype_propagation(device, dtype): + """Test that the LiteModule propagates device and dtype properties to its submodules (e.g. 
torchmetrics).""" + + class DeviceModule(DeviceDtypeModuleMixin): + pass + + device_module = DeviceModule() + lite_module = _LiteModule(device_module, Mock()) + lite_module.to(device) + assert device_module.device == device + assert lite_module.device == device + + lite_module.to(dtype) + assert device_module.dtype == dtype + assert lite_module.dtype == dtype + + def test_lite_dataloader_iterator(): """Test that the iteration over a LiteDataLoader wraps the iterator of the underlying dataloader (no automatic device placement).""" From b31ab514aa5abddeed796a59c200bf283b5f3133 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 16 Nov 2021 20:23:19 +0100 Subject: [PATCH 030/123] update version --- pytorch_lightning/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/__about__.py b/pytorch_lightning/__about__.py index bc9aefd1d2dad..2cfcb2c1778f8 100644 --- a/pytorch_lightning/__about__.py +++ b/pytorch_lightning/__about__.py @@ -1,7 +1,7 @@ import time _this_year = time.strftime("%Y") -__version__ = "1.5.2" +__version__ = "1.5.3" __author__ = "William Falcon et al." 
__author_email__ = "waf2107@columbia.edu" __license__ = "Apache-2.0" From 1510e154c5bd5d354850ffc1525c55fff3967083 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 19 Nov 2021 12:56:39 +0100 Subject: [PATCH 031/123] Remove redundant fit call from accelerator connector test (#10626) --- tests/accelerators/test_accelerator_connector.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 810f96bfdd08d..0ac22f5f204ba 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -336,8 +336,6 @@ def test_accelerator_choice_ddp_cpu_and_plugin_spawn(tmpdir): def _test_accelerator_choice_ddp_cpu_and_plugin(tmpdir, ddp_plugin_class): - - model = BoringModel() trainer = Trainer( default_root_dir=tmpdir, plugins=[ddp_plugin_class(find_unused_parameters=True)], @@ -349,7 +347,6 @@ def _test_accelerator_choice_ddp_cpu_and_plugin(tmpdir, ddp_plugin_class): assert isinstance(trainer.accelerator, CPUAccelerator) assert trainer.training_type_plugin.num_processes == 2 assert trainer.training_type_plugin.parallel_devices == [torch.device("cpu")] * 2 - trainer.fit(model) @mock.patch.dict( From 81dea1ac23578ca54bedff584b90a7fc95ebaecb Mon Sep 17 00:00:00 2001 From: ananthsub Date: Fri, 19 Nov 2021 09:34:23 -0800 Subject: [PATCH 032/123] Check torch.distributed availability before sharded tensor state dict hook registration (#10621) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- CHANGELOG.md | 26 ++++++++++++++++++++++++++ pytorch_lightning/core/lightning.py | 5 +++-- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1d23dc150648b..c8bbed3eea4ca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,32 @@ All notable changes to this project will be documented in this file. 
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). +## [1.5.3] - 2021-11-23 + +### Fixed + +- When a tensor is logged with `self.log`, run its computation with the same `dtype` ([#10076](https://github.com/PyTorchLightning/pytorch-lightning/pull/10076)) + + +- Fixed `ShardedTensor` state dict hook registration to check if torch distributed is available ([#10621](https://github.com/PyTorchLightning/pytorch-lightning/pull/10621)) + + +- Fixed LigtningLite `_wrap_init` popping unexisting keys from DataLoader signature parameters ([#10613](https://github.com/PyTorchLightning/pytorch-lightning/pull/10613)) + + +- Fixed signals being registered within threads ([#10610](https://github.com/PyTorchLightning/pytorch-lightning/pull/10610)) + + +- Fixed an issue that caused Lightning to extract the batch size even though it was set by the user in `LightningModule.log` ([#10408](https://github.com/PyTorchLightning/pytorch-lightning/pull/10408)) + + +- + + +- + + + ## [1.5.2] - 2021-11-16 ### Fixed diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index bc89cc2b18e93..4d327684978b6 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -46,7 +46,7 @@ ) from pytorch_lightning.utilities.apply_func import apply_to_collection, convert_to_tensors from pytorch_lightning.utilities.cloud_io import get_filesystem -from pytorch_lightning.utilities.distributed import distributed_available, sync_ddp +from pytorch_lightning.utilities.distributed import distributed_available, rank_zero_debug, sync_ddp from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.memory import get_model_size_mb from pytorch_lightning.utilities.model_summary import ModelSummary, summarize @@ -2059,7 +2059,8 @@ def _register_sharded_tensor_state_dict_hooks_if_available(self) -> None: These hooks ensure that ShardedTensors are included when saving, and are loaded 
the LightningModule correctly. """ - if not _TORCH_GREATER_EQUAL_1_10 or _IS_WINDOWS: + if not _TORCH_GREATER_EQUAL_1_10 or _IS_WINDOWS or not torch.distributed.is_available(): + rank_zero_debug("Could not register sharded tensor state dict hooks") return from torch.distributed._sharded_tensor import pre_load_state_dict_hook, state_dict_hook From 302c611d91c5e8029a4302cef54d98491d981ac2 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Fri, 19 Nov 2021 14:04:33 +0000 Subject: [PATCH 033/123] Lite: Don't pop value if they don't exist (#10613) --- pytorch_lightning/lite/wrappers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/lite/wrappers.py b/pytorch_lightning/lite/wrappers.py index ff95e89d1d2cf..6b8e44b610352 100644 --- a/pytorch_lightning/lite/wrappers.py +++ b/pytorch_lightning/lite/wrappers.py @@ -114,8 +114,8 @@ def _wrap_init(f: Callable) -> Callable: @functools.wraps(f) def wrapper(module: Any, *args: Any, **kwargs: Dict[str, Any]) -> None: params = dict(inspect.signature(module._old_init).parameters) - params.pop("args") - params.pop("kwargs") + params.pop("args", None) + params.pop("kwargs", None) for init_name, init_arg in chain(zip(params, args), kwargs.items()): setattr(module, init_name, init_arg) f(module, *args, **kwargs) From 8eee9b3cd97800836dc81df13a64a40f18977ad6 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Fri, 19 Nov 2021 22:18:26 +0530 Subject: [PATCH 034/123] Fix batch size extraction when set by the user in `LightningModule.log` (#10408) Co-authored-by: Carlos Mocholi --- .../loops/epoch/training_epoch_loop.py | 6 +-- .../logger_connector/logger_connector.py | 23 ++++----- .../connectors/logger_connector/result.py | 51 +++++++++---------- pytorch_lightning/utilities/data.py | 5 +- tests/deprecated_api/__init__.py | 28 +++++++--- tests/loops/test_loop_state_dict.py | 13 +++-- .../logging_/test_train_loop_logging.py | 38 ++++++++++++-- tests/utilities/test_data.py | 7 ++- 8 files changed, 109 
insertions(+), 62 deletions(-) diff --git a/pytorch_lightning/loops/epoch/training_epoch_loop.py b/pytorch_lightning/loops/epoch/training_epoch_loop.py index 21d89a8be8b52..8ddca3ad505e8 100644 --- a/pytorch_lightning/loops/epoch/training_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -161,9 +161,7 @@ def advance(self, *args: Any, **kwargs: Any) -> None: self.batch_progress.increment_ready() - # cache the batch size value to avoid extracting it again after the batch loop runs as the value will be - # different if tbptt is enabled - batch_size = self.trainer.logger_connector.on_batch_start(batch_idx, batch) + self.trainer.logger_connector.on_batch_start(batch_idx, batch) if batch is None: self._warning_cache.warn("train_dataloader yielded None. If this was on purpose, ignore this warning...") @@ -194,8 +192,6 @@ def advance(self, *args: Any, **kwargs: Any) -> None: with self.trainer.profiler.profile("run_training_batch"): batch_output = self.batch_loop.run(batch, batch_idx) - self.trainer._results.batch_size = batch_size - self.batch_progress.increment_processed() # update non-plateau LR schedulers diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index 37fcb06a1dc24..4b56aefb9809f 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -210,7 +210,6 @@ def update_eval_epoch_metrics(self) -> List[_OUT_DICT]: def on_train_split_start(self, split_idx: int, split_batch: Any) -> None: self._split_idx = split_idx - self.on_new_batch(split_batch) def update_train_step_metrics(self) -> None: if self.trainer.fit_loop._should_accumulate() and self.trainer.lightning_module.automatic_optimization: @@ -253,28 +252,23 @@ def _log_gpus_metrics(self) -> None: Utilities and properties """ - def on_new_batch(self, batch: Any) -> 
int: - # when the user requests `dataloader_iter`, we can't track the batch_size - # and this is left to user responsibility. - if not isinstance(batch, pl.utilities.fetching.StepFuncDataLoaderIter): - assert self.trainer._results is not None - return self.trainer._results.extract_batch_size(batch) - return 1 - def on_epoch_start(self) -> None: self._epoch_end_reached = False - def on_batch_start(self, batch_idx: int, batch: Any) -> int: + def on_batch_start(self, batch_idx: int, batch: Any) -> None: self._batch_idx = batch_idx self._epoch_end_reached = False - return self.on_new_batch(batch) + + assert self.trainer._results is not None + # attach reference to the new batch and remove the cached batch_size + self.trainer._results.batch = batch + self.trainer._results.batch_size = None def epoch_end_reached(self) -> None: self._epoch_end_reached = True self._batch_idx = None self._split_idx = None assert self.trainer._results is not None - self.trainer._results.batch_size = 1 def on_epoch_end(self) -> None: assert self._epoch_end_reached @@ -291,6 +285,11 @@ def on_batch_end(self) -> None: self._callback_metrics.update(metrics["callback"]) self._logged_metrics.update(metrics["log"]) + assert self.trainer._results is not None + # drop the reference to current batch and batch_size + self.trainer._results.batch = None + self.trainer._results.batch_size = None + def should_reset_tensors(self, fx: str) -> bool: is_different_fx = self._current_fx != fx if self._split_idx is None: diff --git a/pytorch_lightning/trainer/connectors/logger_connector/result.py b/pytorch_lightning/trainer/connectors/logger_connector/result.py index 53034ac77db3f..06da64bf3ed8e 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -211,7 +211,7 @@ def __init__(self, metadata: _Metadata, is_tensor: bool) -> None: if self.meta.is_mean_reduction: self.add_state("cumulated_batch_size", torch.tensor(0, 
dtype=torch.float), dist_reduce_fx=torch.sum) - def update(self, value: _IN_METRIC, batch_size: torch.Tensor) -> None: + def update(self, value: _IN_METRIC, batch_size: int) -> None: if self.is_tensor: value = value.float() if self.meta.on_step: @@ -250,7 +250,7 @@ def reset(self) -> None: self.value.reset() self.has_reset = True - def forward(self, value: _IN_METRIC, batch_size: torch.Tensor) -> None: + def forward(self, value: _IN_METRIC, batch_size: int) -> None: if self.meta.enable_graph: with torch.no_grad(): self.update(value, batch_size) @@ -376,8 +376,9 @@ class ResultCollection(dict): def __init__(self, training: bool, device: Optional[Union[str, torch.device]] = None) -> None: super().__init__() self.training = training - self._batch_size = torch.tensor(1, device=device) self.device: Optional[Union[str, torch.device]] = device + self.batch: Optional[Any] = None + self.batch_size: Optional[int] = None @property def result_metrics(self) -> List[ResultMetric]: @@ -390,14 +391,23 @@ def append_fn(v: ResultMetric) -> None: apply_to_collection(list(self.values()), ResultMetric, append_fn) return o - @property - def batch_size(self) -> torch.Tensor: - # performance: cache the `batch_size` tensor instead of re-creating it - return self._batch_size + def _extract_batch_size(self, batch_size: Optional[int], meta: _Metadata) -> int: + # check if we have extracted the batch size already + if batch_size is None: + batch_size = self.batch_size + + if batch_size is not None: + return batch_size - @batch_size.setter - def batch_size(self, value: int) -> None: - self._batch_size = torch.tensor(value, device=self.device) + batch_size = 1 + if self.batch is not None and meta.on_epoch and meta.is_mean_reduction: + try: + batch_size = extract_batch_size(self.batch) + self.batch_size = batch_size + except RecursionError: + pass + + return batch_size def log( self, @@ -458,10 +468,8 @@ def log( f"You called `self.log({name}, ...)` twice in `{fx}` with different arguments. 
This is not allowed" ) - if batch_size is not None: - self.batch_size = batch_size - - self.update_metrics(key, value) + batch_size = self._extract_batch_size(batch_size, meta) + self.update_metrics(key, value, batch_size) def register_key(self, key: str, meta: _Metadata, value: _METRIC_COLLECTION) -> None: """Create one ResultMetric object per value. @@ -478,10 +486,10 @@ def fn(v: _IN_METRIC) -> ResultMetric: value = ResultMetricCollection(value) self[key] = value - def update_metrics(self, key: str, value: _METRIC_COLLECTION) -> None: - def fn(result_metric: ResultMetric, v: ResultMetric) -> None: + def update_metrics(self, key: str, value: _METRIC_COLLECTION, batch_size: int) -> None: + def fn(result_metric: ResultMetric, v: torch.Tensor) -> None: # performance: avoid calling `__call__` to avoid the checks in `torch.nn.Module._call_impl` - result_metric.forward(v.to(self.device), self.batch_size) + result_metric.forward(v.to(self.device), batch_size) result_metric.has_reset = False apply_to_collections(self[key], value, ResultMetric, fn) @@ -575,19 +583,10 @@ def fn(item: ResultMetric) -> None: apply_to_collection(self, ResultMetric, fn) - def extract_batch_size(self, batch: Any) -> int: - try: - batch_size = extract_batch_size(batch) - except RecursionError: - batch_size = 1 - self.batch_size = batch_size # the setter converts it to `Tensor` - return batch_size - def to(self, *args: Any, **kwargs: Any) -> "ResultCollection": """Move all data to the given device.""" self.update(apply_to_collection(dict(self), (torch.Tensor, Metric), move_data_to_device, *args, **kwargs)) - self._batch_size = self._batch_size.to(*args, **kwargs) if "device" in kwargs: self.device = kwargs["device"] return self diff --git a/pytorch_lightning/utilities/data.py b/pytorch_lightning/utilities/data.py index a75afa775848b..e6cfdcd953e61 100644 --- a/pytorch_lightning/utilities/data.py +++ b/pytorch_lightning/utilities/data.py @@ -29,7 +29,10 @@ def _extract_batch_size(batch: BType) -> 
Generator[int, None, None]: if isinstance(batch, torch.Tensor): - yield batch.size(0) + if batch.ndim == 0: + yield 1 + else: + yield batch.size(0) elif isinstance(batch, str): yield len(batch) elif isinstance(batch, (Iterable, Mapping)): diff --git a/tests/deprecated_api/__init__.py b/tests/deprecated_api/__init__.py index 1026981f75307..91c7ef1c1f880 100644 --- a/tests/deprecated_api/__init__.py +++ b/tests/deprecated_api/__init__.py @@ -14,7 +14,7 @@ """Test deprecated functionality which will be removed in vX.Y.Z.""" import sys from contextlib import contextmanager -from typing import Optional +from typing import Optional, Type import pytest @@ -26,14 +26,28 @@ def _soft_unimport_module(str_module): @contextmanager -def no_deprecated_call(match: Optional[str] = None): +def no_warning_call(expected_warning: Type[Warning] = UserWarning, match: Optional[str] = None): with pytest.warns(None) as record: yield + + if match is None: try: - w = record.pop(DeprecationWarning) - if match is not None and match not in str(w.message): - return + w = record.pop(expected_warning) except AssertionError: - # no DeprecationWarning raised + # no warning raised + return + else: + for w in record.list: + if w.category is expected_warning and match in w.message.args[0]: + break + else: return - raise AssertionError(f"`DeprecationWarning` was raised: {w}") + + msg = "A warning" if expected_warning is None else f"`{expected_warning.__name__}`" + raise AssertionError(f"{msg} was raised: {w}") + + +@contextmanager +def no_deprecated_call(match: Optional[str] = None): + with no_warning_call(expected_warning=DeprecationWarning, match=match): + yield diff --git a/tests/loops/test_loop_state_dict.py b/tests/loops/test_loop_state_dict.py index 717d625f6c44e..72eeb197e9e57 100644 --- a/tests/loops/test_loop_state_dict.py +++ b/tests/loops/test_loop_state_dict.py @@ -14,7 +14,6 @@ from unittest.mock import Mock import pytest -import torch from pytorch_lightning.loops import FitLoop from 
pytorch_lightning.trainer.trainer import Trainer @@ -80,14 +79,16 @@ def test_loops_state_dict_structure(): "is_last_batch": False, }, "epoch_loop.val_loop._results": { + "batch": None, + "batch_size": None, "training": False, - "_batch_size": torch.tensor(1), "device": None, "items": {}, }, "epoch_loop._results": { + "batch": None, + "batch_size": None, "training": True, - "_batch_size": torch.tensor(1), "device": None, "items": {}, }, @@ -106,8 +107,9 @@ def test_loops_state_dict_structure(): "is_last_batch": False, }, "_results": { + "batch": None, + "batch_size": None, "training": False, - "_batch_size": torch.tensor(1), "device": None, "items": {}, }, @@ -122,8 +124,9 @@ def test_loops_state_dict_structure(): "is_last_batch": False, }, "_results": { + "batch": None, + "batch_size": None, "training": False, - "_batch_size": torch.tensor(1), "device": None, "items": {}, }, diff --git a/tests/trainer/logging_/test_train_loop_logging.py b/tests/trainer/logging_/test_train_loop_logging.py index 22a1a2c90d756..0ec61358d9408 100644 --- a/tests/trainer/logging_/test_train_loop_logging.py +++ b/tests/trainer/logging_/test_train_loop_logging.py @@ -27,6 +27,7 @@ from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, TQDMProgressBar from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.utilities.exceptions import MisconfigurationException +from tests.deprecated_api import no_warning_call from tests.helpers.boring_model import BoringModel, RandomDataset, RandomDictDataset from tests.helpers.runif import RunIf @@ -715,19 +716,15 @@ def on_validation_epoch_end(self): assert all(v == 3 for v in self.trainer.callback_metrics.values()) def on_train_batch_start(self, batch, batch_idx): - assert self.trainer._results.batch_size == 2 self.log("on_train_batch_start", 1.0, reduce_fx="sum") def on_train_batch_end(self, outputs, batch, batch_idx): - assert self.trainer._results.batch_size == 2 self.log("on_train_batch_end", 1.0, 
reduce_fx="sum") def on_validation_batch_start(self, batch, batch_idx, dataloader_idx): - assert self.trainer._results.batch_size == 2 self.log("on_validation_batch_start", 1.0, reduce_fx="sum") def on_validation_batch_end(self, outputs, batch, batch_idx, dataloader_idx): - assert self.trainer._results.batch_size == 2 self.log("on_validation_batch_end", 1.0, reduce_fx="sum") def training_epoch_end(self, *_) -> None: @@ -749,3 +746,36 @@ def validation_epoch_end(self, *_) -> None: train_data = DataLoader(RandomDataset(32, 64), batch_size=2) val_data = DataLoader(RandomDataset(32, 64), batch_size=2) trainer.fit(model, train_dataloaders=train_data, val_dataloaders=val_data) + + +def test_no_batch_size_extraction_with_specifying_explictly(tmpdir): + batch_size = BoringModel().train_dataloader().batch_size + 1 + fast_dev_run = 2 + log_val = 7 + + class CustomBoringModel(BoringModel): + def on_before_batch_transfer(self, batch, *args, **kwargs): + # This is an ambiguous batch which have multiple potential batch sizes + if self.trainer.training: + batch = {"batch1": torch.randn(batch_size, 10), "batch2": batch} + return batch + + def training_step(self, batch, batch_idx): + self.log("step_log_val", log_val, on_epoch=False) + self.log("epoch_log_val", log_val, batch_size=batch_size, on_step=False, on_epoch=True) + self.log("epoch_sum_log_val", log_val, on_epoch=True, reduce_fx="sum") + return super().training_step(batch["batch2"], batch_idx) + + def on_train_epoch_end(self, *args, **kwargs): + results = self.trainer._results + assert results["training_step.step_log_val"].value == log_val + assert results["training_step.step_log_val"].cumulated_batch_size == 0 + assert results["training_step.epoch_log_val"].value == log_val * batch_size * fast_dev_run + assert results["training_step.epoch_log_val"].cumulated_batch_size == batch_size * fast_dev_run + assert results["training_step.epoch_sum_log_val"].value == log_val * fast_dev_run + + model = CustomBoringModel() + trainer = 
Trainer(default_root_dir=tmpdir, fast_dev_run=fast_dev_run) + + with no_warning_call(match="Trying to infer the `batch_size`"): + trainer.fit(model) diff --git a/tests/utilities/test_data.py b/tests/utilities/test_data.py index acbe645515f55..f4c61cda64f5d 100644 --- a/tests/utilities/test_data.py +++ b/tests/utilities/test_data.py @@ -12,6 +12,7 @@ warning_cache, ) from pytorch_lightning.utilities.exceptions import MisconfigurationException +from tests.deprecated_api import no_warning_call from tests.helpers.boring_model import BoringModel, RandomDataset, RandomIterableDataset @@ -19,9 +20,8 @@ def test_extract_batch_size(): """Tests the behavior of extracting the batch size.""" def _check_warning_not_raised(data, expected): - with pytest.warns(None) as record: + with no_warning_call(match="Trying to infer the `batch_size`"): assert extract_batch_size(data) == expected - assert len(record) == 0 def _check_warning_raised(data, expected): with pytest.warns(UserWarning, match=f"Trying to infer the `batch_size` .* we found is {expected}."): @@ -43,6 +43,9 @@ def _check_warning_raised(data, expected): batch = {"test": [{"test": [torch.zeros(11, 10)]}]} _check_warning_not_raised(batch, 11) + batch = {"a": [torch.tensor(1), torch.tensor(2)], "b": torch.tensor([1, 2, 3, 4])} + _check_warning_raised(batch, 1) + batch = {"test": [{"test": [torch.zeros(11, 10), torch.zeros(10, 10)]}]} _check_warning_raised(batch, 11) From 2bb43435049fe576d9d1472f204adf81a1d676ea Mon Sep 17 00:00:00 2001 From: Biho-Kim <22164993+qqueing@users.noreply.github.com> Date: Sat, 20 Nov 2021 00:16:33 +0900 Subject: [PATCH 035/123] Respect the passed dtype with `self.log` (#10076) Co-authored-by: Carlos Mocholi Co-authored-by: Rohit Gupta --- CHANGELOG.md | 2 ++ .../logger_connector/logger_connector.py | 10 ++++-- .../connectors/logger_connector/result.py | 15 ++++++-- tests/core/test_metric_result_integration.py | 34 +++++++++++++++++-- .../trainer/logging_/test_logger_connector.py | 4 +-- 5 files 
changed, 55 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c8bbed3eea4ca..40be4c31900a7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed `ShardedTensor` state dict hook registration to check if torch distributed is available ([#10621](https://github.com/PyTorchLightning/pytorch-lightning/pull/10621)) +- When a tensor is logged with `self.log`, run its computation with the same `dtype` ([#10076](https://github.com/PyTorchLightning/pytorch-lightning/pull/10076)) + - Fixed LigtningLite `_wrap_init` popping unexisting keys from DataLoader signature parameters ([#10613](https://github.com/PyTorchLightning/pytorch-lightning/pull/10613)) diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index 4b56aefb9809f..d970d98c602bc 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -154,6 +154,12 @@ def update_eval_step_metrics(self) -> None: # increment the step even if nothing was logged self._increment_eval_log_step() + @staticmethod + def _filter_metrics_for_dataloader( + dl_idx: int, metrics: _OUT_DICT, metric_prefix: str = "dataloader_idx" + ) -> _OUT_DICT: + return {k: v for k, v in metrics.items() if metric_prefix not in k or k.endswith(f"{metric_prefix}_{dl_idx}")} + def _prepare_eval_loop_results(self, metrics: _OUT_DICT) -> None: if self.trainer.sanity_checking: return @@ -162,9 +168,7 @@ def _prepare_eval_loop_results(self, metrics: _OUT_DICT) -> None: has_been_initialized = len(self.eval_loop_results) == num_dataloaders for dl_idx in range(self.trainer._evaluation_loop.num_dataloaders): # remove callback metrics that don't belong to this dataloader - callback_metrics = { - k: v for k, v in metrics.items() if 
"dataloader_idx" not in k or f"dataloader_idx_{dl_idx}" in k - } + callback_metrics = self._filter_metrics_for_dataloader(dl_idx, metrics) if has_been_initialized: self.eval_loop_results[dl_idx].update(callback_metrics) else: diff --git a/pytorch_lightning/trainer/connectors/logger_connector/result.py b/pytorch_lightning/trainer/connectors/logger_connector/result.py index 06da64bf3ed8e..1b82baf0440c9 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -207,13 +207,22 @@ def __init__(self, metadata: _Metadata, is_tensor: bool) -> None: self.meta = metadata self.has_reset = False if is_tensor: - self.add_state("value", torch.tensor(0, dtype=torch.float), dist_reduce_fx=torch.sum) + # do not set a dtype in case the default dtype was changed + self.add_state("value", torch.tensor(0.0), dist_reduce_fx=torch.sum) if self.meta.is_mean_reduction: - self.add_state("cumulated_batch_size", torch.tensor(0, dtype=torch.float), dist_reduce_fx=torch.sum) + self.add_state("cumulated_batch_size", torch.tensor(0), dist_reduce_fx=torch.sum) def update(self, value: _IN_METRIC, batch_size: int) -> None: if self.is_tensor: - value = value.float() + if not torch.is_floating_point(value): + dtype = torch.get_default_dtype() + warning_cache.warn( + # do not include the value to avoid cache misses + f"You called `self.log({self.meta.name!r}, ...)` in your `{self.meta.fx}` but the value needs to" + f" be floating point. Converting it to {dtype}." 
+ ) + value = value.to(dtype) + if self.meta.on_step: self._forward_cache = self.meta.sync(value.clone()) # `clone` because `sync` is in-place diff --git a/tests/core/test_metric_result_integration.py b/tests/core/test_metric_result_integration.py index 12fe7f2fb4652..6b2d965c5add5 100644 --- a/tests/core/test_metric_result_integration.py +++ b/tests/core/test_metric_result_integration.py @@ -552,12 +552,42 @@ def on_train_epoch_end(self) -> None: def test_metric_result_computed_check(): """Unittest ``_get_cache`` with multielement tensors.""" - sync = _Sync() metadata = _Metadata("foo", "bar", on_epoch=True, enable_graph=True) - metadata.sync = sync + metadata.sync = _Sync() rm = ResultMetric(metadata, is_tensor=True) computed_value = torch.tensor([1, 2, 3]) rm._computed = computed_value cache = ResultCollection._get_cache(rm, on_step=False) # `enable_graph=True` so no detach, identity works assert cache is computed_value + + +@pytest.mark.parametrize("floating_dtype", (torch.float, torch.double)) +def test_metric_result_respects_dtype(floating_dtype): + torch.set_default_dtype(floating_dtype) + fixed_dtype = torch.long # default by PyTorch + + metadata = _Metadata("foo", "bar") + metadata.sync = _Sync() + rm = ResultMetric(metadata, is_tensor=True) + + assert rm.value.dtype == floating_dtype + assert rm.cumulated_batch_size.dtype == fixed_dtype + + # two fixed point numbers - should be converted + value, batch_size = torch.tensor(2), torch.tensor(3) + assert value.dtype == fixed_dtype + with pytest.warns( + UserWarning, match=rf"`self.log\('bar', ...\)` in your `foo` .* Converting it to {floating_dtype}" + ): + rm.update(value, batch_size) + # floating and fixed + rm.update(torch.tensor(4.0), torch.tensor(5)) + + total = rm.compute() + + assert total == (2 * 3 + 4 * 5) / (5 + 3) + assert total.dtype == floating_dtype + + # restore to avoid impacting other tests + torch.set_default_dtype(torch.float) diff --git a/tests/trainer/logging_/test_logger_connector.py 
b/tests/trainer/logging_/test_logger_connector.py index d26245a377897..c6afc1ef60503 100644 --- a/tests/trainer/logging_/test_logger_connector.py +++ b/tests/trainer/logging_/test_logger_connector.py @@ -527,9 +527,9 @@ def _assert_called(model, fn, stage): def test_result_collection_on_tensor_with_mean_reduction(): - result_collection = ResultCollection(True, torch.device("cpu")) + result_collection = ResultCollection(True) product = [(True, True), (False, True), (True, False), (False, False)] - values = torch.arange(1, 10).float() # need to convert to float() due to precision issues using torch 1.4 + values = torch.arange(1, 10) batches = values * values for i, v in enumerate(values): From 024acd57c032364409cc087f448100faaa687347 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 17 Nov 2021 23:29:48 +0100 Subject: [PATCH 036/123] Simplify hanging queue test (#10591) --- .github/workflows/ci_test-base.yml | 2 +- .github/workflows/ci_test-conda.yml | 1 - .github/workflows/ci_test-full.yml | 2 -- tests/deprecated_api/test_remove_1-7.py | 17 +++++------------ 4 files changed, 6 insertions(+), 16 deletions(-) diff --git a/.github/workflows/ci_test-base.yml b/.github/workflows/ci_test-base.yml index e92249cab4030..e3d3ca2e4e82b 100644 --- a/.github/workflows/ci_test-base.yml +++ b/.github/workflows/ci_test-base.yml @@ -20,7 +20,7 @@ jobs: # this will install stable torch python-version: [3.9] - # Timeout: https://stackoverflow.com/a/59076067/4521646 + # lower timeout as this should run very quickly timeout-minutes: 20 steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml index edae03db7936b..9196034bf2757 100644 --- a/.github/workflows/ci_test-conda.yml +++ b/.github/workflows/ci_test-conda.yml @@ -17,7 +17,6 @@ jobs: python-version: ["3.8"] # previous to last Python version as that one is already used in test-full pytorch-version: ["1.6", "1.7", "1.8", "1.9", "1.10"] - # 
Timeout: https://stackoverflow.com/a/59076067/4521646 timeout-minutes: 35 steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml index 8be8fd1146864..43096316cbff6 100644 --- a/.github/workflows/ci_test-full.yml +++ b/.github/workflows/ci_test-full.yml @@ -34,8 +34,6 @@ jobs: - {os: windows-2019, python-version: "3.9", requires: "oldest", release: "stable"} - {os: macOS-10.15, python-version: "3.9", requires: "oldest", release: "stable"} - # Timeout: https://stackoverflow.com/a/59076067/4521646 - # TODO: the macOS is taking too long, probably caching did not work... timeout-minutes: 40 steps: diff --git a/tests/deprecated_api/test_remove_1-7.py b/tests/deprecated_api/test_remove_1-7.py index 47d72814cd2b6..e0e51575f5f22 100644 --- a/tests/deprecated_api/test_remove_1-7.py +++ b/tests/deprecated_api/test_remove_1-7.py @@ -15,7 +15,6 @@ from unittest import mock import pytest -import torch from pytorch_lightning import Callback, LightningDataModule, Trainer from pytorch_lightning.callbacks.gpu_stats_monitor import GPUStatsMonitor @@ -230,22 +229,16 @@ def test_v1_7_0_flush_logs_every_n_steps_trainer_constructor(tmpdir): class BoringCallbackDDPSpawnModel(BoringModel): - def __init__(self): - super().__init__() + def add_to_queue(self, queue): + ... - def add_to_queue(self, queue: torch.multiprocessing.SimpleQueue) -> None: - queue.put("test_val") - return super().add_to_queue(queue) + def get_from_queue(self, queue): + ... 
- def get_from_queue(self, queue: torch.multiprocessing.SimpleQueue) -> None: - self.test_val = queue.get() - return super().get_from_queue(queue) - -@RunIf(skip_windows=True, skip_49370=True) def test_v1_7_0_deprecate_add_get_queue(tmpdir): model = BoringCallbackDDPSpawnModel() - trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, num_processes=2, strategy="ddp_spawn") + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True) with pytest.deprecated_call(match=r"`LightningModule.add_to_queue` method was deprecated in v1.5"): trainer.fit(model) From 20529581ec6c934bedd6bd6ab09f45ebba985819 Mon Sep 17 00:00:00 2001 From: Jaime Ferrando Huertas Date: Fri, 19 Nov 2021 17:32:30 +0100 Subject: [PATCH 037/123] Added boring model as a ipynb so it can be updated (#10521) Co-authored-by: Carlos Mocholi --- .github/ISSUE_TEMPLATE/bug_report.md | 2 +- pl_examples/bug_report/The_BoringModel.ipynb | 1420 +++++++++++++++++ .../{ => bug_report}/bug_report_model.py | 0 tests/loops/test_loops.py | 3 +- 4 files changed, 1422 insertions(+), 3 deletions(-) create mode 100644 pl_examples/bug_report/The_BoringModel.ipynb rename pl_examples/{ => bug_report}/bug_report_model.py (100%) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 3a94ef6758910..44bcec54517d3 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -16,7 +16,7 @@ assignees: '' Please reproduce using the BoringModel! You can use the following Colab link: -https://colab.research.google.com/drive/1HvWVVTK8j2Nj52qU4Q4YCyzOm0_aLQF3?usp=sharing +https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/pl_examples/bug_report/The_BoringModel.ipynb IMPORTANT: has to be public. 
or this simple template: diff --git a/pl_examples/bug_report/The_BoringModel.ipynb b/pl_examples/bug_report/The_BoringModel.ipynb new file mode 100644 index 0000000000000..9b061c4283cbf --- /dev/null +++ b/pl_examples/bug_report/The_BoringModel.ipynb @@ -0,0 +1,1420 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "The BoringModel.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "GPU", + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "d79c1628eded487a974da18a2ea1f98b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "HBoxView", + "_dom_classes": [], + "_model_name": "HBoxModel", + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.5.0", + "box_style": "", + "layout": "IPY_MODEL_02695b143b764932ba8d0c08a872987e", + "_model_module": "@jupyter-widgets/controls", + "children": [ + "IPY_MODEL_28eb6a3218f64f26abcdff756ffda3ad", + "IPY_MODEL_02cfffd590014c3cbc44ab06c69f9181", + "IPY_MODEL_0d7c50e36cb84f01a57a9d7d8b913393" + ] + } + }, + "02695b143b764932ba8d0c08a872987e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + "_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": "row wrap", + "width": "100%", + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + 
"flex": null, + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + "grid_auto_columns": null, + "margin": null, + "display": "inline-flex", + "left": null + } + }, + "28eb6a3218f64f26abcdff756ffda3ad": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "HTMLView", + "style": "IPY_MODEL_6ba2782883ae424dbfc8868224d95da9", + "_dom_classes": [], + "description": "", + "_model_name": "HTMLModel", + "placeholder": "​", + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "value": "Epoch 0: 100%", + "_view_count": null, + "_view_module_version": "1.5.0", + "description_tooltip": null, + "_model_module": "@jupyter-widgets/controls", + "layout": "IPY_MODEL_baa4aacd0da64cf291fb31c000724573" + } + }, + "02cfffd590014c3cbc44ab06c69f9181": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "ProgressView", + "style": "IPY_MODEL_7dad3d2feced492a999fb6c91186be50", + "_dom_classes": [], + "description": "", + "_model_name": "FloatProgressModel", + "bar_style": "success", + "max": 2, + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "value": 2, + "_view_count": null, + "_view_module_version": "1.5.0", + "orientation": "horizontal", + "min": 0, + "description_tooltip": null, + "_model_module": "@jupyter-widgets/controls", + "layout": "IPY_MODEL_ea702a091eb642f7bdda81aa55db8c26" + } + }, + "0d7c50e36cb84f01a57a9d7d8b913393": { + "model_module": 
"@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "HTMLView", + "style": "IPY_MODEL_4802a47c6dfb439c83d8b860dce42006", + "_dom_classes": [], + "description": "", + "_model_name": "HTMLModel", + "placeholder": "​", + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "value": " 2/2 [00:00<00:00, 9.45it/s, loss=-0.618, v_num=0]", + "_view_count": null, + "_view_module_version": "1.5.0", + "description_tooltip": null, + "_model_module": "@jupyter-widgets/controls", + "layout": "IPY_MODEL_68c87e6a7fcf4e4eab98a941c7c3e867" + } + }, + "6ba2782883ae424dbfc8868224d95da9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "StyleView", + "_model_name": "DescriptionStyleModel", + "description_width": "", + "_view_module": "@jupyter-widgets/base", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.2.0", + "_model_module": "@jupyter-widgets/controls" + } + }, + "baa4aacd0da64cf291fb31c000724573": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + "_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": null, + "width": null, + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + "flex": null, + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, 
+ "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + "grid_auto_columns": null, + "margin": null, + "display": null, + "left": null + } + }, + "7dad3d2feced492a999fb6c91186be50": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "StyleView", + "_model_name": "ProgressStyleModel", + "description_width": "", + "_view_module": "@jupyter-widgets/base", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.2.0", + "bar_color": null, + "_model_module": "@jupyter-widgets/controls" + } + }, + "ea702a091eb642f7bdda81aa55db8c26": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + "_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": null, + "width": null, + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + "flex": "2", + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + 
"grid_auto_columns": null, + "margin": null, + "display": null, + "left": null + } + }, + "4802a47c6dfb439c83d8b860dce42006": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "StyleView", + "_model_name": "DescriptionStyleModel", + "description_width": "", + "_view_module": "@jupyter-widgets/base", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.2.0", + "_model_module": "@jupyter-widgets/controls" + } + }, + "68c87e6a7fcf4e4eab98a941c7c3e867": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + "_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": null, + "width": null, + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + "flex": null, + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + "grid_auto_columns": null, + "margin": null, + "display": null, + "left": null + } + }, + "e6cbe583c2e14986b4faeb27e31f73e1": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "HBoxView", + 
"_dom_classes": [], + "_model_name": "HBoxModel", + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.5.0", + "box_style": "", + "layout": "IPY_MODEL_672dd78899f944cea7e57f388f3ecb31", + "_model_module": "@jupyter-widgets/controls", + "children": [ + "IPY_MODEL_cd61dda59d104e0a8a8aa9bfc1e55c24", + "IPY_MODEL_1cd72d82332941a6929f88fad5173096", + "IPY_MODEL_92a38638060c4ed5b6d44a2078667e53" + ] + } + }, + "672dd78899f944cea7e57f388f3ecb31": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + "_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": "row wrap", + "width": "100%", + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + "flex": null, + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + "grid_auto_columns": null, + "margin": null, + "display": "inline-flex", + "left": null + } + }, + "cd61dda59d104e0a8a8aa9bfc1e55c24": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "HTMLView", + "style": "IPY_MODEL_bdc9b06391ee47478efd58cc91ca87ac", + 
"_dom_classes": [], + "description": "", + "_model_name": "HTMLModel", + "placeholder": "​", + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "value": "Validating: 0%", + "_view_count": null, + "_view_module_version": "1.5.0", + "description_tooltip": null, + "_model_module": "@jupyter-widgets/controls", + "layout": "IPY_MODEL_ee80657d62c6452d9e9ac199157cdf2a" + } + }, + "1cd72d82332941a6929f88fad5173096": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "ProgressView", + "style": "IPY_MODEL_eb16b87bcb8d4ca6a83e8b44ea2d1311", + "_dom_classes": [], + "description": "", + "_model_name": "FloatProgressModel", + "bar_style": "", + "max": 1, + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "value": 1, + "_view_count": null, + "_view_module_version": "1.5.0", + "orientation": "horizontal", + "min": 0, + "description_tooltip": null, + "_model_module": "@jupyter-widgets/controls", + "layout": "IPY_MODEL_2a6327dd568241e3acbb6aec1926bd80" + } + }, + "92a38638060c4ed5b6d44a2078667e53": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "HTMLView", + "style": "IPY_MODEL_a45aba8517e14654850453159780b54a", + "_dom_classes": [], + "description": "", + "_model_name": "HTMLModel", + "placeholder": "​", + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "value": " 0/1 [00:00<?, ?it/s]", + "_view_count": null, + "_view_module_version": "1.5.0", + "description_tooltip": null, + "_model_module": "@jupyter-widgets/controls", + "layout": "IPY_MODEL_7fb167222e7143b789b7f40af7cb39dd" + } + }, + "bdc9b06391ee47478efd58cc91ca87ac": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "StyleView", + 
"_model_name": "DescriptionStyleModel", + "description_width": "", + "_view_module": "@jupyter-widgets/base", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.2.0", + "_model_module": "@jupyter-widgets/controls" + } + }, + "ee80657d62c6452d9e9ac199157cdf2a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + "_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": null, + "width": null, + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + "flex": null, + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + "grid_auto_columns": null, + "margin": null, + "display": null, + "left": null + } + }, + "eb16b87bcb8d4ca6a83e8b44ea2d1311": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "StyleView", + "_model_name": "ProgressStyleModel", + "description_width": "", + "_view_module": "@jupyter-widgets/base", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.2.0", + "bar_color": null, + "_model_module": "@jupyter-widgets/controls" + } + }, + 
"2a6327dd568241e3acbb6aec1926bd80": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + "_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": null, + "width": null, + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + "flex": "2", + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + "grid_auto_columns": null, + "margin": null, + "display": null, + "left": null + } + }, + "a45aba8517e14654850453159780b54a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "StyleView", + "_model_name": "DescriptionStyleModel", + "description_width": "", + "_view_module": "@jupyter-widgets/base", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.2.0", + "_model_module": "@jupyter-widgets/controls" + } + }, + "7fb167222e7143b789b7f40af7cb39dd": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": 
"@jupyter-widgets/base", + "overflow": null, + "_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": null, + "width": null, + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + "flex": null, + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + "grid_auto_columns": null, + "margin": null, + "display": null, + "left": null + } + }, + "abe1c0c4dac94e0e9b894bb69c3ec450": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "HBoxView", + "_dom_classes": [], + "_model_name": "HBoxModel", + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.5.0", + "box_style": "", + "layout": "IPY_MODEL_23763e19d40d4020b3342a47366e2e19", + "_model_module": "@jupyter-widgets/controls", + "children": [ + "IPY_MODEL_0b7b7da6a6134f0fb26a05adc062ee6f", + "IPY_MODEL_9941635d9d694ba7bce0c7a14c500e5e", + "IPY_MODEL_c7f1407ba92f4dc6ba34bd9cf73fea69" + ] + } + }, + "23763e19d40d4020b3342a47366e2e19": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + "_model_module_version": 
"1.2.0", + "_view_count": null, + "flex_flow": "row wrap", + "width": "100%", + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + "flex": null, + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + "grid_auto_columns": null, + "margin": null, + "display": "inline-flex", + "left": null + } + }, + "0b7b7da6a6134f0fb26a05adc062ee6f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "HTMLView", + "style": "IPY_MODEL_86f2e0a558cc419e84ed9192ccd3d1b6", + "_dom_classes": [], + "description": "", + "_model_name": "HTMLModel", + "placeholder": "​", + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "value": "Testing: 100%", + "_view_count": null, + "_view_module_version": "1.5.0", + "description_tooltip": null, + "_model_module": "@jupyter-widgets/controls", + "layout": "IPY_MODEL_141a9c35ade14d9e8645b2c108ab4d66" + } + }, + "9941635d9d694ba7bce0c7a14c500e5e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "ProgressView", + "style": "IPY_MODEL_833bb79bb1214a3a88795f41b9375690", + "_dom_classes": [], + "description": "", + "_model_name": "FloatProgressModel", + "bar_style": "success", + "max": 1, + "_view_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "value": 1, + "_view_count": null, + "_view_module_version": "1.5.0", + "orientation": "horizontal", + "min": 0, + "description_tooltip": null, + "_model_module": "@jupyter-widgets/controls", + "layout": "IPY_MODEL_51b32955ad544803b1d78f07bc685569" + } + }, + "c7f1407ba92f4dc6ba34bd9cf73fea69": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "HTMLView", + "style": "IPY_MODEL_a6b2764a5fa9444a9d77e8d74c67ef47", + "_dom_classes": [], + "description": "", + "_model_name": "HTMLModel", + "placeholder": "​", + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "value": " 32/32 [00:00<00:00, 174.23it/s]", + "_view_count": null, + "_view_module_version": "1.5.0", + "description_tooltip": null, + "_model_module": "@jupyter-widgets/controls", + "layout": "IPY_MODEL_3f0c08c03e284ebb905dae8aca72fffc" + } + }, + "86f2e0a558cc419e84ed9192ccd3d1b6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "StyleView", + "_model_name": "DescriptionStyleModel", + "description_width": "", + "_view_module": "@jupyter-widgets/base", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.2.0", + "_model_module": "@jupyter-widgets/controls" + } + }, + "141a9c35ade14d9e8645b2c108ab4d66": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + "_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": null, + "width": null, + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": 
null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + "flex": null, + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + "grid_auto_columns": null, + "margin": null, + "display": null, + "left": null + } + }, + "833bb79bb1214a3a88795f41b9375690": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "StyleView", + "_model_name": "ProgressStyleModel", + "description_width": "", + "_view_module": "@jupyter-widgets/base", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.2.0", + "bar_color": null, + "_model_module": "@jupyter-widgets/controls" + } + }, + "51b32955ad544803b1d78f07bc685569": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + "_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": null, + "width": null, + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + "flex": "2", + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + 
"align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + "grid_auto_columns": null, + "margin": null, + "display": null, + "left": null + } + }, + "a6b2764a5fa9444a9d77e8d74c67ef47": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_view_name": "StyleView", + "_model_name": "DescriptionStyleModel", + "description_width": "", + "_view_module": "@jupyter-widgets/base", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.2.0", + "_model_module": "@jupyter-widgets/controls" + } + }, + "3f0c08c03e284ebb905dae8aca72fffc": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + "_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": null, + "width": null, + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + "flex": null, + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + 
"object_position": null, + "object_fit": null, + "grid_auto_columns": null, + "margin": null, + "display": null, + "left": null + } + } + } + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "rR4_BAUYs3Mb" + }, + "source": [ + "![image.png]()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "i7XbLCXGkll9" + }, + "source": [ + "# The Boring Model\n", + "Replicate a bug you experience, using this model.\n", + "\n", + "[Remember! we're always available for support on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2LODD6w9ixlT" + }, + "source": [ + "---\n", + "## Setup env" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "zK7-Gg69kMnG" + }, + "source": [ + "%%capture\n", + "! pip install pytorch-lightning --upgrade" + ], + "execution_count": 1, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WvuSN5jEbY8P" + }, + "source": [ + "---\n", + "## Deps" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "w4_TYnt_keJi" + }, + "source": [ + "import os\n", + "\n", + "import torch\n", + "from torch.utils.data import DataLoader, Dataset\n", + "\n", + "from pytorch_lightning import LightningModule, Trainer" + ], + "execution_count": 2, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XrJDukwPtUnS" + }, + "source": [ + "---\n", + "## Data\n", + "Random data is best for debugging. 
If you needs special tensor shapes or batch compositions or dataloaders, modify as needed" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "hvgTiaZpkvwS" + }, + "source": [ + "class RandomDataset(Dataset):\n", + " def __init__(self, size, num_samples):\n", + " self.len = num_samples\n", + " self.data = torch.randn(num_samples, size)\n", + "\n", + " def __getitem__(self, index):\n", + " return self.data[index]\n", + "\n", + " def __len__(self):\n", + " return self.len\n" + ], + "execution_count": 3, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "sxVlWjGhl02D" + }, + "source": [ + "num_samples = 10000" + ], + "execution_count": 4, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "V7ELesz1kVQo" + }, + "source": [ + "class BoringModel(LightningModule):\n", + " def __init__(self):\n", + " super().__init__()\n", + " self.layer = torch.nn.Linear(32, 2)\n", + "\n", + " def forward(self, x):\n", + " return self.layer(x)\n", + "\n", + " def training_step(self, batch, batch_idx):\n", + " loss = self(batch).sum()\n", + " self.log(\"train_loss\", loss)\n", + " return {\"loss\": loss}\n", + "\n", + " def validation_step(self, batch, batch_idx):\n", + " loss = self(batch).sum()\n", + " self.log(\"valid_loss\", loss)\n", + "\n", + " def test_step(self, batch, batch_idx):\n", + " loss = self(batch).sum()\n", + " self.log(\"test_loss\", loss)\n", + "\n", + " def configure_optimizers(self):\n", + " return torch.optim.SGD(self.layer.parameters(), lr=0.1)" + ], + "execution_count": 5, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ubvW3LGSupmt" + }, + "source": [ + "---\n", + "## Define the test\n", + "NOTE: in colab, set progress_bar_refresh_rate high or the screen will freeze because of the rapid tqdm update speed." 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "4Dk6Ykv8lI7X" + }, + "source": [ + "def run():\n", + " train_data = DataLoader(RandomDataset(32, 64), batch_size=2)\n", + " val_data = DataLoader(RandomDataset(32, 64), batch_size=2)\n", + " test_data = DataLoader(RandomDataset(32, 64), batch_size=2)\n", + "\n", + " model = BoringModel()\n", + " trainer = Trainer(\n", + " default_root_dir=os.getcwd(),\n", + " limit_train_batches=1,\n", + " limit_val_batches=1,\n", + " num_sanity_val_steps=0,\n", + " max_epochs=1,\n", + " enable_model_summary=False,\n", + " )\n", + " trainer.fit(model, train_dataloaders=train_data, val_dataloaders=val_data)\n", + " trainer.test(model, dataloaders=test_data)" + ], + "execution_count": 6, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4dPfTZVgmgxz" + }, + "source": [ + "---\n", + "## Run Test" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "AAtq1hwSmjKe", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 272, + "referenced_widgets": [ + "d79c1628eded487a974da18a2ea1f98b", + "02695b143b764932ba8d0c08a872987e", + "28eb6a3218f64f26abcdff756ffda3ad", + "02cfffd590014c3cbc44ab06c69f9181", + "0d7c50e36cb84f01a57a9d7d8b913393", + "6ba2782883ae424dbfc8868224d95da9", + "baa4aacd0da64cf291fb31c000724573", + "7dad3d2feced492a999fb6c91186be50", + "ea702a091eb642f7bdda81aa55db8c26", + "4802a47c6dfb439c83d8b860dce42006", + "68c87e6a7fcf4e4eab98a941c7c3e867", + "e6cbe583c2e14986b4faeb27e31f73e1", + "672dd78899f944cea7e57f388f3ecb31", + "cd61dda59d104e0a8a8aa9bfc1e55c24", + "1cd72d82332941a6929f88fad5173096", + "92a38638060c4ed5b6d44a2078667e53", + "bdc9b06391ee47478efd58cc91ca87ac", + "ee80657d62c6452d9e9ac199157cdf2a", + "eb16b87bcb8d4ca6a83e8b44ea2d1311", + "2a6327dd568241e3acbb6aec1926bd80", + "a45aba8517e14654850453159780b54a", + "7fb167222e7143b789b7f40af7cb39dd", + "abe1c0c4dac94e0e9b894bb69c3ec450", + "23763e19d40d4020b3342a47366e2e19", + "0b7b7da6a6134f0fb26a05adc062ee6f", 
+ "9941635d9d694ba7bce0c7a14c500e5e", + "c7f1407ba92f4dc6ba34bd9cf73fea69", + "86f2e0a558cc419e84ed9192ccd3d1b6", + "141a9c35ade14d9e8645b2c108ab4d66", + "833bb79bb1214a3a88795f41b9375690", + "51b32955ad544803b1d78f07bc685569", + "a6b2764a5fa9444a9d77e8d74c67ef47", + "3f0c08c03e284ebb905dae8aca72fffc" + ] + }, + "outputId": "59e8bcf2-a944-46fc-a771-e7cbbbe4727d" + }, + "source": [ + "run()" + ], + "execution_count": 7, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "GPU available: True, used: False\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py:1567: UserWarning: GPU available but not used. Set the gpus flag in your trainer `Trainer(gpus=1)` or script `--gpus=1`.\n", + " \"GPU available but not used. Set the gpus flag in your trainer `Trainer(gpus=1)` or script `--gpus=1`.\"\n", + "/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/data_loading.py:395: UserWarning: The number of training samples (1) is smaller than the logging interval Trainer(log_every_n_steps=50). 
Set a lower value for log_every_n_steps if you want to see logs for the training epoch.\n", + " f\"The number of training samples ({self.num_training_batches}) is smaller than the logging interval\"\n" + ] + }, + { + "output_type": "display_data", + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d79c1628eded487a974da18a2ea1f98b", + "version_minor": 0, + "version_major": 2 + }, + "text/plain": [ + "Training: 0it [00:00, ?it/s]" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e6cbe583c2e14986b4faeb27e31f73e1", + "version_minor": 0, + "version_major": 2 + }, + "text/plain": [ + "Validating: 0it [00:00, ?it/s]" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "abe1c0c4dac94e0e9b894bb69c3ec450", + "version_minor": 0, + "version_major": 2 + }, + "text/plain": [ + "Testing: 0it [00:00, ?it/s]" + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--------------------------------------------------------------------------------\n", + "DATALOADER:0 TEST RESULTS\n", + "{'test_loss': -1.676544427871704}\n", + "--------------------------------------------------------------------------------\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Flyi--SpvsJN" + }, + "source": [ + "---\n", + "## Environment\n", + "Run this to get the environment details" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "0-yvGFRoaDSi" + }, + "source": [ + "%%capture\n", + "! 
wget https://raw.githubusercontent.com/PyTorchLightning/pytorch-lightning/master/requirements/collect_env_details.py" + ], + "execution_count": 8, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "quj4LUDgmFvj", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "bb7a5f74-d52c-4927-b12a-49589aed7dcb" + }, + "source": [ + "! python collect_env_details.py" + ], + "execution_count": 9, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "* CUDA:\n", + "\t- GPU:\n", + "\t\t- Tesla K80\n", + "\t- available: True\n", + "\t- version: 11.1\n", + "* Packages:\n", + "\t- numpy: 1.19.5\n", + "\t- pyTorch_debug: False\n", + "\t- pyTorch_version: 1.10.0+cu111\n", + "\t- pytorch-lightning: 1.5.1\n", + "\t- tqdm: 4.62.3\n", + "* System:\n", + "\t- OS: Linux\n", + "\t- architecture:\n", + "\t\t- 64bit\n", + "\t\t- \n", + "\t- processor: x86_64\n", + "\t- python: 3.7.12\n", + "\t- version: #1 SMP Sat Jun 5 09:50:34 PDT 2021\n" + ] + } + ] + } + ] +} diff --git a/pl_examples/bug_report_model.py b/pl_examples/bug_report/bug_report_model.py similarity index 100% rename from pl_examples/bug_report_model.py rename to pl_examples/bug_report/bug_report_model.py diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index 28ac1f3f2aefc..0d8b80c44af36 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -22,12 +22,11 @@ import torch from torch.utils.data.dataloader import _MultiProcessingDataLoaderIter, DataLoader -from pl_examples.bug_report_model import RandomDataset from pytorch_lightning import LightningModule, Trainer from pytorch_lightning.callbacks import Callback, ModelCheckpoint from pytorch_lightning.loops import Loop, TrainingBatchLoop from pytorch_lightning.trainer.progress import BaseProgress -from tests.helpers import BoringModel +from tests.helpers import BoringModel, RandomDataset from tests.helpers.runif import RunIf From d249060c74f3fabe04f5f127a03fa4d6be4aa56b Mon Sep 
17 00:00:00 2001 From: Adam Reeve Date: Sat, 20 Nov 2021 06:26:50 +1300 Subject: [PATCH 038/123] Fix misleading ModelCheckpoint documentation on every_n_epochs parameter (#10421) --- pytorch_lightning/callbacks/model_checkpoint.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index 6b70f6af171c5..f9fbf1c93d380 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -114,8 +114,14 @@ class ModelCheckpoint(Callback): guaranteed to execute at the exact time specified, but should be close. This must be mutually exclusive with ``every_n_train_steps`` and ``every_n_epochs``. every_n_epochs: Number of epochs between checkpoints. - If ``every_n_epochs == None or every_n_epochs == 0``, we skip saving when the epoch ends. - To disable, set ``every_n_epochs = 0``. This value must be ``None`` or non-negative. + This value must be ``None`` or non-negative. + To disable saving after each epoch, set ``every_n_epochs = 0``. + If all of ``every_n_epochs``, ``every_n_train_steps`` and + ``train_time_interval`` are ``None``, we save a checkpoint at the end of every epoch + (equivalent to ``every_n_epochs = 1``). + If ``every_n_epochs == None`` and either ``every_n_train_steps != None`` or ``train_time_interval != None``, + saving at the end of each epoch is disabled + (equivalent to ``every_n_epochs = 0``). This must be mutually exclusive with ``every_n_train_steps`` and ``train_time_interval``. 
Setting both ``ModelCheckpoint(..., every_n_epochs=V, save_on_train_epoch_end=False)`` and ``Trainer(max_epochs=N, check_val_every_n_epoch=M)`` From ebae4203c541601acc649ac3a4ee2af04fb30fa6 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Mon, 22 Nov 2021 16:49:46 +0530 Subject: [PATCH 039/123] update bug_report model links and notebook (#10665) --- .github/ISSUE_TEMPLATE/bug_report.md | 10 +- .../advanced/fault_tolerant_training.rst | 2 +- pl_examples/bug_report/The_BoringModel.ipynb | 1420 ----------------- pl_examples/bug_report/bug_report_model.ipynb | 267 ++++ pl_examples/bug_report/bug_report_model.py | 1 + 5 files changed, 274 insertions(+), 1426 deletions(-) delete mode 100644 pl_examples/bug_report/The_BoringModel.ipynb create mode 100644 pl_examples/bug_report/bug_report_model.ipynb diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 44bcec54517d3..7719cf2812558 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -16,11 +16,11 @@ assignees: '' Please reproduce using the BoringModel! You can use the following Colab link: -https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/pl_examples/bug_report/The_BoringModel.ipynb +https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/pl_examples/bug_report/bug_report_model.ipynb IMPORTANT: has to be public. or this simple template: -https://github.com/PyTorchLightning/pytorch-lightning/blob/master/pl_examples/bug_report_model.py +https://github.com/PyTorchLightning/pytorch-lightning/blob/master/pl_examples/bug_report/bug_report_model.py If you could not reproduce using the BoringModel and still think there's a bug, please post here but remember, bugs with code are fixed faster! @@ -46,9 +46,9 @@ python collect_env_details.py You can also fill out the list below manually. 
--> -- PyTorch Lightning Version (e.g., 1.3.0): -- PyTorch Version (e.g., 1.8) -- Python version: +- PyTorch Lightning Version (e.g., 1.5.0): +- PyTorch Version (e.g., 1.10): +- Python version (e.g., 3.9): - OS (e.g., Linux): - CUDA/cuDNN version: - GPU models and configuration: diff --git a/docs/source/advanced/fault_tolerant_training.rst b/docs/source/advanced/fault_tolerant_training.rst index e4a61b27e294d..63a3ce41ee8b3 100644 --- a/docs/source/advanced/fault_tolerant_training.rst +++ b/docs/source/advanced/fault_tolerant_training.rst @@ -134,7 +134,7 @@ Performance Impacts ------------------- Fault-tolerant Training was tested on common and worst-case scenarios in order to measure the impact of the internal state tracking on the total training time. -On tiny models like the `BoringModel and RandomDataset `_ +On tiny models like the `BoringModel and RandomDataset `_ which has virtually no data loading and processing overhead, we noticed up to 50% longer training time with fault tolerance enabled. In this worst-case scenario, fault-tolerant adds an overhead that is noticeable in comparison to the compute time for dataloading itself. However, for more realistic training workloads where data loading and preprocessing is more expensive, the constant overhead that fault tolerance adds becomes less noticeable or not noticeable at all. 
diff --git a/pl_examples/bug_report/The_BoringModel.ipynb b/pl_examples/bug_report/The_BoringModel.ipynb deleted file mode 100644 index 9b061c4283cbf..0000000000000 --- a/pl_examples/bug_report/The_BoringModel.ipynb +++ /dev/null @@ -1,1420 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "The BoringModel.ipynb", - "provenance": [], - "collapsed_sections": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "accelerator": "GPU", - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "d79c1628eded487a974da18a2ea1f98b": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_view_name": "HBoxView", - "_dom_classes": [], - "_model_name": "HBoxModel", - "_view_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_view_count": null, - "_view_module_version": "1.5.0", - "box_style": "", - "layout": "IPY_MODEL_02695b143b764932ba8d0c08a872987e", - "_model_module": "@jupyter-widgets/controls", - "children": [ - "IPY_MODEL_28eb6a3218f64f26abcdff756ffda3ad", - "IPY_MODEL_02cfffd590014c3cbc44ab06c69f9181", - "IPY_MODEL_0d7c50e36cb84f01a57a9d7d8b913393" - ] - } - }, - "02695b143b764932ba8d0c08a872987e": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_view_name": "LayoutView", - "grid_template_rows": null, - "right": null, - "justify_content": null, - "_view_module": "@jupyter-widgets/base", - "overflow": null, - "_model_module_version": "1.2.0", - "_view_count": null, - "flex_flow": "row wrap", - "width": "100%", - "min_width": null, - "border": null, - "align_items": null, - "bottom": null, - "_model_module": "@jupyter-widgets/base", - "top": null, - "grid_column": null, - "overflow_y": null, - "overflow_x": null, - "grid_auto_flow": null, - "grid_area": null, - "grid_template_columns": null, - "flex": null, - 
"_model_name": "LayoutModel", - "justify_items": null, - "grid_row": null, - "max_height": null, - "align_content": null, - "visibility": null, - "align_self": null, - "height": null, - "min_height": null, - "padding": null, - "grid_auto_rows": null, - "grid_gap": null, - "max_width": null, - "order": null, - "_view_module_version": "1.2.0", - "grid_template_areas": null, - "object_position": null, - "object_fit": null, - "grid_auto_columns": null, - "margin": null, - "display": "inline-flex", - "left": null - } - }, - "28eb6a3218f64f26abcdff756ffda3ad": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_view_name": "HTMLView", - "style": "IPY_MODEL_6ba2782883ae424dbfc8868224d95da9", - "_dom_classes": [], - "description": "", - "_model_name": "HTMLModel", - "placeholder": "​", - "_view_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "value": "Epoch 0: 100%", - "_view_count": null, - "_view_module_version": "1.5.0", - "description_tooltip": null, - "_model_module": "@jupyter-widgets/controls", - "layout": "IPY_MODEL_baa4aacd0da64cf291fb31c000724573" - } - }, - "02cfffd590014c3cbc44ab06c69f9181": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_view_name": "ProgressView", - "style": "IPY_MODEL_7dad3d2feced492a999fb6c91186be50", - "_dom_classes": [], - "description": "", - "_model_name": "FloatProgressModel", - "bar_style": "success", - "max": 2, - "_view_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "value": 2, - "_view_count": null, - "_view_module_version": "1.5.0", - "orientation": "horizontal", - "min": 0, - "description_tooltip": null, - "_model_module": "@jupyter-widgets/controls", - "layout": "IPY_MODEL_ea702a091eb642f7bdda81aa55db8c26" - } - }, - "0d7c50e36cb84f01a57a9d7d8b913393": { - "model_module": "@jupyter-widgets/controls", - 
"model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_view_name": "HTMLView", - "style": "IPY_MODEL_4802a47c6dfb439c83d8b860dce42006", - "_dom_classes": [], - "description": "", - "_model_name": "HTMLModel", - "placeholder": "​", - "_view_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "value": " 2/2 [00:00<00:00, 9.45it/s, loss=-0.618, v_num=0]", - "_view_count": null, - "_view_module_version": "1.5.0", - "description_tooltip": null, - "_model_module": "@jupyter-widgets/controls", - "layout": "IPY_MODEL_68c87e6a7fcf4e4eab98a941c7c3e867" - } - }, - "6ba2782883ae424dbfc8868224d95da9": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_view_name": "StyleView", - "_model_name": "DescriptionStyleModel", - "description_width": "", - "_view_module": "@jupyter-widgets/base", - "_model_module_version": "1.5.0", - "_view_count": null, - "_view_module_version": "1.2.0", - "_model_module": "@jupyter-widgets/controls" - } - }, - "baa4aacd0da64cf291fb31c000724573": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_view_name": "LayoutView", - "grid_template_rows": null, - "right": null, - "justify_content": null, - "_view_module": "@jupyter-widgets/base", - "overflow": null, - "_model_module_version": "1.2.0", - "_view_count": null, - "flex_flow": null, - "width": null, - "min_width": null, - "border": null, - "align_items": null, - "bottom": null, - "_model_module": "@jupyter-widgets/base", - "top": null, - "grid_column": null, - "overflow_y": null, - "overflow_x": null, - "grid_auto_flow": null, - "grid_area": null, - "grid_template_columns": null, - "flex": null, - "_model_name": "LayoutModel", - "justify_items": null, - "grid_row": null, - "max_height": null, - "align_content": null, - "visibility": null, - "align_self": null, - "height": null, - 
"min_height": null, - "padding": null, - "grid_auto_rows": null, - "grid_gap": null, - "max_width": null, - "order": null, - "_view_module_version": "1.2.0", - "grid_template_areas": null, - "object_position": null, - "object_fit": null, - "grid_auto_columns": null, - "margin": null, - "display": null, - "left": null - } - }, - "7dad3d2feced492a999fb6c91186be50": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_view_name": "StyleView", - "_model_name": "ProgressStyleModel", - "description_width": "", - "_view_module": "@jupyter-widgets/base", - "_model_module_version": "1.5.0", - "_view_count": null, - "_view_module_version": "1.2.0", - "bar_color": null, - "_model_module": "@jupyter-widgets/controls" - } - }, - "ea702a091eb642f7bdda81aa55db8c26": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_view_name": "LayoutView", - "grid_template_rows": null, - "right": null, - "justify_content": null, - "_view_module": "@jupyter-widgets/base", - "overflow": null, - "_model_module_version": "1.2.0", - "_view_count": null, - "flex_flow": null, - "width": null, - "min_width": null, - "border": null, - "align_items": null, - "bottom": null, - "_model_module": "@jupyter-widgets/base", - "top": null, - "grid_column": null, - "overflow_y": null, - "overflow_x": null, - "grid_auto_flow": null, - "grid_area": null, - "grid_template_columns": null, - "flex": "2", - "_model_name": "LayoutModel", - "justify_items": null, - "grid_row": null, - "max_height": null, - "align_content": null, - "visibility": null, - "align_self": null, - "height": null, - "min_height": null, - "padding": null, - "grid_auto_rows": null, - "grid_gap": null, - "max_width": null, - "order": null, - "_view_module_version": "1.2.0", - "grid_template_areas": null, - "object_position": null, - "object_fit": null, - "grid_auto_columns": null, - 
"margin": null, - "display": null, - "left": null - } - }, - "4802a47c6dfb439c83d8b860dce42006": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_view_name": "StyleView", - "_model_name": "DescriptionStyleModel", - "description_width": "", - "_view_module": "@jupyter-widgets/base", - "_model_module_version": "1.5.0", - "_view_count": null, - "_view_module_version": "1.2.0", - "_model_module": "@jupyter-widgets/controls" - } - }, - "68c87e6a7fcf4e4eab98a941c7c3e867": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_view_name": "LayoutView", - "grid_template_rows": null, - "right": null, - "justify_content": null, - "_view_module": "@jupyter-widgets/base", - "overflow": null, - "_model_module_version": "1.2.0", - "_view_count": null, - "flex_flow": null, - "width": null, - "min_width": null, - "border": null, - "align_items": null, - "bottom": null, - "_model_module": "@jupyter-widgets/base", - "top": null, - "grid_column": null, - "overflow_y": null, - "overflow_x": null, - "grid_auto_flow": null, - "grid_area": null, - "grid_template_columns": null, - "flex": null, - "_model_name": "LayoutModel", - "justify_items": null, - "grid_row": null, - "max_height": null, - "align_content": null, - "visibility": null, - "align_self": null, - "height": null, - "min_height": null, - "padding": null, - "grid_auto_rows": null, - "grid_gap": null, - "max_width": null, - "order": null, - "_view_module_version": "1.2.0", - "grid_template_areas": null, - "object_position": null, - "object_fit": null, - "grid_auto_columns": null, - "margin": null, - "display": null, - "left": null - } - }, - "e6cbe583c2e14986b4faeb27e31f73e1": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_view_name": "HBoxView", - "_dom_classes": [], - 
"_model_name": "HBoxModel", - "_view_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_view_count": null, - "_view_module_version": "1.5.0", - "box_style": "", - "layout": "IPY_MODEL_672dd78899f944cea7e57f388f3ecb31", - "_model_module": "@jupyter-widgets/controls", - "children": [ - "IPY_MODEL_cd61dda59d104e0a8a8aa9bfc1e55c24", - "IPY_MODEL_1cd72d82332941a6929f88fad5173096", - "IPY_MODEL_92a38638060c4ed5b6d44a2078667e53" - ] - } - }, - "672dd78899f944cea7e57f388f3ecb31": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_view_name": "LayoutView", - "grid_template_rows": null, - "right": null, - "justify_content": null, - "_view_module": "@jupyter-widgets/base", - "overflow": null, - "_model_module_version": "1.2.0", - "_view_count": null, - "flex_flow": "row wrap", - "width": "100%", - "min_width": null, - "border": null, - "align_items": null, - "bottom": null, - "_model_module": "@jupyter-widgets/base", - "top": null, - "grid_column": null, - "overflow_y": null, - "overflow_x": null, - "grid_auto_flow": null, - "grid_area": null, - "grid_template_columns": null, - "flex": null, - "_model_name": "LayoutModel", - "justify_items": null, - "grid_row": null, - "max_height": null, - "align_content": null, - "visibility": null, - "align_self": null, - "height": null, - "min_height": null, - "padding": null, - "grid_auto_rows": null, - "grid_gap": null, - "max_width": null, - "order": null, - "_view_module_version": "1.2.0", - "grid_template_areas": null, - "object_position": null, - "object_fit": null, - "grid_auto_columns": null, - "margin": null, - "display": "inline-flex", - "left": null - } - }, - "cd61dda59d104e0a8a8aa9bfc1e55c24": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_view_name": "HTMLView", - "style": "IPY_MODEL_bdc9b06391ee47478efd58cc91ca87ac", - "_dom_classes": [], - 
"description": "", - "_model_name": "HTMLModel", - "placeholder": "​", - "_view_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "value": "Validating: 0%", - "_view_count": null, - "_view_module_version": "1.5.0", - "description_tooltip": null, - "_model_module": "@jupyter-widgets/controls", - "layout": "IPY_MODEL_ee80657d62c6452d9e9ac199157cdf2a" - } - }, - "1cd72d82332941a6929f88fad5173096": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_view_name": "ProgressView", - "style": "IPY_MODEL_eb16b87bcb8d4ca6a83e8b44ea2d1311", - "_dom_classes": [], - "description": "", - "_model_name": "FloatProgressModel", - "bar_style": "", - "max": 1, - "_view_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "value": 1, - "_view_count": null, - "_view_module_version": "1.5.0", - "orientation": "horizontal", - "min": 0, - "description_tooltip": null, - "_model_module": "@jupyter-widgets/controls", - "layout": "IPY_MODEL_2a6327dd568241e3acbb6aec1926bd80" - } - }, - "92a38638060c4ed5b6d44a2078667e53": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_view_name": "HTMLView", - "style": "IPY_MODEL_a45aba8517e14654850453159780b54a", - "_dom_classes": [], - "description": "", - "_model_name": "HTMLModel", - "placeholder": "​", - "_view_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "value": " 0/1 [00:00<?, ?it/s]", - "_view_count": null, - "_view_module_version": "1.5.0", - "description_tooltip": null, - "_model_module": "@jupyter-widgets/controls", - "layout": "IPY_MODEL_7fb167222e7143b789b7f40af7cb39dd" - } - }, - "bdc9b06391ee47478efd58cc91ca87ac": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_view_name": "StyleView", - "_model_name": 
"DescriptionStyleModel", - "description_width": "", - "_view_module": "@jupyter-widgets/base", - "_model_module_version": "1.5.0", - "_view_count": null, - "_view_module_version": "1.2.0", - "_model_module": "@jupyter-widgets/controls" - } - }, - "ee80657d62c6452d9e9ac199157cdf2a": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_view_name": "LayoutView", - "grid_template_rows": null, - "right": null, - "justify_content": null, - "_view_module": "@jupyter-widgets/base", - "overflow": null, - "_model_module_version": "1.2.0", - "_view_count": null, - "flex_flow": null, - "width": null, - "min_width": null, - "border": null, - "align_items": null, - "bottom": null, - "_model_module": "@jupyter-widgets/base", - "top": null, - "grid_column": null, - "overflow_y": null, - "overflow_x": null, - "grid_auto_flow": null, - "grid_area": null, - "grid_template_columns": null, - "flex": null, - "_model_name": "LayoutModel", - "justify_items": null, - "grid_row": null, - "max_height": null, - "align_content": null, - "visibility": null, - "align_self": null, - "height": null, - "min_height": null, - "padding": null, - "grid_auto_rows": null, - "grid_gap": null, - "max_width": null, - "order": null, - "_view_module_version": "1.2.0", - "grid_template_areas": null, - "object_position": null, - "object_fit": null, - "grid_auto_columns": null, - "margin": null, - "display": null, - "left": null - } - }, - "eb16b87bcb8d4ca6a83e8b44ea2d1311": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_view_name": "StyleView", - "_model_name": "ProgressStyleModel", - "description_width": "", - "_view_module": "@jupyter-widgets/base", - "_model_module_version": "1.5.0", - "_view_count": null, - "_view_module_version": "1.2.0", - "bar_color": null, - "_model_module": "@jupyter-widgets/controls" - } - }, - 
"2a6327dd568241e3acbb6aec1926bd80": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_view_name": "LayoutView", - "grid_template_rows": null, - "right": null, - "justify_content": null, - "_view_module": "@jupyter-widgets/base", - "overflow": null, - "_model_module_version": "1.2.0", - "_view_count": null, - "flex_flow": null, - "width": null, - "min_width": null, - "border": null, - "align_items": null, - "bottom": null, - "_model_module": "@jupyter-widgets/base", - "top": null, - "grid_column": null, - "overflow_y": null, - "overflow_x": null, - "grid_auto_flow": null, - "grid_area": null, - "grid_template_columns": null, - "flex": "2", - "_model_name": "LayoutModel", - "justify_items": null, - "grid_row": null, - "max_height": null, - "align_content": null, - "visibility": null, - "align_self": null, - "height": null, - "min_height": null, - "padding": null, - "grid_auto_rows": null, - "grid_gap": null, - "max_width": null, - "order": null, - "_view_module_version": "1.2.0", - "grid_template_areas": null, - "object_position": null, - "object_fit": null, - "grid_auto_columns": null, - "margin": null, - "display": null, - "left": null - } - }, - "a45aba8517e14654850453159780b54a": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_view_name": "StyleView", - "_model_name": "DescriptionStyleModel", - "description_width": "", - "_view_module": "@jupyter-widgets/base", - "_model_module_version": "1.5.0", - "_view_count": null, - "_view_module_version": "1.2.0", - "_model_module": "@jupyter-widgets/controls" - } - }, - "7fb167222e7143b789b7f40af7cb39dd": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_view_name": "LayoutView", - "grid_template_rows": null, - "right": null, - "justify_content": null, - "_view_module": 
"@jupyter-widgets/base", - "overflow": null, - "_model_module_version": "1.2.0", - "_view_count": null, - "flex_flow": null, - "width": null, - "min_width": null, - "border": null, - "align_items": null, - "bottom": null, - "_model_module": "@jupyter-widgets/base", - "top": null, - "grid_column": null, - "overflow_y": null, - "overflow_x": null, - "grid_auto_flow": null, - "grid_area": null, - "grid_template_columns": null, - "flex": null, - "_model_name": "LayoutModel", - "justify_items": null, - "grid_row": null, - "max_height": null, - "align_content": null, - "visibility": null, - "align_self": null, - "height": null, - "min_height": null, - "padding": null, - "grid_auto_rows": null, - "grid_gap": null, - "max_width": null, - "order": null, - "_view_module_version": "1.2.0", - "grid_template_areas": null, - "object_position": null, - "object_fit": null, - "grid_auto_columns": null, - "margin": null, - "display": null, - "left": null - } - }, - "abe1c0c4dac94e0e9b894bb69c3ec450": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_view_name": "HBoxView", - "_dom_classes": [], - "_model_name": "HBoxModel", - "_view_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_view_count": null, - "_view_module_version": "1.5.0", - "box_style": "", - "layout": "IPY_MODEL_23763e19d40d4020b3342a47366e2e19", - "_model_module": "@jupyter-widgets/controls", - "children": [ - "IPY_MODEL_0b7b7da6a6134f0fb26a05adc062ee6f", - "IPY_MODEL_9941635d9d694ba7bce0c7a14c500e5e", - "IPY_MODEL_c7f1407ba92f4dc6ba34bd9cf73fea69" - ] - } - }, - "23763e19d40d4020b3342a47366e2e19": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_view_name": "LayoutView", - "grid_template_rows": null, - "right": null, - "justify_content": null, - "_view_module": "@jupyter-widgets/base", - "overflow": null, - "_model_module_version": 
"1.2.0", - "_view_count": null, - "flex_flow": "row wrap", - "width": "100%", - "min_width": null, - "border": null, - "align_items": null, - "bottom": null, - "_model_module": "@jupyter-widgets/base", - "top": null, - "grid_column": null, - "overflow_y": null, - "overflow_x": null, - "grid_auto_flow": null, - "grid_area": null, - "grid_template_columns": null, - "flex": null, - "_model_name": "LayoutModel", - "justify_items": null, - "grid_row": null, - "max_height": null, - "align_content": null, - "visibility": null, - "align_self": null, - "height": null, - "min_height": null, - "padding": null, - "grid_auto_rows": null, - "grid_gap": null, - "max_width": null, - "order": null, - "_view_module_version": "1.2.0", - "grid_template_areas": null, - "object_position": null, - "object_fit": null, - "grid_auto_columns": null, - "margin": null, - "display": "inline-flex", - "left": null - } - }, - "0b7b7da6a6134f0fb26a05adc062ee6f": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_view_name": "HTMLView", - "style": "IPY_MODEL_86f2e0a558cc419e84ed9192ccd3d1b6", - "_dom_classes": [], - "description": "", - "_model_name": "HTMLModel", - "placeholder": "​", - "_view_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "value": "Testing: 100%", - "_view_count": null, - "_view_module_version": "1.5.0", - "description_tooltip": null, - "_model_module": "@jupyter-widgets/controls", - "layout": "IPY_MODEL_141a9c35ade14d9e8645b2c108ab4d66" - } - }, - "9941635d9d694ba7bce0c7a14c500e5e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_view_name": "ProgressView", - "style": "IPY_MODEL_833bb79bb1214a3a88795f41b9375690", - "_dom_classes": [], - "description": "", - "_model_name": "FloatProgressModel", - "bar_style": "success", - "max": 1, - "_view_module": "@jupyter-widgets/controls", - 
"_model_module_version": "1.5.0", - "value": 1, - "_view_count": null, - "_view_module_version": "1.5.0", - "orientation": "horizontal", - "min": 0, - "description_tooltip": null, - "_model_module": "@jupyter-widgets/controls", - "layout": "IPY_MODEL_51b32955ad544803b1d78f07bc685569" - } - }, - "c7f1407ba92f4dc6ba34bd9cf73fea69": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_view_name": "HTMLView", - "style": "IPY_MODEL_a6b2764a5fa9444a9d77e8d74c67ef47", - "_dom_classes": [], - "description": "", - "_model_name": "HTMLModel", - "placeholder": "​", - "_view_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "value": " 32/32 [00:00<00:00, 174.23it/s]", - "_view_count": null, - "_view_module_version": "1.5.0", - "description_tooltip": null, - "_model_module": "@jupyter-widgets/controls", - "layout": "IPY_MODEL_3f0c08c03e284ebb905dae8aca72fffc" - } - }, - "86f2e0a558cc419e84ed9192ccd3d1b6": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_view_name": "StyleView", - "_model_name": "DescriptionStyleModel", - "description_width": "", - "_view_module": "@jupyter-widgets/base", - "_model_module_version": "1.5.0", - "_view_count": null, - "_view_module_version": "1.2.0", - "_model_module": "@jupyter-widgets/controls" - } - }, - "141a9c35ade14d9e8645b2c108ab4d66": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_view_name": "LayoutView", - "grid_template_rows": null, - "right": null, - "justify_content": null, - "_view_module": "@jupyter-widgets/base", - "overflow": null, - "_model_module_version": "1.2.0", - "_view_count": null, - "flex_flow": null, - "width": null, - "min_width": null, - "border": null, - "align_items": null, - "bottom": null, - "_model_module": "@jupyter-widgets/base", - "top": 
null, - "grid_column": null, - "overflow_y": null, - "overflow_x": null, - "grid_auto_flow": null, - "grid_area": null, - "grid_template_columns": null, - "flex": null, - "_model_name": "LayoutModel", - "justify_items": null, - "grid_row": null, - "max_height": null, - "align_content": null, - "visibility": null, - "align_self": null, - "height": null, - "min_height": null, - "padding": null, - "grid_auto_rows": null, - "grid_gap": null, - "max_width": null, - "order": null, - "_view_module_version": "1.2.0", - "grid_template_areas": null, - "object_position": null, - "object_fit": null, - "grid_auto_columns": null, - "margin": null, - "display": null, - "left": null - } - }, - "833bb79bb1214a3a88795f41b9375690": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_view_name": "StyleView", - "_model_name": "ProgressStyleModel", - "description_width": "", - "_view_module": "@jupyter-widgets/base", - "_model_module_version": "1.5.0", - "_view_count": null, - "_view_module_version": "1.2.0", - "bar_color": null, - "_model_module": "@jupyter-widgets/controls" - } - }, - "51b32955ad544803b1d78f07bc685569": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_view_name": "LayoutView", - "grid_template_rows": null, - "right": null, - "justify_content": null, - "_view_module": "@jupyter-widgets/base", - "overflow": null, - "_model_module_version": "1.2.0", - "_view_count": null, - "flex_flow": null, - "width": null, - "min_width": null, - "border": null, - "align_items": null, - "bottom": null, - "_model_module": "@jupyter-widgets/base", - "top": null, - "grid_column": null, - "overflow_y": null, - "overflow_x": null, - "grid_auto_flow": null, - "grid_area": null, - "grid_template_columns": null, - "flex": "2", - "_model_name": "LayoutModel", - "justify_items": null, - "grid_row": null, - "max_height": null, - 
"align_content": null, - "visibility": null, - "align_self": null, - "height": null, - "min_height": null, - "padding": null, - "grid_auto_rows": null, - "grid_gap": null, - "max_width": null, - "order": null, - "_view_module_version": "1.2.0", - "grid_template_areas": null, - "object_position": null, - "object_fit": null, - "grid_auto_columns": null, - "margin": null, - "display": null, - "left": null - } - }, - "a6b2764a5fa9444a9d77e8d74c67ef47": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_view_name": "StyleView", - "_model_name": "DescriptionStyleModel", - "description_width": "", - "_view_module": "@jupyter-widgets/base", - "_model_module_version": "1.5.0", - "_view_count": null, - "_view_module_version": "1.2.0", - "_model_module": "@jupyter-widgets/controls" - } - }, - "3f0c08c03e284ebb905dae8aca72fffc": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_view_name": "LayoutView", - "grid_template_rows": null, - "right": null, - "justify_content": null, - "_view_module": "@jupyter-widgets/base", - "overflow": null, - "_model_module_version": "1.2.0", - "_view_count": null, - "flex_flow": null, - "width": null, - "min_width": null, - "border": null, - "align_items": null, - "bottom": null, - "_model_module": "@jupyter-widgets/base", - "top": null, - "grid_column": null, - "overflow_y": null, - "overflow_x": null, - "grid_auto_flow": null, - "grid_area": null, - "grid_template_columns": null, - "flex": null, - "_model_name": "LayoutModel", - "justify_items": null, - "grid_row": null, - "max_height": null, - "align_content": null, - "visibility": null, - "align_self": null, - "height": null, - "min_height": null, - "padding": null, - "grid_auto_rows": null, - "grid_gap": null, - "max_width": null, - "order": null, - "_view_module_version": "1.2.0", - "grid_template_areas": null, - 
"object_position": null, - "object_fit": null, - "grid_auto_columns": null, - "margin": null, - "display": null, - "left": null - } - } - } - } - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "rR4_BAUYs3Mb" - }, - "source": [ - "![image.png]()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "i7XbLCXGkll9" - }, - "source": [ - "# The Boring Model\n", - "Replicate a bug you experience, using this model.\n", - "\n", - "[Remember! we're always available for support on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2LODD6w9ixlT" - }, - "source": [ - "---\n", - "## Setup env" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "zK7-Gg69kMnG" - }, - "source": [ - "%%capture\n", - "! pip install pytorch-lightning --upgrade" - ], - "execution_count": 1, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WvuSN5jEbY8P" - }, - "source": [ - "---\n", - "## Deps" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "w4_TYnt_keJi" - }, - "source": [ - "import os\n", - "\n", - "import torch\n", - "from torch.utils.data import DataLoader, Dataset\n", - "\n", - "from pytorch_lightning import LightningModule, Trainer" - ], - "execution_count": 2, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "XrJDukwPtUnS" - }, - "source": [ - "---\n", - "## Data\n", - "Random data is best for debugging. 
If you needs special tensor shapes or batch compositions or dataloaders, modify as needed" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "hvgTiaZpkvwS" - }, - "source": [ - "class RandomDataset(Dataset):\n", - " def __init__(self, size, num_samples):\n", - " self.len = num_samples\n", - " self.data = torch.randn(num_samples, size)\n", - "\n", - " def __getitem__(self, index):\n", - " return self.data[index]\n", - "\n", - " def __len__(self):\n", - " return self.len\n" - ], - "execution_count": 3, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "sxVlWjGhl02D" - }, - "source": [ - "num_samples = 10000" - ], - "execution_count": 4, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "V7ELesz1kVQo" - }, - "source": [ - "class BoringModel(LightningModule):\n", - " def __init__(self):\n", - " super().__init__()\n", - " self.layer = torch.nn.Linear(32, 2)\n", - "\n", - " def forward(self, x):\n", - " return self.layer(x)\n", - "\n", - " def training_step(self, batch, batch_idx):\n", - " loss = self(batch).sum()\n", - " self.log(\"train_loss\", loss)\n", - " return {\"loss\": loss}\n", - "\n", - " def validation_step(self, batch, batch_idx):\n", - " loss = self(batch).sum()\n", - " self.log(\"valid_loss\", loss)\n", - "\n", - " def test_step(self, batch, batch_idx):\n", - " loss = self(batch).sum()\n", - " self.log(\"test_loss\", loss)\n", - "\n", - " def configure_optimizers(self):\n", - " return torch.optim.SGD(self.layer.parameters(), lr=0.1)" - ], - "execution_count": 5, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ubvW3LGSupmt" - }, - "source": [ - "---\n", - "## Define the test\n", - "NOTE: in colab, set progress_bar_refresh_rate high or the screen will freeze because of the rapid tqdm update speed." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "4Dk6Ykv8lI7X" - }, - "source": [ - "def run():\n", - " train_data = DataLoader(RandomDataset(32, 64), batch_size=2)\n", - " val_data = DataLoader(RandomDataset(32, 64), batch_size=2)\n", - " test_data = DataLoader(RandomDataset(32, 64), batch_size=2)\n", - "\n", - " model = BoringModel()\n", - " trainer = Trainer(\n", - " default_root_dir=os.getcwd(),\n", - " limit_train_batches=1,\n", - " limit_val_batches=1,\n", - " num_sanity_val_steps=0,\n", - " max_epochs=1,\n", - " enable_model_summary=False,\n", - " )\n", - " trainer.fit(model, train_dataloaders=train_data, val_dataloaders=val_data)\n", - " trainer.test(model, dataloaders=test_data)" - ], - "execution_count": 6, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4dPfTZVgmgxz" - }, - "source": [ - "---\n", - "## Run Test" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "AAtq1hwSmjKe", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 272, - "referenced_widgets": [ - "d79c1628eded487a974da18a2ea1f98b", - "02695b143b764932ba8d0c08a872987e", - "28eb6a3218f64f26abcdff756ffda3ad", - "02cfffd590014c3cbc44ab06c69f9181", - "0d7c50e36cb84f01a57a9d7d8b913393", - "6ba2782883ae424dbfc8868224d95da9", - "baa4aacd0da64cf291fb31c000724573", - "7dad3d2feced492a999fb6c91186be50", - "ea702a091eb642f7bdda81aa55db8c26", - "4802a47c6dfb439c83d8b860dce42006", - "68c87e6a7fcf4e4eab98a941c7c3e867", - "e6cbe583c2e14986b4faeb27e31f73e1", - "672dd78899f944cea7e57f388f3ecb31", - "cd61dda59d104e0a8a8aa9bfc1e55c24", - "1cd72d82332941a6929f88fad5173096", - "92a38638060c4ed5b6d44a2078667e53", - "bdc9b06391ee47478efd58cc91ca87ac", - "ee80657d62c6452d9e9ac199157cdf2a", - "eb16b87bcb8d4ca6a83e8b44ea2d1311", - "2a6327dd568241e3acbb6aec1926bd80", - "a45aba8517e14654850453159780b54a", - "7fb167222e7143b789b7f40af7cb39dd", - "abe1c0c4dac94e0e9b894bb69c3ec450", - "23763e19d40d4020b3342a47366e2e19", - "0b7b7da6a6134f0fb26a05adc062ee6f", 
- "9941635d9d694ba7bce0c7a14c500e5e", - "c7f1407ba92f4dc6ba34bd9cf73fea69", - "86f2e0a558cc419e84ed9192ccd3d1b6", - "141a9c35ade14d9e8645b2c108ab4d66", - "833bb79bb1214a3a88795f41b9375690", - "51b32955ad544803b1d78f07bc685569", - "a6b2764a5fa9444a9d77e8d74c67ef47", - "3f0c08c03e284ebb905dae8aca72fffc" - ] - }, - "outputId": "59e8bcf2-a944-46fc-a771-e7cbbbe4727d" - }, - "source": [ - "run()" - ], - "execution_count": 7, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "GPU available: True, used: False\n", - "TPU available: False, using: 0 TPU cores\n", - "IPU available: False, using: 0 IPUs\n", - "/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py:1567: UserWarning: GPU available but not used. Set the gpus flag in your trainer `Trainer(gpus=1)` or script `--gpus=1`.\n", - " \"GPU available but not used. Set the gpus flag in your trainer `Trainer(gpus=1)` or script `--gpus=1`.\"\n", - "/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/data_loading.py:395: UserWarning: The number of training samples (1) is smaller than the logging interval Trainer(log_every_n_steps=50). 
Set a lower value for log_every_n_steps if you want to see logs for the training epoch.\n", - " f\"The number of training samples ({self.num_training_batches}) is smaller than the logging interval\"\n" - ] - }, - { - "output_type": "display_data", - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d79c1628eded487a974da18a2ea1f98b", - "version_minor": 0, - "version_major": 2 - }, - "text/plain": [ - "Training: 0it [00:00, ?it/s]" - ] - }, - "metadata": {} - }, - { - "output_type": "display_data", - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e6cbe583c2e14986b4faeb27e31f73e1", - "version_minor": 0, - "version_major": 2 - }, - "text/plain": [ - "Validating: 0it [00:00, ?it/s]" - ] - }, - "metadata": {} - }, - { - "output_type": "display_data", - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "abe1c0c4dac94e0e9b894bb69c3ec450", - "version_minor": 0, - "version_major": 2 - }, - "text/plain": [ - "Testing: 0it [00:00, ?it/s]" - ] - }, - "metadata": {} - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--------------------------------------------------------------------------------\n", - "DATALOADER:0 TEST RESULTS\n", - "{'test_loss': -1.676544427871704}\n", - "--------------------------------------------------------------------------------\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Flyi--SpvsJN" - }, - "source": [ - "---\n", - "## Environment\n", - "Run this to get the environment details" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "0-yvGFRoaDSi" - }, - "source": [ - "%%capture\n", - "! 
wget https://raw.githubusercontent.com/PyTorchLightning/pytorch-lightning/master/requirements/collect_env_details.py" - ], - "execution_count": 8, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "quj4LUDgmFvj", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "bb7a5f74-d52c-4927-b12a-49589aed7dcb" - }, - "source": [ - "! python collect_env_details.py" - ], - "execution_count": 9, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "* CUDA:\n", - "\t- GPU:\n", - "\t\t- Tesla K80\n", - "\t- available: True\n", - "\t- version: 11.1\n", - "* Packages:\n", - "\t- numpy: 1.19.5\n", - "\t- pyTorch_debug: False\n", - "\t- pyTorch_version: 1.10.0+cu111\n", - "\t- pytorch-lightning: 1.5.1\n", - "\t- tqdm: 4.62.3\n", - "* System:\n", - "\t- OS: Linux\n", - "\t- architecture:\n", - "\t\t- 64bit\n", - "\t\t- \n", - "\t- processor: x86_64\n", - "\t- python: 3.7.12\n", - "\t- version: #1 SMP Sat Jun 5 09:50:34 PDT 2021\n" - ] - } - ] - } - ] -} diff --git a/pl_examples/bug_report/bug_report_model.ipynb b/pl_examples/bug_report/bug_report_model.ipynb new file mode 100644 index 0000000000000..a6cb1933f113d --- /dev/null +++ b/pl_examples/bug_report/bug_report_model.ipynb @@ -0,0 +1,267 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "accelerator": "GPU", + "colab": { + "name": "bug_report_model.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "rR4_BAUYs3Mb" + }, + "source": [ + "![image.png]()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + 
"id": "i7XbLCXGkll9" + }, + "source": [ + "# The Boring Model\n", + "Replicate a bug you experience, using this model.\n", + "\n", + "[Remember! we're always available for support on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2LODD6w9ixlT" + }, + "source": [ + "---\n", + "## Setup env" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "zK7-Gg69kMnG" + }, + "source": [ + "%%capture\n", + "! pip install -qU pytorch-lightning" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WvuSN5jEbY8P" + }, + "source": [ + "---\n", + "## Deps" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "w4_TYnt_keJi" + }, + "source": [ + "import os\n", + "\n", + "import torch\n", + "from torch.utils.data import DataLoader, Dataset\n", + "\n", + "from pytorch_lightning import LightningModule, Trainer" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XrJDukwPtUnS" + }, + "source": [ + "---\n", + "## Data\n", + "Random data is best for debugging. 
If you need special tensor shapes or batch compositions or dataloaders, modify as needed" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "hvgTiaZpkvwS" + }, + "source": [ + "class RandomDataset(Dataset):\n", + " def __init__(self, size, num_samples):\n", + " self.len = num_samples\n", + " self.data = torch.randn(num_samples, size)\n", + "\n", + " def __getitem__(self, index):\n", + " return self.data[index]\n", + "\n", + " def __len__(self):\n", + " return self.len" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "sxVlWjGhl02D" + }, + "source": [ + "num_samples = 10000" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "V7ELesz1kVQo" + }, + "source": [ + "class BoringModel(LightningModule):\n", + " def __init__(self):\n", + " super().__init__()\n", + " self.layer = torch.nn.Linear(32, 2)\n", + "\n", + " def forward(self, x):\n", + " return self.layer(x)\n", + "\n", + " def training_step(self, batch, batch_idx):\n", + " loss = self(batch).sum()\n", + " self.log(\"train_loss\", loss)\n", + " return {\"loss\": loss}\n", + "\n", + " def validation_step(self, batch, batch_idx):\n", + " loss = self(batch).sum()\n", + " self.log(\"valid_loss\", loss)\n", + "\n", + " def test_step(self, batch, batch_idx):\n", + " loss = self(batch).sum()\n", + " self.log(\"test_loss\", loss)\n", + "\n", + " def configure_optimizers(self):\n", + " return torch.optim.SGD(self.layer.parameters(), lr=0.1)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ubvW3LGSupmt" + }, + "source": [ + "---\n", + "## Define the test" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "4Dk6Ykv8lI7X" + }, + "source": [ + "def run():\n", + " train_data = DataLoader(RandomDataset(32, 64), batch_size=2)\n", + " val_data = DataLoader(RandomDataset(32, 64), batch_size=2)\n", + " test_data = DataLoader(RandomDataset(32, 64), 
batch_size=2)\n", + "\n", + " model = BoringModel()\n", + " trainer = Trainer(\n", + " default_root_dir=os.getcwd(),\n", + " limit_train_batches=1,\n", + " limit_val_batches=1,\n", + " limit_test_batches=1,\n", + " num_sanity_val_steps=0,\n", + " max_epochs=1,\n", + " enable_model_summary=False,\n", + " )\n", + " trainer.fit(model, train_dataloaders=train_data, val_dataloaders=val_data)\n", + " trainer.test(model, dataloaders=test_data)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4dPfTZVgmgxz" + }, + "source": [ + "---\n", + "## Run Test" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "AAtq1hwSmjKe" + }, + "source": [ + "run()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Flyi--SpvsJN" + }, + "source": [ + "---\n", + "## Environment\n", + "Run this to get the environment details" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "0-yvGFRoaDSi" + }, + "source": [ + "%%capture\n", + "! wget https://raw.githubusercontent.com/PyTorchLightning/pytorch-lightning/master/requirements/collect_env_details.py" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "quj4LUDgmFvj" + }, + "source": [ + "! 
python collect_env_details.py" + ], + "execution_count": null, + "outputs": [] + } + ] +} diff --git a/pl_examples/bug_report/bug_report_model.py b/pl_examples/bug_report/bug_report_model.py index 270b0cd2abe8d..7739630237d32 100644 --- a/pl_examples/bug_report/bug_report_model.py +++ b/pl_examples/bug_report/bug_report_model.py @@ -53,6 +53,7 @@ def run(): default_root_dir=os.getcwd(), limit_train_batches=1, limit_val_batches=1, + limit_test_batches=1, num_sanity_val_steps=0, max_epochs=1, enable_model_summary=False, From f620dc8ea736a3e55f4d42cd7451b507683e67a2 Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Mon, 22 Nov 2021 19:59:06 +0530 Subject: [PATCH 040/123] Remove metrics references from docs (#10567) --- docs/source/advanced/multi_gpu.rst | 2 +- docs/source/extensions/logging.rst | 2 +- docs/source/extensions/metrics.rst | 9 --------- docs/source/index.rst | 1 - pyproject.toml | 1 - 5 files changed, 2 insertions(+), 13 deletions(-) delete mode 100644 docs/source/extensions/metrics.rst diff --git a/docs/source/advanced/multi_gpu.rst b/docs/source/advanced/multi_gpu.rst index 77784f8da0542..51d07f628620f 100644 --- a/docs/source/advanced/multi_gpu.rst +++ b/docs/source/advanced/multi_gpu.rst @@ -90,7 +90,7 @@ This is done by adding ``sync_dist=True`` to all ``self.log`` calls in the valid This ensures that each GPU worker has the same behaviour when tracking model checkpoints, which is important for later downstream tasks such as testing the best checkpoint across all workers. The ``sync_dist`` option can also be used in logging calls during the step methods, but be aware that this can lead to significant communication overhead and slow down your training. -Note if you use any built in metrics or custom metrics that use the :doc:`Metrics API <../extensions/metrics>`, these do not need to be updated and are automatically handled for you. 
+Note if you use any built in metrics or custom metrics that use `TorchMetrics `_, these do not need to be updated and are automatically handled for you. .. testcode:: diff --git a/docs/source/extensions/logging.rst b/docs/source/extensions/logging.rst index 1facdb93373eb..e652adbecc419 100644 --- a/docs/source/extensions/logging.rst +++ b/docs/source/extensions/logging.rst @@ -111,7 +111,7 @@ The :func:`~~pytorch_lightning.core.lightning.LightningModule.log` method has a .. note:: - Setting ``on_epoch=True`` will cache all your logged values during the full training epoch and perform a - reduction in ``on_train_epoch_end``. We recommend using the :doc:`metrics <../extensions/metrics>` API when working with custom reduction. + reduction in ``on_train_epoch_end``. We recommend using `TorchMetrics `_, when working with custom reduction. - Setting both ``on_step=True`` and ``on_epoch=True`` will create two keys per metric you log with suffix ``_step`` and ``_epoch``, respectively. You can refer to these keys e.g. in the `monitor` diff --git a/docs/source/extensions/metrics.rst b/docs/source/extensions/metrics.rst deleted file mode 100644 index 74a4a15deb2be..0000000000000 --- a/docs/source/extensions/metrics.rst +++ /dev/null @@ -1,9 +0,0 @@ -####### -Metrics -####### - -``pytorch_lightning.metrics`` has been moved to a separate package `TorchMetrics `_. -We will preserve compatibility for the next few releases, nevertheless, we encourage users to update to use this stand-alone package. - -.. warning:: - ``pytorch_lightning.metrics`` is deprecated from v1.3 and will be removed in v1.5. 
diff --git a/docs/source/index.rst b/docs/source/index.rst index 72da9c3e354c4..c1b20b958591b 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -84,7 +84,6 @@ PyTorch Lightning extensions/callbacks extensions/datamodules extensions/logging - extensions/metrics extensions/plugins extensions/loops diff --git a/pyproject.toml b/pyproject.toml index 6546d96e3d5e5..00e22c42e12ba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,7 +44,6 @@ module = [ "pytorch_lightning.core.*", "pytorch_lightning.loggers.*", "pytorch_lightning.loops.*", - "pytorch_lightning.metrics.*", "pytorch_lightning.overrides.*", "pytorch_lightning.plugins.environments.*", "pytorch_lightning.plugins.training_type.*", From 9cbc5da19d981465960d3b836627b8f7eac64482 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Mon, 22 Nov 2021 15:52:21 +0100 Subject: [PATCH 041/123] Fix docs filterwarnings snippet (#10671) --- docs/source/guides/speed.rst | 4 +--- pytorch_lightning/trainer/data_loading.py | 1 + 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/source/guides/speed.rst b/docs/source/guides/speed.rst index 0b8edb43b7ec5..87b5b9d0ad139 100644 --- a/docs/source/guides/speed.rst +++ b/docs/source/guides/speed.rst @@ -151,9 +151,7 @@ For debugging purposes or for dataloaders that load very small datasets, it is d import warnings - warnings.filterwarnings( - "ignore", ".*does not have many workers. 
Consider increasing the value of the `num_workers` argument*" - ) + warnings.filterwarnings("ignore", ".*Consider increasing the value of the `num_workers` argument*") Spawn """"" diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 9c40e728391c1..19cc3c4a38371 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -107,6 +107,7 @@ def _worker_check(self, dataloader: DataLoader, name: str) -> None: ) elif dataloader.num_workers <= 2 < num_cpus and not using_spawn: + # if changed, update the `filterwarnings` snippet in 'speed.html#num-workers' rank_zero_warn( f"The dataloader, {name}, does not have many workers which may be a bottleneck." " Consider increasing the value of the `num_workers` argument`" From c3341e6bc956f904e7d46a0d40626887e259b2ad Mon Sep 17 00:00:00 2001 From: shabie <30535146+shabie@users.noreply.github.com> Date: Thu, 18 Nov 2021 18:29:13 +0100 Subject: [PATCH 042/123] log metrics for correct dataloader only (#10522) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: tchaton Co-authored-by: Carlos Mocholí --- .../logging_/test_eval_loop_logging.py | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tests/trainer/logging_/test_eval_loop_logging.py b/tests/trainer/logging_/test_eval_loop_logging.py index d0e65e0429bc2..1109459938cdd 100644 --- a/tests/trainer/logging_/test_eval_loop_logging.py +++ b/tests/trainer/logging_/test_eval_loop_logging.py @@ -23,6 +23,7 @@ from pytorch_lightning import callbacks, Trainer from pytorch_lightning.loggers import TensorBoardLogger +from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers import BoringModel, RandomDataset @@ -672,3 +673,29 @@ def val_dataloader(self): enable_model_summary=False, ) trainer.fit(model) + + 
+@pytest.mark.parametrize( + ["kwargs", "expected"], + [ + ({"dl_idx": 0, "metrics": {"acc": 123}}, {"acc": 123}), + ( + {"dl_idx": 0, "metrics": {"acc/dataloader_idx_0": 123, "acc/dataloader_idx_1": 321}}, + {"acc/dataloader_idx_0": 123}, + ), + ( + {"dl_idx": 10, "metrics": {"acc/dataloader_idx_1": 123, "acc/dataloader_idx_10": 321}}, + {"acc/dataloader_idx_10": 321}, + ), + ( + {"dl_idx": 3, "metrics": {"top_3_acc/dataloader_idx_0": 123, "top_3_acc/dataloader_idx_3": 321}}, + {"top_3_acc/dataloader_idx_3": 321}, + ), + # theoretical case, as `/dataloader_idx_3` would have been added + ({"dl_idx": 3, "metrics": {"top_3_acc": 123}}, {"top_3_acc": 123}), + ], +) +def test_filter_metrics_for_dataloader(kwargs, expected): + """Logged metrics should only include metrics from the concerned dataloader.""" + actual = LoggerConnector._filter_metrics_for_dataloader(**kwargs) + assert actual == expected From 3f2b8574855902201d3d31a4cb2314865083fec8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Mon, 22 Nov 2021 16:58:21 +0100 Subject: [PATCH 043/123] Fix `move_metrics_to_cpu` with evaluation (#10631) --- CHANGELOG.md | 4 ++-- .../loops/epoch/evaluation_epoch_loop.py | 12 ++++++---- .../logging_/test_eval_loop_logging.py | 24 +++++++++++++++++++ 3 files changed, 33 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 40be4c31900a7..dddb3ddea25dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,10 +26,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed an issue that caused Lightning to extract the batch size even though it was set by the user in `LightningModule.log` ([#10408](https://github.com/PyTorchLightning/pytorch-lightning/pull/10408)) -- +- Fixed `Trainer(move_metrics_to_cpu=True)` not moving the evaluation logged results to CPU ([#10631](https://github.com/PyTorchLightning/pytorch-lightning/pull/10631)) -- +- Fixed the `{validation,test}_step` outputs getting moved to CPU with `Trainer(move_metrics_to_cpu=True)` ([#10631](https://github.com/PyTorchLightning/pytorch-lightning/pull/10631)) diff --git a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py index b4660c96a0989..102603f20302b 100644 --- a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py @@ -24,7 +24,6 @@ from pytorch_lightning.trainer.progress import BatchProgress from pytorch_lightning.utilities.auto_restart import MergedIteratorState, reload_dataloader_state_dict from pytorch_lightning.utilities.fetching import AbstractDataFetcher, DataFetcher -from pytorch_lightning.utilities.memory import recursive_detach from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.types import EPOCH_OUTPUT, STEP_OUTPUT @@ -134,10 +133,13 @@ def advance( self.trainer.logger_connector.update_eval_step_metrics() # track epoch level outputs - if self._should_track_batch_outputs_for_epoch_end(): - output = recursive_detach(output, to_cpu=self.trainer.move_metrics_to_cpu) - if output is not None: - self.outputs.append(output) + if self._should_track_batch_outputs_for_epoch_end() and output is not None: + self.outputs.append(output) + + if self.trainer.move_metrics_to_cpu: + # the evaluation step output is not moved as they are not considered "metrics" + assert self.trainer._results is not None + self.trainer._results.cpu() if not self.batch_progress.is_last_batch: # if fault tolerant is 
enabled and process has been notified, exit. diff --git a/tests/trainer/logging_/test_eval_loop_logging.py b/tests/trainer/logging_/test_eval_loop_logging.py index 1109459938cdd..433730d06111c 100644 --- a/tests/trainer/logging_/test_eval_loop_logging.py +++ b/tests/trainer/logging_/test_eval_loop_logging.py @@ -26,6 +26,7 @@ from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers import BoringModel, RandomDataset +from tests.helpers.runif import RunIf def test__validation_step__log(tmpdir): @@ -699,3 +700,26 @@ def test_filter_metrics_for_dataloader(kwargs, expected): """Logged metrics should only include metrics from the concerned dataloader.""" actual = LoggerConnector._filter_metrics_for_dataloader(**kwargs) assert actual == expected + + +@RunIf(min_gpus=1) +def test_evaluation_move_metrics_to_cpu_and_outputs(tmpdir): + class TestModel(BoringModel): + def validation_step(self, *args): + x = torch.tensor(2.0, requires_grad=True, device=self.device) + y = x * 2 + assert x.requires_grad is True + assert y.grad_fn is None # disabled by validation + + self.log("foo", y) + return y + + def validation_epoch_end(self, outputs): + # the step outputs were not moved + assert all(o.device == self.device for o in outputs), outputs + # but the logging results were + assert self.trainer.callback_metrics["foo"].device.type == "cpu" + + model = TestModel() + trainer = Trainer(default_root_dir=tmpdir, limit_val_batches=2, move_metrics_to_cpu=True, gpus=1) + trainer.validate(model, verbose=False) From bfc04472eb6f56e6ca73a20c7da0a9e43150c771 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 24 Nov 2021 12:52:22 +0100 Subject: [PATCH 044/123] Don't register signal in thread (#10610) Co-authored-by: tchaton --- CHANGELOG.md | 3 +++ .../trainer/connectors/signal_connector.py | 14 ++++++++++---- 
.../trainer/connectors/test_signal_connector.py | 17 +++++++++++++++++ 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dddb3ddea25dc..803b31578c168 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed the `{validation,test}_step` outputs getting moved to CPU with `Trainer(move_metrics_to_cpu=True)` ([#10631](https://github.com/PyTorchLightning/pytorch-lightning/pull/10631)) +- Fixed signals being registered within threads ([#10610](https://github.com/PyTorchLightning/pytorch-lightning/pull/10610)) + + ## [1.5.2] - 2021-11-16 diff --git a/pytorch_lightning/trainer/connectors/signal_connector.py b/pytorch_lightning/trainer/connectors/signal_connector.py index dc33d1244441f..7344f076c3972 100644 --- a/pytorch_lightning/trainer/connectors/signal_connector.py +++ b/pytorch_lightning/trainer/connectors/signal_connector.py @@ -2,6 +2,7 @@ import os import signal import sys +import threading from signal import Signals from subprocess import call from types import FrameType, FunctionType @@ -43,11 +44,11 @@ def register_signal_handlers(self) -> None: # signal.SIGUSR1 doesn't seem available on windows if not self._is_on_windows(): - if not self._has_already_handler(signal.SIGUSR1): - signal.signal(signal.SIGUSR1, HandlersCompose(sigusr1_handlers)) + if sigusr1_handlers and not self._has_already_handler(signal.SIGUSR1): + self._register_signal(signal.SIGUSR1, HandlersCompose(sigusr1_handlers)) - if not self._has_already_handler(signal.SIGTERM): - signal.signal(signal.SIGTERM, HandlersCompose(sigterm_handlers)) + if sigterm_handlers and not self._has_already_handler(signal.SIGTERM): + self._register_signal(signal.SIGTERM, HandlersCompose(sigterm_handlers)) def slurm_sigusr1_handler_fn(self, signum: Signals, frame: FrameType) -> None: if self.trainer.is_global_zero: @@ -107,3 +108,8 @@ def _has_already_handler(self, signum: Signals) 
-> bool: return isinstance(signal.getsignal(signum), FunctionType) except AttributeError: return False + + @staticmethod + def _register_signal(signum: Signals, handlers: HandlersCompose) -> None: + if threading.current_thread() is threading.main_thread(): + signal.signal(signum, handlers) diff --git a/tests/trainer/connectors/test_signal_connector.py b/tests/trainer/connectors/test_signal_connector.py index 3da8c100fe40c..e89365e2e2056 100644 --- a/tests/trainer/connectors/test_signal_connector.py +++ b/tests/trainer/connectors/test_signal_connector.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import concurrent.futures import os import signal from time import sleep @@ -57,3 +58,19 @@ def training_step(self, batch, batch_idx): else: trainer.fit(model) assert trainer._terminate_gracefully == (False if register_handler else terminate_gracefully) + + # reset the signal to system defaults + signal.signal(signal.SIGUSR1, signal.SIG_DFL) + + +def _registering_signals(): + trainer = Trainer() + trainer.signal_connector.register_signal_handlers() + + +@RunIf(skip_windows=True) +@mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) +def test_signal_connector_in_thread(): + with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor: + for future in concurrent.futures.as_completed([executor.submit(_registering_signals)]): + assert future.exception() is None From fb1029f23c74c262ec8d752ff6b1a1e2524a568b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 19 Nov 2021 15:38:42 +0100 Subject: [PATCH 045/123] MANIFEST.in and setup.py clean-up (#7614) --- .github/workflows/ci_pkg-install.yml | 5 +- MANIFEST.in | 67 +----------- .../basic_examples/mnist_datamodule.py | 102 +++++++++++++++++- pl_examples/run_examples.sh | 1 + pl_examples/test_examples.py | 6 +- requirements/test.txt | 
1 - setup.cfg | 8 -- setup.py | 4 +- tests/helpers/datasets.py | 5 +- tests/special_tests.sh | 1 + 10 files changed, 114 insertions(+), 86 deletions(-) diff --git a/.github/workflows/ci_pkg-install.yml b/.github/workflows/ci_pkg-install.yml index 12f3976d078e4..2cf64f39a28b7 100644 --- a/.github/workflows/ci_pkg-install.yml +++ b/.github/workflows/ci_pkg-install.yml @@ -26,12 +26,11 @@ jobs: - name: Prepare env run: | - pip install check-manifest "twine==3.2" setuptools wheel + pip install "twine==3.2" setuptools wheel - name: Create package run: | - check-manifest - # python setup.py check --metadata --strict + python setup.py check --metadata --strict python setup.py sdist bdist_wheel - name: Check package diff --git a/MANIFEST.in b/MANIFEST.in index b810937f1a495..a68fc82474e70 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -11,69 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- -# Manifest syntax https://docs.python.org/2/distutils/sourcedist.html -graft wheelhouse - -recursive-exclude __pycache__ *.py[cod] *.orig - -# Include the README and CHANGELOG -include *.md - -# Include the license file -include LICENSE - -# Include the citation info -include *.cff - -exclude *.sh -exclude *.svg -recursive-include pytorch_lightning *.py - -# Include marker file for PEP 561 -include pytorch_lightning/py.typed - -# include examples -recursive-include pl_examples *.py *.md *.sh *.txt *.toml - -# exclude tests from package -recursive-exclude tests * -recursive-exclude site * -exclude tests - -# Exclude the documentation files -recursive-exclude docs * -exclude docs -recursive-include docs/source/_static/images/logos/ * -recursive-include docs/source/_static/images/general/ pl_overview* tf_* tutorial_* PTL101_* - -# Include the Requirements +include pytorch_lightning/py.typed # marker file for PEP 561 +include CHANGELOG.md recursive-include requirements *.txt -recursive-exclude requirements *.sh *.py include requirements.txt -include pyproject.toml - -# Exclude build configs -exclude *.yml -exclude *.yaml -exclude *.toml -exclude *.jsonnet - -# Exclude pyright config -exclude .pyrightconfig.json - -# Exclude submodules -exclude .gitmodules -exclude _notebooks - -# Exclude Makefile -exclude Makefile - -prune .git -prune .github -prune .circleci -prune temp* -prune test* -prune benchmark* -prune dockers -prune legacy +include *.cff # citation info diff --git a/pl_examples/basic_examples/mnist_datamodule.py b/pl_examples/basic_examples/mnist_datamodule.py index 1d2371c702ce0..a8d33b287f380 100644 --- a/pl_examples/basic_examples/mnist_datamodule.py +++ b/pl_examples/basic_examples/mnist_datamodule.py @@ -11,13 +11,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import logging import os import platform -from typing import Optional +import random +import time +import urllib +from typing import Optional, Tuple from urllib.error import HTTPError from warnings import warn -from torch.utils.data import DataLoader, random_split +import torch +from torch.utils.data import DataLoader, Dataset, random_split from pl_examples import _DATASETS_PATH from pytorch_lightning import LightningDataModule @@ -27,6 +32,97 @@ from torchvision import transforms as transform_lib +class _MNIST(Dataset): + """Carbon copy of ``tests.helpers.datasets.MNIST``. + + We cannot import the tests as they are not distributed with the package. + See https://github.com/PyTorchLightning/pytorch-lightning/pull/7614#discussion_r671183652 for more context. + """ + + RESOURCES = ( + "https://pl-public-data.s3.amazonaws.com/MNIST/processed/training.pt", + "https://pl-public-data.s3.amazonaws.com/MNIST/processed/test.pt", + ) + + TRAIN_FILE_NAME = "training.pt" + TEST_FILE_NAME = "test.pt" + cache_folder_name = "complete" + + def __init__( + self, root: str, train: bool = True, normalize: tuple = (0.1307, 0.3081), download: bool = True, **kwargs + ): + super().__init__() + self.root = root + self.train = train # training set or test set + self.normalize = normalize + + self.prepare_data(download) + + data_file = self.TRAIN_FILE_NAME if self.train else self.TEST_FILE_NAME + self.data, self.targets = self._try_load(os.path.join(self.cached_folder_path, data_file)) + + def __getitem__(self, idx: int) -> Tuple[torch.Tensor, int]: + img = self.data[idx].float().unsqueeze(0) + target = int(self.targets[idx]) + + if self.normalize is not None and len(self.normalize) == 2: + img = self.normalize_tensor(img, *self.normalize) + + return img, target + + def __len__(self) -> int: + return len(self.data) + + @property + def cached_folder_path(self) -> str: + return os.path.join(self.root, "MNIST", self.cache_folder_name) + + def _check_exists(self, data_folder: str) -> bool: + 
existing = True + for fname in (self.TRAIN_FILE_NAME, self.TEST_FILE_NAME): + existing = existing and os.path.isfile(os.path.join(data_folder, fname)) + return existing + + def prepare_data(self, download: bool = True): + if download and not self._check_exists(self.cached_folder_path): + self._download(self.cached_folder_path) + if not self._check_exists(self.cached_folder_path): + raise RuntimeError("Dataset not found.") + + def _download(self, data_folder: str) -> None: + os.makedirs(data_folder, exist_ok=True) + for url in self.RESOURCES: + logging.info(f"Downloading {url}") + fpath = os.path.join(data_folder, os.path.basename(url)) + urllib.request.urlretrieve(url, fpath) + + @staticmethod + def _try_load(path_data, trials: int = 30, delta: float = 1.0): + """Resolving loading from the same time from multiple concurrent processes.""" + res, exception = None, None + assert trials, "at least some trial has to be set" + assert os.path.isfile(path_data), f"missing file: {path_data}" + for _ in range(trials): + try: + res = torch.load(path_data) + # todo: specify the possible exception + except Exception as e: + exception = e + time.sleep(delta * random.random()) + else: + break + if exception is not None: + # raise the caught exception + raise exception + return res + + @staticmethod + def normalize_tensor(tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0) -> torch.Tensor: + mean = torch.as_tensor(mean, dtype=tensor.dtype, device=tensor.device) + std = torch.as_tensor(std, dtype=tensor.dtype, device=tensor.device) + return tensor.sub(mean).div(std) + + def MNIST(*args, **kwargs): torchvision_mnist_available = not bool(os.getenv("PL_USE_MOCKED_MNIST", False)) if torchvision_mnist_available: @@ -39,7 +135,7 @@ def MNIST(*args, **kwargs): torchvision_mnist_available = False if not torchvision_mnist_available: print("`torchvision.datasets.MNIST` not available. 
Using our hosted version") - from tests.helpers.datasets import MNIST + MNIST = _MNIST return MNIST(*args, **kwargs) diff --git a/pl_examples/run_examples.sh b/pl_examples/run_examples.sh index 4a15c3367d35f..a04a57631d9cb 100755 --- a/pl_examples/run_examples.sh +++ b/pl_examples/run_examples.sh @@ -1,6 +1,7 @@ #!/bin/bash set -ex +export PYTHONPATH="${PYTHONPATH}:$(pwd)" dir_path=$(dirname "${BASH_SOURCE[0]}") args=" --data.batch_size=32 diff --git a/pl_examples/test_examples.py b/pl_examples/test_examples.py index 19d09836ef34c..00ca558c53606 100644 --- a/pl_examples/test_examples.py +++ b/pl_examples/test_examples.py @@ -14,9 +14,10 @@ from unittest import mock import pytest +import torch from pl_examples import _DALI_AVAILABLE -from tests.helpers.runif import RunIf +from pytorch_lightning.utilities.imports import _IS_WINDOWS ARGS_DEFAULT = ( "--trainer.default_root_dir %(tmpdir)s " @@ -31,7 +32,8 @@ @pytest.mark.skipif(not _DALI_AVAILABLE, reason="Nvidia DALI required") -@RunIf(min_gpus=1, skip_windows=True) +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required") +@pytest.mark.skipif(_IS_WINDOWS, reason="Not supported on Windows") @pytest.mark.parametrize("cli_args", [ARGS_GPU]) def test_examples_mnist_dali(tmpdir, cli_args): from pl_examples.integration_examples.dali_image_classifier import cli_main diff --git a/requirements/test.txt b/requirements/test.txt index de749e2339f10..d86137b037f4d 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -2,7 +2,6 @@ coverage>5.2.0 codecov>=2.1 pytest>=6.0 pytest-rerunfailures>=10.2 -check-manifest twine==3.2 mypy>=0.900 flake8>=3.9.2 diff --git a/setup.cfg b/setup.cfg index 9d63c0e556341..20f00ef8ae102 100644 --- a/setup.cfg +++ b/setup.cfg @@ -66,14 +66,6 @@ ignore = W503 # Ignore "Line break occurred before a binary operator" E203 # Ignore "whitespace before ':'" -# setup.cfg or tox.ini -[check-manifest] -ignore = - *.yml - .github - .github/* - .circleci - [metadata] license_file = 
LICENSE diff --git a/setup.py b/setup.py index ddbae8b974a03..9d54a0d5641f5 100755 --- a/setup.py +++ b/setup.py @@ -74,10 +74,10 @@ def _load_py_module(fname, pkg="pytorch_lightning"): url=about.__homepage__, download_url="https://github.com/PyTorchLightning/pytorch-lightning", license=about.__license__, - packages=find_packages(exclude=["tests", "tests/*", "benchmarks", "legacy", "legacy/*"]), + packages=find_packages(exclude=["tests*", "pl_examples*", "legacy*"]), + include_package_data=True, long_description=long_description, long_description_content_type="text/markdown", - include_package_data=True, zip_safe=False, keywords=["deep learning", "pytorch", "AI"], python_requires=">=3.6", diff --git a/tests/helpers/datasets.py b/tests/helpers/datasets.py index 561642ae8cfbe..33bf1d9b8e13f 100644 --- a/tests/helpers/datasets.py +++ b/tests/helpers/datasets.py @@ -19,7 +19,6 @@ from typing import Optional, Sequence, Tuple import torch -from torch import Tensor from torch.utils.data import Dataset @@ -70,7 +69,7 @@ def __init__( data_file = self.TRAIN_FILE_NAME if self.train else self.TEST_FILE_NAME self.data, self.targets = self._try_load(os.path.join(self.cached_folder_path, data_file)) - def __getitem__(self, idx: int) -> Tuple[Tensor, int]: + def __getitem__(self, idx: int) -> Tuple[torch.Tensor, int]: img = self.data[idx].float().unsqueeze(0) target = int(self.targets[idx]) @@ -126,7 +125,7 @@ def _try_load(path_data, trials: int = 30, delta: float = 1.0): return res @staticmethod - def normalize_tensor(tensor: Tensor, mean: float = 0.0, std: float = 1.0) -> Tensor: + def normalize_tensor(tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0) -> torch.Tensor: mean = torch.as_tensor(mean, dtype=tensor.dtype, device=tensor.device) std = torch.as_tensor(std, dtype=tensor.dtype, device=tensor.device) return tensor.sub(mean).div(std) diff --git a/tests/special_tests.sh b/tests/special_tests.sh index f4b760dd75291..ff6dbcecb34cd 100755 --- a/tests/special_tests.sh 
+++ b/tests/special_tests.sh @@ -87,6 +87,7 @@ fi # report+="Ran\ttests/plugins/environments/torch_elastic_deadlock.py\n" # test that a user can manually launch individual processes +export PYTHONPATH="${PYTHONPATH}:$(pwd)" args="--trainer.gpus 2 --trainer.strategy ddp --trainer.max_epochs=1 --trainer.limit_train_batches=1 --trainer.limit_val_batches=1 --trainer.limit_test_batches=1" MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=1 python pl_examples/basic_examples/mnist_examples/image_classifier_5_lightning_datamodule.py ${args} & MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=0 python pl_examples/basic_examples/mnist_examples/image_classifier_5_lightning_datamodule.py ${args} From 1d3b6a20ab75c855148d74c9994a69a2ccddd50b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 17 Nov 2021 16:46:14 +0100 Subject: [PATCH 046/123] Support special test parametrizations (#10569) --- tests/accelerators/test_ddp.py | 12 +-- tests/callbacks/test_pruning.py | 25 ++----- tests/callbacks/test_tqdm_progress_bar.py | 19 ++--- .../test_checkpoint_callback_frequency.py | 13 +--- tests/conftest.py | 13 ++++ tests/helpers/runif.py | 2 + tests/models/test_hooks.py | 12 +-- tests/special_tests.sh | 74 +++++++++---------- tests/trainer/test_trainer.py | 29 ++++---- 9 files changed, 82 insertions(+), 117 deletions(-) diff --git a/tests/accelerators/test_ddp.py b/tests/accelerators/test_ddp.py index 6b28640e92ab4..1982e967c21ea 100644 --- a/tests/accelerators/test_ddp.py +++ b/tests/accelerators/test_ddp.py @@ -109,16 +109,8 @@ def setup(self, stage: Optional[str] = None) -> None: @RunIf(min_gpus=2, min_torch="1.8.1", special=True) -def test_ddp_wrapper_16(tmpdir): - _test_ddp_wrapper(tmpdir, precision=16) - - -@RunIf(min_gpus=2, min_torch="1.8.1", special=True) -def test_ddp_wrapper_32(tmpdir): - _test_ddp_wrapper(tmpdir, precision=32) - - -def _test_ddp_wrapper(tmpdir, precision): +@pytest.mark.parametrize("precision", (16, 32)) +def 
test_ddp_wrapper(tmpdir, precision): """Test parameters to ignore are carried over for DDP.""" class WeirdModule(torch.nn.Module): diff --git a/tests/callbacks/test_pruning.py b/tests/callbacks/test_pruning.py index c813ed2b02e28..ec4dcddf777c0 100644 --- a/tests/callbacks/test_pruning.py +++ b/tests/callbacks/test_pruning.py @@ -161,27 +161,18 @@ def test_pruning_callback( @RunIf(special=True, min_gpus=2) -def test_pruning_callback_ddp_0(tmpdir): +@pytest.mark.parametrize("parameters_to_prune", (False, True)) +@pytest.mark.parametrize("use_global_unstructured", (False, True)) +def test_pruning_callback_ddp(tmpdir, parameters_to_prune, use_global_unstructured): train_with_pruning_callback( - tmpdir, parameters_to_prune=False, use_global_unstructured=False, strategy="ddp", gpus=2 + tmpdir, + parameters_to_prune=parameters_to_prune, + use_global_unstructured=use_global_unstructured, + strategy="ddp", + gpus=2, ) -@RunIf(special=True, min_gpus=2) -def test_pruning_callback_ddp_1(tmpdir): - train_with_pruning_callback(tmpdir, parameters_to_prune=False, use_global_unstructured=True, strategy="ddp", gpus=2) - - -@RunIf(special=True, min_gpus=2) -def test_pruning_callback_ddp_2(tmpdir): - train_with_pruning_callback(tmpdir, parameters_to_prune=True, use_global_unstructured=False, strategy="ddp", gpus=2) - - -@RunIf(special=True, min_gpus=2) -def test_pruning_callback_ddp_3(tmpdir): - train_with_pruning_callback(tmpdir, parameters_to_prune=True, use_global_unstructured=True, strategy="ddp", gpus=2) - - @RunIf(min_gpus=2, skip_windows=True) def test_pruning_callback_ddp_spawn(tmpdir): train_with_pruning_callback(tmpdir, use_global_unstructured=True, strategy="ddp_spawn", gpus=2) diff --git a/tests/callbacks/test_tqdm_progress_bar.py b/tests/callbacks/test_tqdm_progress_bar.py index b92fb18d54ccd..d25c263443168 100644 --- a/tests/callbacks/test_tqdm_progress_bar.py +++ b/tests/callbacks/test_tqdm_progress_bar.py @@ -522,20 +522,11 @@ def 
test_tqdm_progress_bar_can_be_pickled(): @RunIf(min_gpus=2, special=True) -def test_tqdm_progress_bar_max_val_check_interval_0(tmpdir): - _test_progress_bar_max_val_check_interval( - tmpdir, total_train_samples=8, train_batch_size=4, total_val_samples=2, val_batch_size=1, val_check_interval=0.2 - ) - - -@RunIf(min_gpus=2, special=True) -def test_tqdm_progress_bar_max_val_check_interval_1(tmpdir): - _test_progress_bar_max_val_check_interval( - tmpdir, total_train_samples=8, train_batch_size=4, total_val_samples=2, val_batch_size=1, val_check_interval=0.5 - ) - - -def _test_progress_bar_max_val_check_interval( +@pytest.mark.parametrize( + ["total_train_samples", "train_batch_size", "total_val_samples", "val_batch_size", "val_check_interval"], + [(8, 4, 2, 1, 0.2), (8, 4, 2, 1, 0.5)], +) +def test_progress_bar_max_val_check_interval( tmpdir, total_train_samples, train_batch_size, total_val_samples, val_batch_size, val_check_interval ): world_size = 2 diff --git a/tests/checkpointing/test_checkpoint_callback_frequency.py b/tests/checkpointing/test_checkpoint_callback_frequency.py index c75d7332e2e42..fd5c76b2faef7 100644 --- a/tests/checkpointing/test_checkpoint_callback_frequency.py +++ b/tests/checkpointing/test_checkpoint_callback_frequency.py @@ -88,17 +88,8 @@ def training_step(self, batch, batch_idx): @mock.patch("torch.save") @RunIf(special=True, min_gpus=2) -def test_top_k_ddp_0(save_mock, tmpdir): - _top_k_ddp(save_mock, tmpdir, k=1, epochs=1, val_check_interval=1.0, expected=1) - - -@mock.patch("torch.save") -@RunIf(special=True, min_gpus=2) -def test_top_k_ddp_1(save_mock, tmpdir): - _top_k_ddp(save_mock, tmpdir, k=2, epochs=2, val_check_interval=0.3, expected=4) - - -def _top_k_ddp(save_mock, tmpdir, k, epochs, val_check_interval, expected): +@pytest.mark.parametrize(["k", "epochs", "val_check_interval", "expected"], [(1, 1, 1.0, 1), (2, 2, 0.3, 4)]) +def test_top_k_ddp(save_mock, tmpdir, k, epochs, val_check_interval, expected): class 
TestModel(BoringModel): def training_step(self, batch, batch_idx): local_rank = int(os.getenv("LOCAL_RANK")) diff --git a/tests/conftest.py b/tests/conftest.py index 860f9357e4636..2bb5715ce9ee9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -156,3 +156,16 @@ def single_process_pg(): torch.distributed.destroy_process_group() os.environ.clear() os.environ.update(orig_environ) + + +def pytest_collection_modifyitems(items): + if os.getenv("PL_RUNNING_SPECIAL_TESTS", "0") != "1": + return + # filter out non-special tests + items[:] = [ + item + for item in items + for marker in item.own_markers + # has `@RunIf(special=True)` + if marker.name == "skipif" and marker.kwargs.get("special") + ] diff --git a/tests/helpers/runif.py b/tests/helpers/runif.py index e53d3811f6b34..5cdf422cf4fdb 100644 --- a/tests/helpers/runif.py +++ b/tests/helpers/runif.py @@ -150,6 +150,8 @@ def __new__( env_flag = os.getenv("PL_RUNNING_SPECIAL_TESTS", "0") conditions.append(env_flag != "1") reasons.append("Special execution") + # used in tests/conftest.py::pytest_collection_modifyitems + kwargs["special"] = True if fairscale: conditions.append(not _FAIRSCALE_AVAILABLE) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 6b34553ff313b..c710aaf952458 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -423,16 +423,10 @@ def _predict_batch(trainer, model, batches): @RunIf(deepspeed=True, min_gpus=1, special=True) -def test_trainer_model_hook_system_fit_deepspeed_automatic_optimization(tmpdir): - _run_trainer_model_hook_system_fit( - dict(gpus=1, precision=16, strategy="deepspeed"), tmpdir, automatic_optimization=True - ) - - -@RunIf(deepspeed=True, min_gpus=1, special=True) -def test_trainer_model_hook_system_fit_deepspeed_manual_optimization(tmpdir): +@pytest.mark.parametrize("automatic_optimization", (True, False)) +def test_trainer_model_hook_system_fit_deepspeed(tmpdir, automatic_optimization): _run_trainer_model_hook_system_fit( - 
dict(gpus=1, precision=16, strategy="deepspeed"), tmpdir, automatic_optimization=False + dict(gpus=1, precision=16, strategy="deepspeed"), tmpdir, automatic_optimization=automatic_optimization ) diff --git a/tests/special_tests.sh b/tests/special_tests.sh index ff6dbcecb34cd..4c9ded2a33bd4 100755 --- a/tests/special_tests.sh +++ b/tests/special_tests.sh @@ -17,55 +17,49 @@ set -e # this environment variable allows special tests to run export PL_RUNNING_SPECIAL_TESTS=1 # python arguments -defaults='-m coverage run --source pytorch_lightning --append -m pytest --durations=0 --capture=no --disable-warnings' +defaults='-m coverage run --source pytorch_lightning --append -m pytest --capture=no' -# find tests marked as `@RunIf(special=True)` -grep_output=$(grep --recursive --line-number --word-regexp 'tests' 'benchmarks' --regexp 'special=True') -# file paths -files=$(echo "$grep_output" | cut -f1 -d:) -files_arr=($files) -# line numbers -linenos=$(echo "$grep_output" | cut -f2 -d:) -linenos_arr=($linenos) +# find tests marked as `@RunIf(special=True)`. done manually instead of with pytest because it is faster +grep_output=$(grep --recursive --word-regexp 'tests' 'benchmarks' --regexp 'special=True' --include '*.py' --exclude 'tests/conftest.py') + +# file paths, remove duplicates +files=$(echo "$grep_output" | cut -f1 -d: | sort | uniq) + +# get the list of parametrizations. we need to call them separately. the last two lines are removed. 
+# note: if there's a syntax error, this will fail with some garbled output +if [[ "$OSTYPE" == "darwin"* ]]; then + parametrizations=$(pytest $files --collect-only --quiet | tail -r | sed -e '1,3d' | tail -r) +else + parametrizations=$(pytest $files --collect-only --quiet | head -n -2) +fi +parametrizations_arr=($parametrizations) # tests to skip - space separated -blocklist='test_pytorch_profiler_nested_emit_nvtx' +blocklist='tests/profiler/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx' report='' -for i in "${!files_arr[@]}"; do - file=${files_arr[$i]} - lineno=${linenos_arr[$i]} - - # get code from `@RunIf(special=True)` line to EOF - test_code=$(tail -n +"$lineno" "$file") +for i in "${!parametrizations_arr[@]}"; do + parametrization=${parametrizations_arr[$i]} - # read line by line - while read -r line; do - # if it's a test - if [[ $line == def\ test_* ]]; then - # get the name - test_name=$(echo $line | cut -c 5- | cut -f1 -d\() + # check blocklist + if echo $blocklist | grep -F "${parametrization}"; then + report+="Skipped\t$parametrization\n" + continue + fi - # check blocklist - if echo $blocklist | grep --word-regexp "$test_name" > /dev/null; then - report+="Skipped\t$file:$lineno::$test_name\n" - break - fi + # SPECIAL_PATTERN allows filtering the tests to run when debugging. + # use as `SPECIAL_PATTERN="foo_bar" ./special_tests.sh` to run only those + # test with `foo_bar` in their name + if [[ $parametrization != *$SPECIAL_PATTERN* ]]; then + report+="Skipped\t$parametrization\n" + continue + fi - # SPECIAL_PATTERN allows filtering the tests to run when debugging. 
- # use as `SPECIAL_PATTERN="foo_bar" ./special_tests.sh` to run only those - # test with `foo_bar` in their name - if [[ $line != *$SPECIAL_PATTERN* ]]; then - report+="Skipped\t$file:$lineno::$test_name\n" - break - fi + # run the test + echo "Running ${parametrization}" + python ${defaults} "${parametrization}" - # run the test - report+="Ran\t$file:$lineno::$test_name\n" - python ${defaults} "${file}::${test_name}" - break - fi - done < <(echo "$test_code") + report+="Ran\t$parametrization\n" done if nvcc --version; then diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index c4a8884a23ccd..7c2c6e9b55e9e 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1453,29 +1453,26 @@ def test_trainer_predict_cpu(tmpdir, datamodule, enable_progress_bar): @RunIf(min_gpus=2, special=True) -@pytest.mark.parametrize("num_gpus", [1, 2]) -def test_trainer_predict_dp(tmpdir, num_gpus): - predict(tmpdir, strategy="dp", accelerator="gpu", devices=num_gpus) - - -@RunIf(min_gpus=2, special=True, fairscale=True) -def test_trainer_predict_ddp(tmpdir): - predict(tmpdir, strategy="ddp", accelerator="gpu", devices=2) - - -@RunIf(min_gpus=2, skip_windows=True, special=True) -def test_trainer_predict_ddp_spawn(tmpdir): - predict(tmpdir, strategy="dp", accelerator="gpu", devices=2) +@pytest.mark.parametrize( + "kwargs", + [ + {"strategy": "dp", "devices": 1}, + {"strategy": "dp", "devices": 2}, + {"strategy": "ddp", "devices": 2}, + ], +) +def test_trainer_predict_special(tmpdir, kwargs): + predict(tmpdir, accelerator="gpu", **kwargs) -@RunIf(min_gpus=1, special=True) +@RunIf(min_gpus=1) def test_trainer_predict_1_gpu(tmpdir): predict(tmpdir, accelerator="gpu", devices=1) @RunIf(skip_windows=True) -def test_trainer_predict_ddp_cpu(tmpdir): - predict(tmpdir, strategy="ddp_spawn", accelerator="cpu", devices=2) +def test_trainer_predict_ddp_spawn(tmpdir): + predict(tmpdir, strategy="ddp_spawn", accelerator="auto", devices=2) 
@pytest.mark.parametrize("dataset_cls", [RandomDataset, RandomIterableDatasetWithLen, RandomIterableDataset]) From 963d0b0ea35b7295d0a2876f6f54231d65de1e71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 19 Nov 2021 03:07:33 +0100 Subject: [PATCH 047/123] Move benchmarks into the test directory (#10614) --- .azure-pipelines/gpu-benchmark.yml | 2 +- .azure-pipelines/gpu-tests.yml | 4 ++-- benchmarks/__init__.py | 18 ------------------ pyproject.toml | 6 +----- tests/benchmarks/__init__.py | 0 .../benchmarks}/generate_comparison.py | 2 +- .../benchmarks}/test_basic_parity.py | 0 .../benchmarks}/test_sharded_parity.py | 0 tests/special_tests.sh | 2 +- 9 files changed, 6 insertions(+), 28 deletions(-) delete mode 100644 benchmarks/__init__.py create mode 100644 tests/benchmarks/__init__.py rename {benchmarks => tests/benchmarks}/generate_comparison.py (97%) rename {benchmarks => tests/benchmarks}/test_basic_parity.py (100%) rename {benchmarks => tests/benchmarks}/test_sharded_parity.py (100%) diff --git a/.azure-pipelines/gpu-benchmark.yml b/.azure-pipelines/gpu-benchmark.yml index f8b9593d72798..6d45cc2f4566a 100644 --- a/.azure-pipelines/gpu-benchmark.yml +++ b/.azure-pipelines/gpu-benchmark.yml @@ -36,7 +36,7 @@ jobs: steps: - bash: | - python -m pytest benchmarks -v --durations=0 + python -m pytest tests/benchmarks -v --durations=0 displayName: 'Testing: benchmarks' env: PL_RUNNING_BENCHMARKS: 1 diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index f1af36a6090b9..71332a840fdb0 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -68,7 +68,7 @@ jobs: displayName: 'Get legacy checkpoints' - bash: | - python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50 + python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests --ignore tests/benchmarks -v 
--junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50 displayName: 'Testing: standard' - bash: | @@ -113,5 +113,5 @@ jobs: displayName: 'Testing: examples' - bash: | - python -m pytest benchmarks -v --maxfail=2 --durations=0 + python -m pytest tests/benchmarks -v --maxfail=2 --durations=0 displayName: 'Testing: benchmarks' diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py deleted file mode 100644 index b4a3da40d40d0..0000000000000 --- a/benchmarks/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import os - -_BENCHMARK_ROOT = os.path.dirname(__file__) -_PROJECT_ROOT = os.path.dirname(_BENCHMARK_ROOT) -_PATH_DATASETS = os.path.join(_PROJECT_ROOT, "Datasets") diff --git a/pyproject.toml b/pyproject.toml index 00e22c42e12ba..c527ffaa856cf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,6 @@ requires = [ [tool.isort] known_first_party = [ - "benchmarks", "docs", "pl_examples", "pytorch_lightning", @@ -24,7 +23,7 @@ line-length = 120 [tool.mypy] -files = ["pytorch_lightning", "pl_examples", "benchmarks"] +files = ["pytorch_lightning"] disallow_untyped_defs = "True" ignore_missing_imports = "True" show_error_codes = "True" @@ -52,9 +51,6 @@ module = [ "pytorch_lightning.distributed.*", "pytorch_lightning.tuner.*", "pytorch_lightning.utilities.*", - "pl_examples.*", - "benchmarks.*", - "tests.helpers.*" ] ignore_errors = "True" diff --git a/tests/benchmarks/__init__.py b/tests/benchmarks/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/benchmarks/generate_comparison.py b/tests/benchmarks/generate_comparison.py similarity index 97% rename from benchmarks/generate_comparison.py rename to tests/benchmarks/generate_comparison.py index 5a9cde0d80ed3..bc95b5d9cf591 100644 --- a/benchmarks/generate_comparison.py +++ b/tests/benchmarks/generate_comparison.py @@ -16,7 +16,7 @@ import matplotlib.pylab as plt import pandas as pd -from benchmarks.test_basic_parity import measure_loops +from tests.benchmarks.test_basic_parity import measure_loops from tests.helpers.advanced_models import ParityModuleMNIST, ParityModuleRNN NUM_EPOCHS = 20 diff --git a/benchmarks/test_basic_parity.py b/tests/benchmarks/test_basic_parity.py similarity index 100% rename from benchmarks/test_basic_parity.py rename to tests/benchmarks/test_basic_parity.py diff --git a/benchmarks/test_sharded_parity.py b/tests/benchmarks/test_sharded_parity.py similarity index 100% rename from benchmarks/test_sharded_parity.py rename to 
tests/benchmarks/test_sharded_parity.py diff --git a/tests/special_tests.sh b/tests/special_tests.sh index 4c9ded2a33bd4..27abaa6cc62e3 100755 --- a/tests/special_tests.sh +++ b/tests/special_tests.sh @@ -20,7 +20,7 @@ export PL_RUNNING_SPECIAL_TESTS=1 defaults='-m coverage run --source pytorch_lightning --append -m pytest --capture=no' # find tests marked as `@RunIf(special=True)`. done manually instead of with pytest because it is faster -grep_output=$(grep --recursive --word-regexp 'tests' 'benchmarks' --regexp 'special=True' --include '*.py' --exclude 'tests/conftest.py') +grep_output=$(grep --recursive --word-regexp 'tests' --regexp 'special=True' --include '*.py' --exclude 'tests/conftest.py') # file paths, remove duplicates files=$(echo "$grep_output" | cut -f1 -d: | sort | uniq) From b2086538242a338dbc376ff83606bab63e5fabb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 24 Nov 2021 15:03:35 +0100 Subject: [PATCH 048/123] 1.5.3 release notes --- CHANGELOG.md | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 803b31578c168..b8abfc19d630a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,35 +5,19 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
-## [1.5.3] - 2021-11-23 +## [1.5.3] - 2021-11-24 ### Fixed -- When a tensor is logged with `self.log`, run its computation with the same `dtype` ([#10076](https://github.com/PyTorchLightning/pytorch-lightning/pull/10076)) - - - Fixed `ShardedTensor` state dict hook registration to check if torch distributed is available ([#10621](https://github.com/PyTorchLightning/pytorch-lightning/pull/10621)) - -- When a tensor is logged with `self.log`, run its computation with the same `dtype` ([#10076](https://github.com/PyTorchLightning/pytorch-lightning/pull/10076)) - - -- Fixed LigtningLite `_wrap_init` popping unexisting keys from DataLoader signature parameters ([#10613](https://github.com/PyTorchLightning/pytorch-lightning/pull/10613)) - - +- Fixed an issue with `self.log` not respecting a tensor's `dtype` when applying computations ([#10076](https://github.com/PyTorchLightning/pytorch-lightning/pull/10076)) +- Fixed LigtningLite `_wrap_init` popping unexisting keys from DataLoader signature parameters ([#10613](https://github.com/PyTorchLightning/pytorch-lightning/pull/10613)) - Fixed signals being registered within threads ([#10610](https://github.com/PyTorchLightning/pytorch-lightning/pull/10610)) - - - Fixed an issue that caused Lightning to extract the batch size even though it was set by the user in `LightningModule.log` ([#10408](https://github.com/PyTorchLightning/pytorch-lightning/pull/10408)) - - - Fixed `Trainer(move_metrics_to_cpu=True)` not moving the evaluation logged results to CPU ([#10631](https://github.com/PyTorchLightning/pytorch-lightning/pull/10631)) - - - Fixed the `{validation,test}_step` outputs getting moved to CPU with `Trainer(move_metrics_to_cpu=True)` ([#10631](https://github.com/PyTorchLightning/pytorch-lightning/pull/10631)) - - - Fixed signals being registered within threads ([#10610](https://github.com/PyTorchLightning/pytorch-lightning/pull/10610)) - +- Fixed an issue with collecting logged test results with multiple dataloaders 
([#10522](https://github.com/PyTorchLightning/pytorch-lightning/pull/10522)) ## [1.5.2] - 2021-11-16 From 90d362a913a00beef366a197c68f56509f97aaa7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 25 Nov 2021 15:24:29 +0100 Subject: [PATCH 049/123] Merge pull request #10738 from PyTorchLightning/1.5.x-drop-torch-1.6 Sync CI configuration into 1.5.x branch --- .github/workflows/ci_dockers.yml | 10 ++--- .github/workflows/ci_pkg-install.yml | 4 +- .github/workflows/ci_schema.yml | 4 +- .github/workflows/ci_test-base.yml | 5 +-- .github/workflows/ci_test-conda.yml | 4 +- .github/workflows/ci_test-full.yml | 9 +---- .github/workflows/code-checks.yml | 5 +-- .github/workflows/docs-checks.yml | 4 +- .github/workflows/events-nightly.yml | 4 +- .github/workflows/events-recurrent.yml | 2 +- .github/workflows/probot-auto-cc.yml | 16 ++++++++ .github/workflows/release-docker.yml | 8 ++-- .github/workflows/release-pypi.yml | 2 +- CHANGELOG.md | 10 +++++ dockers/base-cuda/Dockerfile | 2 +- dockers/base-xla/Dockerfile | 2 +- dockers/release/Dockerfile | 2 +- dockers/tpu-tests/Dockerfile | 2 +- environment.yml | 5 ++- requirements.txt | 2 +- requirements/adjust_versions.py | 56 ++++++++++++++++++++------ requirements/examples.txt | 2 +- requirements/extra.txt | 4 +- 23 files changed, 107 insertions(+), 57 deletions(-) create mode 100644 .github/workflows/probot-auto-cc.yml diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml index 701223c795a3b..bd45247e15df2 100644 --- a/.github/workflows/ci_dockers.yml +++ b/.github/workflows/ci_dockers.yml @@ -1,4 +1,4 @@ -name: CI build Docker +name: Docker # https://www.docker.com/blog/first-docker-github-action-is-here # https://github.com/docker/build-push-action # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows @@ -23,9 +23,9 @@ jobs: strategy: fail-fast: false matrix: - # should be the config used in '.github/workflows/release-docker.yml', but we just 
keep one to check. - python_version: ["3.9"] - pytorch_version: ["1.9"] + # the config used in '.azure-pipelines/gpu-tests.yml' since the Dockerfile uses the cuda image + python_version: ["3.7"] + pytorch_version: ["1.8"] steps: - name: Checkout uses: actions/checkout@v2 @@ -93,7 +93,7 @@ jobs: matrix: # the config used in '.github/workflows/ci_test-conda.yml' python_version: ["3.8"] - pytorch_version: ["1.6", "1.7", "1.8", "1.9", "1.10"] + pytorch_version: ["1.7", "1.8", "1.9", "1.10"] steps: - name: Checkout uses: actions/checkout@v2 diff --git a/.github/workflows/ci_pkg-install.yml b/.github/workflows/ci_pkg-install.yml index 2cf64f39a28b7..bf7de876d157e 100644 --- a/.github/workflows/ci_pkg-install.yml +++ b/.github/workflows/ci_pkg-install.yml @@ -1,4 +1,4 @@ -name: Install pkg +name: Package # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch @@ -9,7 +9,7 @@ on: # Trigger the workflow on push or pull request, but only for the master bra jobs: - pkg-install: + install: runs-on: ${{ matrix.os }} strategy: fail-fast: false diff --git a/.github/workflows/ci_schema.yml b/.github/workflows/ci_schema.yml index 51c4400666fd0..d635285fae39a 100644 --- a/.github/workflows/ci_schema.yml +++ b/.github/workflows/ci_schema.yml @@ -1,11 +1,11 @@ -name: CI action schema +name: Schema on: # Trigger the workflow on push or pull request, but only for the master branch push: {} pull_request: branches: [master, "release/*"] jobs: - validate-schema: + check: runs-on: ubuntu-20.04 steps: - name: Checkout diff --git a/.github/workflows/ci_test-base.yml b/.github/workflows/ci_test-base.yml index e3d3ca2e4e82b..8b2f8b721a37e 100644 --- a/.github/workflows/ci_test-base.yml +++ b/.github/workflows/ci_test-base.yml @@ -1,6 +1,6 @@ # this jobs runs `pytest` over the source directory. It does not install any extra dependencies. 
# this is useful to catch errors where an import has been added which is not part of the basic dependencies. -name: CI basic testing +name: Test # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch @@ -10,8 +10,7 @@ on: # Trigger the workflow on push or pull request, but only for the master bra branches: [master, "release/*"] jobs: - doctest: - + source: runs-on: ${{ matrix.os }} strategy: fail-fast: false diff --git a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml index 9196034bf2757..a24a15baf4d78 100644 --- a/.github/workflows/ci_test-conda.yml +++ b/.github/workflows/ci_test-conda.yml @@ -1,4 +1,4 @@ -name: PyTorch & Conda +name: Test # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch @@ -15,7 +15,7 @@ jobs: fail-fast: false matrix: python-version: ["3.8"] # previous to last Python version as that one is already used in test-full - pytorch-version: ["1.6", "1.7", "1.8", "1.9", "1.10"] + pytorch-version: ["1.7", "1.8", "1.9", "1.10"] # nightly: add when there's a release candidate timeout-minutes: 35 steps: diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml index 43096316cbff6..ad600623dfba0 100644 --- a/.github/workflows/ci_test-full.yml +++ b/.github/workflows/ci_test-full.yml @@ -1,4 +1,4 @@ -name: CI complete testing +name: Test # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch @@ -10,7 +10,7 @@ on: # Trigger the workflow on push or pull request, but only for the master bra jobs: - pytest: + cpu: runs-on: ${{ matrix.os }} if: github.event.pull_request.draft == false @@ -28,11 +28,6 @@ jobs: - {os: macOS-10.15, python-version: "3.6", requires: "oldest", release: 
"stable"} # nightly: add when there's a release candidate #- {os: ubuntu-20.04, python-version: "3.10", requires: "latest", release: "pre"} - exclude: - # PyTorch 1.6 is not available with Python 3.9: https://github.com/pytorch/pytorch/issues/46205 - - {os: ubuntu-18.04, python-version: "3.9", requires: "oldest", release: "stable"} - - {os: windows-2019, python-version: "3.9", requires: "oldest", release: "stable"} - - {os: macOS-10.15, python-version: "3.9", requires: "oldest", release: "stable"} timeout-minutes: 40 diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 1cedf2c360306..e99863dc794d4 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -1,4 +1,4 @@ -name: "Check code" +name: Test on: # Trigger the workflow on push or pull request, but only for the master branch push: @@ -7,8 +7,7 @@ on: # Trigger the workflow on push or pull request, but only for the master bra branches: [master, "release/*"] jobs: - python-typing-mypy: - name: Python typing Mypy + mypy: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@master diff --git a/.github/workflows/docs-checks.yml b/.github/workflows/docs-checks.yml index 9d6b660a168f8..841f9128da8b1 100644 --- a/.github/workflows/docs-checks.yml +++ b/.github/workflows/docs-checks.yml @@ -1,4 +1,4 @@ -name: "Docs check" +name: Test # https://github.com/marketplace/actions/sphinx-build on: # Trigger the workflow on push or pull request, but only for the master branch @@ -8,7 +8,7 @@ on: # Trigger the workflow on push or pull request, but only for the master bra branches: [master, "release/*"] jobs: - test-docs: + doctest: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml index ce2072e5f45aa..7c2075ce5b440 100644 --- a/.github/workflows/events-nightly.yml +++ b/.github/workflows/events-nightly.yml @@ -1,4 +1,4 @@ -name: Nightly events +name: Nightly # 
https://jasonet.co/posts/scheduled-actions/ # https://github.community/t/distinct-job-for-each-schedule/17811/2 @@ -123,7 +123,7 @@ jobs: matrix: # the config used in '.github/workflows/ci_test-conda.yml' python_version: ["3.8"] - pytorch_version: ["1.6", "1.7", "1.8", "1.9", "1.10"] + pytorch_version: ["1.7", "1.8", "1.9", "1.10"] steps: - name: Checkout diff --git a/.github/workflows/events-recurrent.yml b/.github/workflows/events-recurrent.yml index d7f1872fde732..834adc6c169fa 100644 --- a/.github/workflows/events-recurrent.yml +++ b/.github/workflows/events-recurrent.yml @@ -1,4 +1,4 @@ -name: Recurrent events +name: Recurrent # https://jasonet.co/posts/scheduled-actions/ # https://github.community/t/distinct-job-for-each-schedule/17811/2 diff --git a/.github/workflows/probot-auto-cc.yml b/.github/workflows/probot-auto-cc.yml new file mode 100644 index 0000000000000..5c6de911cd00e --- /dev/null +++ b/.github/workflows/probot-auto-cc.yml @@ -0,0 +1,16 @@ +name: Probot + +on: + issues: + types: [labeled] + pull_request: + types: [labeled, ready_for_review] + +jobs: + auto-cc: + runs-on: ubuntu-latest + if: github.event_name == 'issue' || github.event.pull_request.draft == false + steps: + - uses: carmocca/probot@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml index 92bf62d3c1ead..169e01edd8d48 100644 --- a/.github/workflows/release-docker.yml +++ b/.github/workflows/release-docker.yml @@ -1,4 +1,4 @@ -name: Publish Docker Releases +name: Docker # https://www.docker.com/blog/first-docker-github-action-is-here # https://github.com/docker/build-push-action on: @@ -8,7 +8,7 @@ on: types: [published] jobs: - cuda-PL: + publish: runs-on: ubuntu-20.04 # only on releases if: startsWith(github.ref, 'refs/tags/') || github.event_name == 'release' @@ -16,7 +16,7 @@ jobs: fail-fast: false matrix: python_version: ["3.6", "3.7", "3.8", "3.9"] - pytorch_version: ["1.6", "1.7", "1.8", 
"1.9"] + pytorch_version: ["1.7", "1.8", "1.9", "1.10"] steps: - name: Checkout uses: actions/checkout@v2 @@ -39,7 +39,7 @@ jobs: - name: Publish Latest to Docker uses: docker/build-push-action@v1.1.0 # only on releases and latest Python and PyTorch - if: matrix.python_version == 3.9 && matrix.pytorch_version == 1.9 + if: matrix.python_version == "3.9" && matrix.pytorch_version == "1.10" with: repository: pytorchlightning/pytorch_lightning username: ${{ secrets.DOCKER_USERNAME }} diff --git a/.github/workflows/release-pypi.yml b/.github/workflows/release-pypi.yml index a91837cab3340..09afd4db893d3 100644 --- a/.github/workflows/release-pypi.yml +++ b/.github/workflows/release-pypi.yml @@ -1,4 +1,4 @@ -name: PyPI Release +name: PyPI # https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch diff --git a/CHANGELOG.md b/CHANGELOG.md index b8abfc19d630a..b04080122584f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
+## [1.5.4] - 2021-11-30 + +### Fixed + + +### Removed + +- Removed PyTorch 1.6 support ([#10367](https://github.com/PyTorchLightning/pytorch-lightning/pull/10367), [#10738](https://github.com/PyTorchLightning/pytorch-lightning/pull/10738)) + + ## [1.5.3] - 2021-11-24 ### Fixed diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index ab26af6c7accf..99e8d018f2884 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -17,7 +17,7 @@ ARG CUDA_VERSION=10.2 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu18.04 ARG PYTHON_VERSION=3.9 -ARG PYTORCH_VERSION=1.6 +ARG PYTORCH_VERSION=1.8 SHELL ["/bin/bash", "-c"] # https://techoverflow.net/2019/05/18/how-to-fix-configuring-tzdata-interactive-input-when-building-docker-images/ diff --git a/dockers/base-xla/Dockerfile b/dockers/base-xla/Dockerfile index 5c86da2147717..e293343614927 100644 --- a/dockers/base-xla/Dockerfile +++ b/dockers/base-xla/Dockerfile @@ -19,7 +19,7 @@ LABEL maintainer="PyTorchLightning " # CALL: docker image build -t pytorch-lightning:XLA-extras-py3.6 -f dockers/base-xla/Dockerfile . --build-arg PYTHON_VERSION=3.8 ARG PYTHON_VERSION=3.9 ARG CONDA_VERSION=4.9.2 -ARG XLA_VERSION=1.6 +ARG XLA_VERSION=1.8 SHELL ["/bin/bash", "-c"] # for skipping configurations diff --git a/dockers/release/Dockerfile b/dockers/release/Dockerfile index 529680059791c..f4083f2dd42fc 100644 --- a/dockers/release/Dockerfile +++ b/dockers/release/Dockerfile @@ -13,7 +13,7 @@ # limitations under the License. ARG PYTHON_VERSION=3.9 -ARG PYTORCH_VERSION=1.6 +ARG PYTORCH_VERSION=1.8 FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION} diff --git a/dockers/tpu-tests/Dockerfile b/dockers/tpu-tests/Dockerfile index 086bd349bc757..6605b9abbaadc 100644 --- a/dockers/tpu-tests/Dockerfile +++ b/dockers/tpu-tests/Dockerfile @@ -13,7 +13,7 @@ # limitations under the License. 
ARG PYTHON_VERSION=3.9 -ARG PYTORCH_VERSION=1.6 +ARG PYTORCH_VERSION=1.8 FROM pytorchlightning/pytorch_lightning:base-xla-py${PYTHON_VERSION}-torch${PYTORCH_VERSION} diff --git a/environment.yml b/environment.yml index fb21d21c97730..d7d34c387af15 100644 --- a/environment.yml +++ b/environment.yml @@ -29,7 +29,7 @@ dependencies: - python>=3.6 - pip>20.1 - numpy>=1.17.2 - - pytorch>=1.6 + - pytorch>=1.7.* - future>=0.17.1 - PyYAML>=5.1 - tqdm>=4.41.0 @@ -41,9 +41,10 @@ dependencies: - scikit-learn>=0.20.0 - matplotlib>=3.1.1 - omegaconf>=2.0.5 + - torchtext>=0.8.* # Examples - - torchvision>=0.6 + - torchvision>=0.8.* - pip: - test-tube>=0.7.5 diff --git a/requirements.txt b/requirements.txt index 69074cbfb249c..34879d9290acb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # the default package dependencies numpy>=1.17.2 -torch>=1.6 +torch>=1.7.* future>=0.17.1 # required for builtins in setup.py tqdm>=4.41.0 PyYAML>=5.1 diff --git a/requirements/adjust_versions.py b/requirements/adjust_versions.py index 3ebb3c28835b3..8295a726e7873 100644 --- a/requirements/adjust_versions.py +++ b/requirements/adjust_versions.py @@ -14,7 +14,6 @@ dict(torch="1.8.0", torchvision="0.9.0", torchtext="0.9.0"), dict(torch="1.7.1", torchvision="0.8.2", torchtext="0.8.1"), dict(torch="1.7.0", torchvision="0.8.1", torchtext="0.8.0"), - dict(torch="1.6.0", torchvision="0.7.0", torchtext="0.7"), ] @@ -33,28 +32,59 @@ def find_latest(ver: str) -> Dict[str, str]: raise ValueError(f"Missing {ver} in {VERSIONS}") -def main(path_req: str, torch_version: Optional[str] = None) -> None: +def main(req: str, torch_version: Optional[str] = None) -> str: if not torch_version: import torch torch_version = torch.__version__ assert torch_version, f"invalid torch: {torch_version}" - with open(path_req) as fp: - req = fp.read() - # remove comments - req = re.sub(rf"\s*#.*{os.linesep}", os.linesep, req) + # remove comments and strip whitespace + req = re.sub(rf"\s*#.*{os.linesep}", 
os.linesep, req).strip() latest = find_latest(torch_version) for lib, version in latest.items(): - replace = f"{lib}=={version}" if version else lib - replace += os.linesep - req = re.sub(rf"{lib}[>=]*[\d\.]*{os.linesep}", replace, req) + replace = f"{lib}=={version}" if version else "" + req = re.sub(rf"\b{lib}(?!\w).*", replace, req) - print(req) # on purpose - to debug - with open(path_req, "w") as fp: - fp.write(req) + return req + + +def test(): + requirements = """ + torch>=1.2.* + torch==1.2.3 + torch==1.4 + torch + future>=0.17.1 + pytorch==1.5.6+123dev0 + torchvision + torchmetrics>=0.4.1 + """ + expected = """ + torch==1.9.1 + torch==1.9.1 + torch==1.9.1 + torch==1.9.1 + future>=0.17.1 + pytorch==1.5.6+123dev0 + torchvision==0.10.1 + torchmetrics>=0.4.1 + """.strip() + actual = main(requirements, "1.9") + assert actual == expected, (actual, expected) if __name__ == "__main__": - main(*sys.argv[1:]) + test() # sanity check + + if len(sys.argv) == 3: + requirements_path, torch_version = sys.argv[1:] + else: + requirements_path, torch_version = sys.argv[1], None + + with open(requirements_path, "r+") as fp: + requirements = fp.read() + requirements = main(requirements, torch_version) + print(requirements) # on purpose - to debug + fp.write(requirements) diff --git a/requirements/examples.txt b/requirements/examples.txt index e38f1f92bcb83..8591f9bd509c2 100644 --- a/requirements/examples.txt +++ b/requirements/examples.txt @@ -1,3 +1,3 @@ -torchvision>=0.7 +torchvision>=0.8.* gym>=0.17.0 ipython[all] diff --git a/requirements/extra.txt b/requirements/extra.txt index e3763fcae487b..6abf3089b8506 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -2,9 +2,9 @@ matplotlib>3.1 horovod>=0.21.2 # no need to install with [pytorch] as pytorch is already installed -torchtext>=0.7 +torchtext>=0.8.* omegaconf>=2.0.5 hydra-core>=1.0.5 -jsonargparse[signatures]>=3.19.3 +jsonargparse[signatures]>=4.0.0 gcsfs>=2021.5.0 rich>=10.2.2 From 
d140a6419130b425a6674752a83d189a5ce7aef6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 24 Nov 2021 15:55:49 +0100 Subject: [PATCH 050/123] update version --- pytorch_lightning/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/__about__.py b/pytorch_lightning/__about__.py index 2cfcb2c1778f8..1fa67b89ab062 100644 --- a/pytorch_lightning/__about__.py +++ b/pytorch_lightning/__about__.py @@ -1,7 +1,7 @@ import time _this_year = time.strftime("%Y") -__version__ = "1.5.3" +__version__ = "1.5.4" __author__ = "William Falcon et al." __author_email__ = "waf2107@columbia.edu" __license__ = "Apache-2.0" From a5e882314b7781a61cd368b1029fdd29ac424802 Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Fri, 26 Nov 2021 14:45:22 +0530 Subject: [PATCH 051/123] Fix compare version for packages (#10762) --- CHANGELOG.md | 2 ++ pytorch_lightning/utilities/imports.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b04080122584f..7416041fead4a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed signals being registered within threads ([#10610](https://github.com/PyTorchLightning/pytorch-lightning/pull/10610)) - Fixed an issue with collecting logged test results with multiple dataloaders ([#10522](https://github.com/PyTorchLightning/pytorch-lightning/pull/10522)) +- Fixed `_compare_version` for python packages ([#10762](https://github.com/PyTorchLightning/pytorch-lightning/pull/10762)) + ## [1.5.2] - 2021-11-16 diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index edf5f75aee6a9..247dfcc71b7c1 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -59,7 +59,7 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: pkg_version = Version(pkg.__version__) else: # try pkg_resources to infer version - pkg_version = Version(pkg_resources.get_distribution(pkg).version) + pkg_version = Version(pkg_resources.get_distribution(package).version) except TypeError: # this is mocked by Sphinx, so it should return True to generate all summaries return True From 7f147eebe1cde30f91c0c6ef101433090ae94140 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Fri, 26 Nov 2021 17:07:57 +0000 Subject: [PATCH 052/123] Delete TensorBoardLogger experiment before spawning the processes. (#10777) --- CHANGELOG.md | 3 +++ .../plugins/training_type/ddp_spawn.py | 17 +++++++++++++++++ .../plugins/training_type/tpu_spawn.py | 9 --------- tests/loggers/test_tensorboard.py | 15 +++++++++++++++ 4 files changed, 35 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7416041fead4a..5306861b7f0af 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed `_compare_version` for python packages ([#10762](https://github.com/PyTorchLightning/pytorch-lightning/pull/10762)) +- Fixed TensorBoardLogger `SummaryWriter` not close before spawning the processes ([#10777](https://github.com/PyTorchLightning/pytorch-lightning/pull/10777)) + + ## [1.5.2] - 2021-11-16 ### Fixed diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index ff5159f739cdc..0b503db64e0a4 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -25,6 +25,7 @@ from torch.nn.parallel.distributed import DistributedDataParallel import pytorch_lightning as pl +from pytorch_lightning.loggers import LoggerCollection, TensorBoardLogger from pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.overrides.distributed import prepare_for_backward from pytorch_lightning.overrides.torch_distributed import broadcast_object_list @@ -170,14 +171,17 @@ def get_mp_spawn_kwargs(self, trainer: Optional["pl.Trainer"] = None) -> Dict[st return {"nprocs": self.num_processes} def start_training(self, trainer: "pl.Trainer") -> None: + self._clean_logger(trainer) self.spawn(self.new_process, trainer, self.mp_queue, return_result=False) # reset optimizers, since main process is never used for training and thus does not have a valid optim state trainer.optimizers = [] def start_evaluating(self, trainer: "pl.Trainer") -> None: + self._clean_logger(trainer) self.spawn(self.new_process, trainer, self.mp_queue, return_result=False) def start_predicting(self, trainer: "pl.Trainer") -> None: + self._clean_logger(trainer) self.spawn(self.new_process, trainer, self.mp_queue, return_result=False) def spawn(self, function: Callable, *args: Any, return_result: bool = True, **kwargs: Any) -> Optional[Any]: @@ -440,3 +444,16 @@ def teardown(self) -> None: self.lightning_module.cpu() # clean up memory 
torch.cuda.empty_cache() + + @staticmethod + def _clean_logger(trainer: "pl.Trainer") -> None: + loggers = trainer.logger._logger_iterable if isinstance(trainer.logger, LoggerCollection) else [trainer.logger] + for logger in loggers: + if isinstance(logger, TensorBoardLogger) and logger._experiment is not None: + rank_zero_warn( + "When using `ddp_spawn`, the `TensorBoardLogger` experiment should be `None`. Setting it to `None`." + ) + # the experiment class of `TensorBoard` holds a multiprocessing queue which can make ours hang. + # we want to make sure these are closed before we spawn our own threads. + # assuming nothing else references the experiment object, python should instantly `__del__` it. + logger._experiment = None diff --git a/pytorch_lightning/plugins/training_type/tpu_spawn.py b/pytorch_lightning/plugins/training_type/tpu_spawn.py index 2509122bd99e2..4fa0cfda6a859 100644 --- a/pytorch_lightning/plugins/training_type/tpu_spawn.py +++ b/pytorch_lightning/plugins/training_type/tpu_spawn.py @@ -254,10 +254,6 @@ def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ return output - def _close_logger(self, trainer) -> None: - if trainer.logger is not None: - trainer.logger.finalize("success") - def get_mp_spawn_kwargs(self, trainer: Optional["pl.Trainer"] = None) -> Dict[str, Any]: return { "nprocs": len(self.parallel_devices), @@ -293,13 +289,8 @@ def start_training(self, trainer: "pl.Trainer") -> None: # todo: precision pluging is call in accelerator setup and should be moved if "XLA_USE_BF16" in os.environ: del os.environ["XLA_USE_BF16"] - self._close_logger(trainer) return super().start_training(trainer) - def start_evaluating(self, trainer: "pl.Trainer") -> None: - self._close_logger(trainer) - return super().start_evaluating(trainer) - def training_step(self, *args, **kwargs): return self.model(*args, **kwargs) diff --git a/tests/loggers/test_tensorboard.py b/tests/loggers/test_tensorboard.py index 
02a809aa2ab30..0a99c058ef941 100644 --- a/tests/loggers/test_tensorboard.py +++ b/tests/loggers/test_tensorboard.py @@ -25,6 +25,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.loggers import TensorBoardLogger +from pytorch_lightning.loggers.base import LoggerCollection from pytorch_lightning.utilities.imports import _compare_version from tests.helpers import BoringModel @@ -332,3 +333,17 @@ def test_tensorboard_missing_folder_warning(tmpdir, caplog): assert logger.version == 0 assert "Missing logger folder:" in caplog.text + + +@pytest.mark.parametrize("use_list", [False, True]) +def test_tensorboard_ddp_spawn_cleanup(use_list, tmpdir): + tensorboard_logger = TensorBoardLogger(save_dir=tmpdir) + assert tensorboard_logger._experiment is None + tensorboard_logger.experiment # this property access will create the experiment + assert tensorboard_logger._experiment is not None + logger = [tensorboard_logger] if use_list else tensorboard_logger + trainer = Trainer(strategy="ddp_spawn", devices=2, accelerator="auto", logger=logger) + trainer.training_type_plugin._clean_logger(trainer) + if use_list: + assert isinstance(trainer.logger, LoggerCollection) + assert tensorboard_logger._experiment is None From f85d4c74138d72cae63e72bd80558ce3a47407e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sat, 27 Nov 2021 06:09:51 +0100 Subject: [PATCH 053/123] Consolidate state when retrieving sharded state dict in Lite (#10746) Co-authored-by: thomas chaton --- CHANGELOG.md | 3 +++ pytorch_lightning/lite/wrappers.py | 5 ++++- tests/lite/test_wrappers.py | 9 +++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5306861b7f0af..190cd1a070845 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed signals being registered within threads ([#10610](https://github.com/PyTorchLightning/pytorch-lightning/pull/10610)) - Fixed an issue with collecting logged test results with multiple dataloaders ([#10522](https://github.com/PyTorchLightning/pytorch-lightning/pull/10522)) +- Fixed a consolidation error in Lite when attempting to save the state dict of a sharded optimizer ([#10746](https://github.com/PyTorchLightning/pytorch-lightning/pull/10746)) + + - Fixed `_compare_version` for python packages ([#10762](https://github.com/PyTorchLightning/pytorch-lightning/pull/10762)) diff --git a/pytorch_lightning/lite/wrappers.py b/pytorch_lightning/lite/wrappers.py index 6b8e44b610352..c13800cb842d6 100644 --- a/pytorch_lightning/lite/wrappers.py +++ b/pytorch_lightning/lite/wrappers.py @@ -46,7 +46,7 @@ def __init__(self, optimizer: Optimizer, accelerator: Accelerator) -> None: """ # `__del__` is skipped in case the optimizer has implemented custom destructor logic which we would # not want to call on destruction of the `_LiteOptimizer - self.__dict__ = {k: v for k, v in optimizer.__dict__.items() if k not in ("step", "__del__")} + self.__dict__ = {k: v for k, v in optimizer.__dict__.items() if k not in ("state_dict", "step", "__del__")} self.__class__ = type("Lite" + optimizer.__class__.__name__, (self.__class__, optimizer.__class__), {}) self._optimizer = optimizer self._accelerator = accelerator @@ -55,6 +55,9 @@ def __init__(self, optimizer: Optimizer, accelerator: Accelerator) -> None: def optimizer(self) -> Optimizer: return self._optimizer + def state_dict(self) -> Dict[str, Tensor]: + return self._accelerator.optimizer_state(self.optimizer) + def step(self, closure: Optional[Callable] = None) -> None: closure = closure or _do_nothing_closure self._accelerator.optimizer_step( diff --git a/tests/lite/test_wrappers.py b/tests/lite/test_wrappers.py index c271d3b3163ed..a732390e1d00a 100644 --- a/tests/lite/test_wrappers.py +++ b/tests/lite/test_wrappers.py @@ 
-142,6 +142,15 @@ def test_lite_optimizer_wraps(): assert isinstance(lite_optimizer, optimizer_cls) +def test_lite_optimizer_state_dict(): + """Test that the LiteOptimizer calls into the accelerator/strategy to collect the state.""" + optimizer = Mock() + accelerator = Mock() + lite_optimizer = _LiteOptimizer(optimizer=optimizer, accelerator=accelerator) + lite_optimizer.state_dict() + accelerator.optimizer_state.assert_called_with(optimizer) + + def test_lite_optimizer_steps(): """Test that the LiteOptimizer forwards the step() and zero_grad() calls to the wrapped optimizer.""" optimizer = Mock() From 03b19fa607ec31eb7d25f23d63a1f99487d2dd9d Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Tue, 30 Nov 2021 01:21:17 +0530 Subject: [PATCH 054/123] Fix default logging levels for train step specific hooks (#10756) --- CHANGELOG.md | 21 +- pytorch_lightning/core/lightning.py | 21 +- .../logger_connector/fx_validator.py | 184 +++++++++++++----- .../trainer/logging_/test_logger_connector.py | 8 +- tests/trainer/logging_/test_loop_logging.py | 108 ++++++++++ .../logging_/test_train_loop_logging.py | 4 +- 6 files changed, 269 insertions(+), 77 deletions(-) create mode 100644 tests/trainer/logging_/test_loop_logging.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 190cd1a070845..15c866fcbeddd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,19 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Fixed +- Fixed `_compare_version` for python packages ([#10762](https://github.com/PyTorchLightning/pytorch-lightning/pull/10762)) + + +- Fixed TensorBoardLogger `SummaryWriter` not close before spawning the processes ([#10777](https://github.com/PyTorchLightning/pytorch-lightning/pull/10777)) + + +- Fixed a consolidation error in Lite when attempting to save the state dict of a sharded optimizer ([#10746](https://github.com/PyTorchLightning/pytorch-lightning/pull/10746)) + + +- Fixed the default logging level for batch hooks associated with training from `on_step=False, on_epoch=True` to `on_step=True, on_epoch=False` ([#10756](https://github.com/PyTorchLightning/pytorch-lightning/pull/10756)) + + + ### Removed @@ -29,14 +42,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed signals being registered within threads ([#10610](https://github.com/PyTorchLightning/pytorch-lightning/pull/10610)) - Fixed an issue with collecting logged test results with multiple dataloaders ([#10522](https://github.com/PyTorchLightning/pytorch-lightning/pull/10522)) -- Fixed a consolidation error in Lite when attempting to save the state dict of a sharded optimizer ([#10746](https://github.com/PyTorchLightning/pytorch-lightning/pull/10746)) - - -- Fixed `_compare_version` for python packages ([#10762](https://github.com/PyTorchLightning/pytorch-lightning/pull/10762)) - - -- Fixed TensorBoardLogger `SummaryWriter` not close before spawning the processes ([#10777](https://github.com/PyTorchLightning/pytorch-lightning/pull/10777)) - ## [1.5.2] - 2021-11-16 diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 4d327684978b6..21d442d0d60ba 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -410,10 +410,6 @@ def log( value, object, self.__check_allowed, name, value, wrong_dtype=(numbers.Number, Metric, Tensor, dict) ) - # set the default depending on the fx_name - 
on_step = self.__auto_choose_log_on_step(on_step) - on_epoch = self.__auto_choose_log_on_epoch(on_epoch) - if self.trainer is None: # not an error to support testing the `*_step` methods without a `Trainer` reference rank_zero_warn( @@ -432,7 +428,10 @@ def log( raise MisconfigurationException( "You are trying to `self.log()` but it is not managed by the `Trainer` control flow" ) - _FxValidator.check_logging(self._current_fx_name, on_step=on_step, on_epoch=on_epoch) + + on_step, on_epoch = _FxValidator.check_logging_and_get_default_levels( + self._current_fx_name, on_step=on_step, on_epoch=on_epoch + ) # make sure user doesn't introduce logic for multi-dataloaders if "/dataloader_idx_" in name: @@ -593,18 +592,6 @@ def log_grad_norm(self, grad_norm_dict): """ self.log_dict(grad_norm_dict, on_step=True, on_epoch=True, prog_bar=True, logger=True) - def __auto_choose_log_on_step(self, on_step: Optional[bool]) -> bool: - if on_step is None: - on_step = False - on_step |= self._current_fx_name in ("training_step", "training_step_end") - return on_step - - def __auto_choose_log_on_epoch(self, on_epoch: Optional[bool]) -> bool: - if on_epoch is None: - on_epoch = True - on_epoch &= self._current_fx_name not in ("training_step", "training_step_end") - return on_epoch - def all_gather( self, data: Union[torch.Tensor, Dict, List, Tuple], group: Optional[Any] = None, sync_grads: bool = False ): diff --git a/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py b/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py index cc91476518565..ad3dce3c12964 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Tuple, Union +from typing import Optional, Tuple, Union from typing_extensions import TypedDict @@ -20,50 +20,98 @@ class _FxValidator: class _LogOptions(TypedDict): - on_step: Union[Tuple[bool], Tuple[bool, bool]] - on_epoch: Union[Tuple[bool], Tuple[bool, bool]] + allowed_on_step: Union[Tuple[bool], Tuple[bool, bool]] + allowed_on_epoch: Union[Tuple[bool], Tuple[bool, bool]] + default_on_step: bool + default_on_epoch: bool functions = { "on_before_accelerator_backend_setup": None, "on_configure_sharded_model": None, - "on_before_backward": _LogOptions(on_step=(False, True), on_epoch=(False, True)), - "on_after_backward": _LogOptions(on_step=(False, True), on_epoch=(False, True)), - "on_before_optimizer_step": _LogOptions(on_step=(False, True), on_epoch=(False, True)), - "on_before_zero_grad": _LogOptions(on_step=(False, True), on_epoch=(False, True)), + "on_before_backward": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False + ), + "on_after_backward": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False + ), + "on_before_optimizer_step": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False + ), + "on_before_zero_grad": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False + ), "on_init_start": None, "on_init_end": None, "on_fit_start": None, "on_fit_end": None, "on_sanity_check_start": None, "on_sanity_check_end": None, - "on_train_start": _LogOptions(on_step=(False,), on_epoch=(True,)), + "on_train_start": _LogOptions( + allowed_on_step=(False,), allowed_on_epoch=(True,), default_on_step=False, default_on_epoch=True + ), "on_train_end": None, - "on_validation_start": _LogOptions(on_step=(False,), on_epoch=(True,)), + "on_validation_start": _LogOptions( + 
allowed_on_step=(False,), allowed_on_epoch=(True,), default_on_step=False, default_on_epoch=True + ), "on_validation_end": None, - "on_test_start": _LogOptions(on_step=(False,), on_epoch=(True,)), + "on_test_start": _LogOptions( + allowed_on_step=(False,), allowed_on_epoch=(True,), default_on_step=False, default_on_epoch=True + ), "on_test_end": None, "on_predict_start": None, "on_predict_end": None, "on_pretrain_routine_start": None, "on_pretrain_routine_end": None, - "on_train_epoch_start": _LogOptions(on_step=(False,), on_epoch=(True,)), - "on_train_epoch_end": _LogOptions(on_step=(False,), on_epoch=(True,)), - "on_validation_epoch_start": _LogOptions(on_step=(False,), on_epoch=(True,)), - "on_validation_epoch_end": _LogOptions(on_step=(False,), on_epoch=(True,)), - "on_test_epoch_start": _LogOptions(on_step=(False,), on_epoch=(True,)), - "on_test_epoch_end": _LogOptions(on_step=(False,), on_epoch=(True,)), + "on_train_epoch_start": _LogOptions( + allowed_on_step=(False,), allowed_on_epoch=(True,), default_on_step=False, default_on_epoch=True + ), + "on_train_epoch_end": _LogOptions( + allowed_on_step=(False,), allowed_on_epoch=(True,), default_on_step=False, default_on_epoch=True + ), + "on_validation_epoch_start": _LogOptions( + allowed_on_step=(False,), allowed_on_epoch=(True,), default_on_step=False, default_on_epoch=True + ), + "on_validation_epoch_end": _LogOptions( + allowed_on_step=(False,), allowed_on_epoch=(True,), default_on_step=False, default_on_epoch=True + ), + "on_test_epoch_start": _LogOptions( + allowed_on_step=(False,), allowed_on_epoch=(True,), default_on_step=False, default_on_epoch=True + ), + "on_test_epoch_end": _LogOptions( + allowed_on_step=(False,), allowed_on_epoch=(True,), default_on_step=False, default_on_epoch=True + ), "on_predict_epoch_start": None, "on_predict_epoch_end": None, - "on_epoch_start": _LogOptions(on_step=(False,), on_epoch=(True,)), - "on_epoch_end": _LogOptions(on_step=(False,), on_epoch=(True,)), - 
"on_batch_start": _LogOptions(on_step=(False, True), on_epoch=(False, True)), - "on_batch_end": _LogOptions(on_step=(False, True), on_epoch=(False, True)), - "on_train_batch_start": _LogOptions(on_step=(False, True), on_epoch=(False, True)), - "on_train_batch_end": _LogOptions(on_step=(False, True), on_epoch=(False, True)), - "on_validation_batch_start": _LogOptions(on_step=(False, True), on_epoch=(False, True)), - "on_validation_batch_end": _LogOptions(on_step=(False, True), on_epoch=(False, True)), - "on_test_batch_start": _LogOptions(on_step=(False, True), on_epoch=(False, True)), - "on_test_batch_end": _LogOptions(on_step=(False, True), on_epoch=(False, True)), + "on_epoch_start": _LogOptions( + allowed_on_step=(False,), allowed_on_epoch=(True,), default_on_step=False, default_on_epoch=True + ), + "on_epoch_end": _LogOptions( + allowed_on_step=(False,), allowed_on_epoch=(True,), default_on_step=False, default_on_epoch=True + ), + "on_batch_start": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False + ), + "on_batch_end": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False + ), + "on_train_batch_start": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False + ), + "on_train_batch_end": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False + ), + "on_validation_batch_start": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=False, default_on_epoch=True + ), + "on_validation_batch_end": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=False, default_on_epoch=True + ), + "on_test_batch_start": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=False, 
default_on_epoch=True + ), + "on_test_batch_end": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=False, default_on_epoch=True + ), "on_predict_batch_start": None, "on_predict_batch_end": None, "on_keyboard_interrupt": None, @@ -73,16 +121,34 @@ class _LogOptions(TypedDict): "setup": None, "teardown": None, "configure_sharded_model": None, - "training_step": _LogOptions(on_step=(False, True), on_epoch=(False, True)), - "validation_step": _LogOptions(on_step=(False, True), on_epoch=(False, True)), - "test_step": _LogOptions(on_step=(False, True), on_epoch=(False, True)), + "training_step": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False + ), + "validation_step": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=False, default_on_epoch=True + ), + "test_step": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=False, default_on_epoch=True + ), "predict_step": None, - "training_step_end": _LogOptions(on_step=(False, True), on_epoch=(False, True)), - "validation_step_end": _LogOptions(on_step=(False, True), on_epoch=(False, True)), - "test_step_end": _LogOptions(on_step=(False, True), on_epoch=(False, True)), - "training_epoch_end": _LogOptions(on_step=(False,), on_epoch=(True,)), - "validation_epoch_end": _LogOptions(on_step=(False,), on_epoch=(True,)), - "test_epoch_end": _LogOptions(on_step=(False,), on_epoch=(True,)), + "training_step_end": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False + ), + "validation_step_end": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=False, default_on_epoch=True + ), + "test_step_end": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=False, default_on_epoch=True + ), 
+ "training_epoch_end": _LogOptions( + allowed_on_step=(False,), allowed_on_epoch=(True,), default_on_step=False, default_on_epoch=True + ), + "validation_epoch_end": _LogOptions( + allowed_on_step=(False,), allowed_on_epoch=(True,), default_on_step=False, default_on_epoch=True + ), + "test_epoch_end": _LogOptions( + allowed_on_step=(False,), allowed_on_epoch=(True,), default_on_step=False, default_on_epoch=True + ), "configure_optimizers": None, "on_train_dataloader": None, "train_dataloader": None, @@ -97,22 +163,48 @@ class _LogOptions(TypedDict): } @classmethod - def check_logging(cls, fx_name: str, on_step: bool, on_epoch: bool) -> None: - """Check if the given function name is allowed to log.""" + def check_logging(cls, fx_name: str) -> None: + """Check if the given hook is allowed to log.""" if fx_name not in cls.functions: raise RuntimeError( f"Logging inside `{fx_name}` is not implemented." - " Please, open an issue in `https://github.com/PyTorchLightning/pytorch-lightning/issues`" + " Please, open an issue in `https://github.com/PyTorchLightning/pytorch-lightning/issues`." 
) - allowed = cls.functions[fx_name] - if allowed is None: - raise MisconfigurationException(f"You can't `self.log()` inside `{fx_name}`") - m = "You can't `self.log({}={})` inside `{}`, must be one of {}" - if on_step not in allowed["on_step"]: - msg = m.format("on_step", on_step, fx_name, allowed["on_step"]) + if cls.functions[fx_name] is None: + raise MisconfigurationException(f"You can't `self.log()` inside `{fx_name}`.") + + @classmethod + def get_default_logging_levels( + cls, fx_name: str, on_step: Optional[bool], on_epoch: Optional[bool] + ) -> Tuple[bool, bool]: + """Return default logging levels for given hook.""" + fx_config = cls.functions[fx_name] + assert fx_config is not None + on_step = fx_config["default_on_step"] if on_step is None else on_step + on_epoch = fx_config["default_on_epoch"] if on_epoch is None else on_epoch + return on_step, on_epoch + + @classmethod + def check_logging_levels(cls, fx_name: str, on_step: bool, on_epoch: bool) -> None: + """Check if the logging levels are allowed in the given hook.""" + fx_config = cls.functions[fx_name] + assert fx_config is not None + m = "You can't `self.log({}={})` inside `{}`, must be one of {}." 
+ if on_step not in fx_config["allowed_on_step"]: + msg = m.format("on_step", on_step, fx_name, fx_config["allowed_on_step"]) raise MisconfigurationException(msg) - if on_epoch not in allowed["on_epoch"]: - msg = m.format("on_epoch", on_epoch, fx_name, allowed["on_epoch"]) + if on_epoch not in fx_config["allowed_on_epoch"]: + msg = m.format("on_epoch", on_epoch, fx_name, fx_config["allowed_on_epoch"]) raise MisconfigurationException(msg) + + @classmethod + def check_logging_and_get_default_levels( + cls, fx_name: str, on_step: Optional[bool], on_epoch: Optional[bool] + ) -> Tuple[bool, bool]: + """Check if the given hook name is allowed to log and return logging levels.""" + cls.check_logging(fx_name) + on_step, on_epoch = cls.get_default_logging_levels(fx_name, on_step, on_epoch) + cls.check_logging_levels(fx_name, on_step, on_epoch) + return on_step, on_epoch diff --git a/tests/trainer/logging_/test_logger_connector.py b/tests/trainer/logging_/test_logger_connector.py index c6afc1ef60503..7dae25df68d2b 100644 --- a/tests/trainer/logging_/test_logger_connector.py +++ b/tests/trainer/logging_/test_logger_connector.py @@ -141,17 +141,17 @@ def test_fx_validator(tmpdir): and func_name not in ["on_train_end", "on_test_end", "on_validation_end"] ) if allowed: - validator.check_logging(fx_name=func_name, on_step=on_step, on_epoch=on_epoch) + validator.check_logging_levels(fx_name=func_name, on_step=on_step, on_epoch=on_epoch) if not is_start and is_stage: with pytest.raises(MisconfigurationException, match="must be one of"): - validator.check_logging(fx_name=func_name, on_step=True, on_epoch=on_epoch) + validator.check_logging_levels(fx_name=func_name, on_step=True, on_epoch=on_epoch) else: assert func_name in not_supported with pytest.raises(MisconfigurationException, match="You can't"): - validator.check_logging(fx_name=func_name, on_step=on_step, on_epoch=on_epoch) + validator.check_logging(fx_name=func_name) with pytest.raises(RuntimeError, match="Logging inside 
`foo` is not implemented"): - validator.check_logging("foo", False, False) + validator.check_logging("foo") class HookedCallback(Callback): diff --git a/tests/trainer/logging_/test_loop_logging.py b/tests/trainer/logging_/test_loop_logging.py new file mode 100644 index 0000000000000..2c2f2253c42a3 --- /dev/null +++ b/tests/trainer/logging_/test_loop_logging.py @@ -0,0 +1,108 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Test logging in the training loop.""" +import inspect +from unittest import mock +from unittest.mock import ANY + +import torch + +from pytorch_lightning import Trainer +from pytorch_lightning.trainer.connectors.logger_connector.fx_validator import _FxValidator +from pytorch_lightning.trainer.connectors.logger_connector.result import ResultCollection +from pytorch_lightning.trainer.states import RunningStage, TrainerFn +from tests.helpers.boring_model import BoringModel + + +def test_default_level_for_hooks_that_support_logging(): + def _make_assertion(model, hooks, result_mock, on_step, on_epoch, extra_kwargs): + for hook in hooks: + model._current_fx_name = hook + model.log(hook, 1) + result_mock.assert_called_with( + hook, hook, torch.tensor(1), on_step=on_step, on_epoch=on_epoch, **extra_kwargs + ) + + trainer = Trainer() + model = BoringModel() + model.trainer = trainer + extra_kwargs = { + k: ANY + for k in inspect.signature(ResultCollection.log).parameters + if k not in ["self", "fx", "name", 
"value", "on_step", "on_epoch"] + } + all_logging_hooks = {k for k in _FxValidator.functions if _FxValidator.functions[k]} + + with mock.patch( + "pytorch_lightning.trainer.connectors.logger_connector.result.ResultCollection.log", return_value=None + ) as result_mock: + trainer.state.stage = RunningStage.TRAINING + hooks = [ + "on_before_backward", + "on_after_backward", + "on_before_optimizer_step", + "on_before_zero_grad", + "training_step", + "training_step_end", + "on_batch_start", + "on_batch_end", + "on_train_batch_start", + "on_train_batch_end", + ] + all_logging_hooks = all_logging_hooks - set(hooks) + _make_assertion(model, hooks, result_mock, on_step=True, on_epoch=False, extra_kwargs=extra_kwargs) + + hooks = [ + "on_train_start", + "on_train_epoch_start", + "on_train_epoch_end", + "on_epoch_start", + "on_epoch_end", + "training_epoch_end", + ] + all_logging_hooks = all_logging_hooks - set(hooks) + _make_assertion(model, hooks, result_mock, on_step=False, on_epoch=True, extra_kwargs=extra_kwargs) + + trainer.state.stage = RunningStage.VALIDATING + trainer.state.fn = TrainerFn.VALIDATING + hooks = [ + "on_validation_start", + "on_validation_epoch_start", + "on_validation_epoch_end", + "on_validation_batch_start", + "on_validation_batch_end", + "validation_step", + "validation_step_end", + "validation_epoch_end", + ] + all_logging_hooks = all_logging_hooks - set(hooks) + _make_assertion(model, hooks, result_mock, on_step=False, on_epoch=True, extra_kwargs=extra_kwargs) + + trainer.state.stage = RunningStage.TESTING + trainer.state.fn = TrainerFn.TESTING + hooks = [ + "on_test_start", + "on_test_epoch_start", + "on_test_epoch_end", + "on_test_batch_start", + "on_test_batch_end", + "test_step", + "test_step_end", + "test_epoch_end", + ] + all_logging_hooks = all_logging_hooks - set(hooks) + _make_assertion(model, hooks, result_mock, on_step=False, on_epoch=True, extra_kwargs=extra_kwargs) + + # just to ensure we checked all possible logging hooks here + 
assert len(all_logging_hooks) == 0 diff --git a/tests/trainer/logging_/test_train_loop_logging.py b/tests/trainer/logging_/test_train_loop_logging.py index 0ec61358d9408..2950404e03f75 100644 --- a/tests/trainer/logging_/test_train_loop_logging.py +++ b/tests/trainer/logging_/test_train_loop_logging.py @@ -716,10 +716,10 @@ def on_validation_epoch_end(self): assert all(v == 3 for v in self.trainer.callback_metrics.values()) def on_train_batch_start(self, batch, batch_idx): - self.log("on_train_batch_start", 1.0, reduce_fx="sum") + self.log("on_train_batch_start", 1.0, on_step=False, on_epoch=True, reduce_fx="sum") def on_train_batch_end(self, outputs, batch, batch_idx): - self.log("on_train_batch_end", 1.0, reduce_fx="sum") + self.log("on_train_batch_end", 1.0, on_step=False, on_epoch=True, reduce_fx="sum") def on_validation_batch_start(self, batch, batch_idx, dataloader_idx): self.log("on_validation_batch_start", 1.0, reduce_fx="sum") From bf9aef737f10b5640d3d3321c608037897cd1bed Mon Sep 17 00:00:00 2001 From: Ethan Harris Date: Tue, 30 Nov 2021 11:54:43 +0000 Subject: [PATCH 055/123] Update notebooks submodule (#10827) --- _notebooks | 2 +- docs/source/conf.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/_notebooks b/_notebooks index a2fb6468112b7..0c325829101d5 160000 --- a/_notebooks +++ b/_notebooks @@ -1 +1 @@ -Subproject commit a2fb6468112b7e1dad501c3b6a17533a4adfeabc +Subproject commit 0c325829101d5a6ebf32ed99bbf5b09badf04a59 diff --git a/docs/source/conf.py b/docs/source/conf.py index 845b3b946972a..8aaa06ccef8ec 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -46,7 +46,7 @@ # -- Project documents ------------------------------------------------------- if _SHOULD_COPY_NOTEBOOKS: - HelperCLI.copy_notebooks(PATH_RAW_NB, PATH_HERE, "notebooks") + HelperCLI.copy_notebooks(PATH_RAW_NB, PATH_HERE, "notebooks", patterns=[".", "course_UvA-DL", "lightning_examples"]) def _transform_changelog(path_in: str, path_out: str) -> 
None: From 4e08de243bfa35b9e5a5654a77394135c8eaa5b5 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Tue, 30 Nov 2021 17:51:59 +0530 Subject: [PATCH 056/123] [CLI] Add support for --key.help=class --- CHANGELOG.md | 3 +++ pytorch_lightning/utilities/cli.py | 34 ++++++++++++++++++++++++++---- requirements/extra.txt | 2 +- tests/utilities/test_cli.py | 24 ++++++++++++++++++--- 4 files changed, 55 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 15c866fcbeddd..0e45a8676d4dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed +- Fixed support for `--key.help=class` with the `LightningCLI` ([#10767](https://github.com/PyTorchLightning/pytorch-lightning/pull/10767)) + + - Fixed `_compare_version` for python packages ([#10762](https://github.com/PyTorchLightning/pytorch-lightning/pull/10762)) diff --git a/pytorch_lightning/utilities/cli.py b/pytorch_lightning/utilities/cli.py index 9d8cca7db1c69..7a2d4ba994d7f 100644 --- a/pytorch_lightning/utilities/cli.py +++ b/pytorch_lightning/utilities/cli.py @@ -265,9 +265,27 @@ def _convert_argv_issue_84(classes: Tuple[Type, ...], nested_key: str, argv: Lis else: clean_argv.append(arg) i += 1 + + # the user requested a help message + help_key = argv_key + ".help" + if help_key in passed_args: + argv_class = passed_args[help_key] + if "." 
in argv_class: + # user passed the class path directly + class_path = argv_class + else: + # convert shorthand format to the classpath + for cls in classes: + if cls.__name__ == argv_class: + class_path = _class_path_from_class(cls) + break + else: + raise ValueError(f"Could not generate get the class_path for {repr(argv_class)}") + return clean_argv + [help_key, class_path] + # generate the associated config file - argv_class = passed_args.pop(argv_key, None) - if argv_class is None: + argv_class = passed_args.pop(argv_key, "") + if not argv_class: # the user passed a config as a str class_path = passed_args[f"{argv_key}.class_path"] init_args_key = f"{argv_key}.init_args" @@ -772,8 +790,16 @@ def _prepare_subcommand_kwargs(self, subcommand: str) -> Dict[str, Any]: return fn_kwargs -def _global_add_class_path(class_type: Type, init_args: Dict[str, Any] = None) -> Dict[str, Any]: - return {"class_path": class_type.__module__ + "." + class_type.__name__, "init_args": init_args or {}} +def _class_path_from_class(class_type: Type) -> str: + return class_type.__module__ + "." 
+ class_type.__name__ + + +def _global_add_class_path( + class_type: Type, init_args: Optional[Union[Namespace, Dict[str, Any]]] = None +) -> Dict[str, Any]: + if isinstance(init_args, Namespace): + init_args = init_args.as_dict() + return {"class_path": _class_path_from_class(class_type), "init_args": init_args or {}} def _add_class_path_generator(class_type: Type) -> Callable[[Dict[str, Any]], Dict[str, Any]]: diff --git a/requirements/extra.txt b/requirements/extra.txt index 6abf3089b8506..babaffca6280d 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -5,6 +5,6 @@ horovod>=0.21.2 # no need to install with [pytorch] as pytorch is already insta torchtext>=0.8.* omegaconf>=2.0.5 hydra-core>=1.0.5 -jsonargparse[signatures]>=4.0.0 +jsonargparse[signatures]>=4.0.4 gcsfs>=2021.5.0 rich>=10.2.2 diff --git a/tests/utilities/test_cli.py b/tests/utilities/test_cli.py index 7a86150454777..58903a799cc61 100644 --- a/tests/utilities/test_cli.py +++ b/tests/utilities/test_cli.py @@ -57,7 +57,7 @@ @mock.patch("argparse.ArgumentParser.parse_args") -def test_default_args(mock_argparse, tmpdir): +def test_default_args(mock_argparse): """Tests default argument parser for Trainer.""" mock_argparse.return_value = Namespace(**Trainer.default_attributes()) @@ -868,7 +868,7 @@ class CustomCallback(Callback): pass -def test_registries(tmpdir): +def test_registries(): assert "SGD" in OPTIMIZER_REGISTRY.names assert "RMSprop" in OPTIMIZER_REGISTRY.names assert "CustomAdam" in OPTIMIZER_REGISTRY.names @@ -1358,9 +1358,27 @@ class TestCallback(Callback): assert cli.config_init["trainer"]["max_epochs"] is None -def test_cli_configure_optimizers_warning(tmpdir): +def test_cli_configure_optimizers_warning(): match = "configure_optimizers` will be overridden by `LightningCLI" with mock.patch("sys.argv", ["any.py"]), no_warning_call(UserWarning, match=match): LightningCLI(BoringModel, run=False) with mock.patch("sys.argv", ["any.py", "--optimizer=Adam"]), 
pytest.warns(UserWarning, match=match): LightningCLI(BoringModel, run=False) + + +def test_cli_help_message(): + # full class path + cli_args = ["any.py", "--optimizer.help=torch.optim.Adam"] + classpath_help = StringIO() + with mock.patch("sys.argv", cli_args), redirect_stdout(classpath_help), pytest.raises(SystemExit): + LightningCLI(BoringModel, run=False) + + cli_args = ["any.py", "--optimizer.help=Adam"] + shorthand_help = StringIO() + with mock.patch("sys.argv", cli_args), redirect_stdout(shorthand_help), pytest.raises(SystemExit): + LightningCLI(BoringModel, run=False) + + # the help messages should match + assert shorthand_help.getvalue() == classpath_help.getvalue() + # make sure it's not empty + assert "Implements Adam" in shorthand_help.getvalue() From 288d0186e3d0e8e48bc3d795ce09c980212aab69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Sun, 28 Nov 2021 18:58:03 +0100 Subject: [PATCH 057/123] Tune Conda CI timeout and other minor improvements (#10769) --- .github/workflows/ci_test-base.yml | 1 - .github/workflows/ci_test-conda.yml | 8 ++++---- requirements/adjust_versions.py | 7 ++++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci_test-base.yml b/.github/workflows/ci_test-base.yml index 8b2f8b721a37e..c2f1d370e2d1a 100644 --- a/.github/workflows/ci_test-base.yml +++ b/.github/workflows/ci_test-base.yml @@ -59,7 +59,6 @@ jobs: - name: Test Package [only] run: | - # NOTE: run coverage on tests does not propagate failure status for Win, https://github.com/nedbat/coveragepy/issues/1003 coverage run --source pytorch_lightning -m pytest pytorch_lightning -v --junitxml=junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml - name: Upload pytest test results diff --git a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml index a24a15baf4d78..fa366e645f1d9 100644 --- a/.github/workflows/ci_test-conda.yml +++ 
b/.github/workflows/ci_test-conda.yml @@ -17,7 +17,7 @@ jobs: python-version: ["3.8"] # previous to last Python version as that one is already used in test-full pytorch-version: ["1.7", "1.8", "1.9", "1.10"] # nightly: add when there's a release candidate - timeout-minutes: 35 + timeout-minutes: 30 steps: - uses: actions/checkout@v2 @@ -29,7 +29,8 @@ jobs: python ./requirements/adjust_versions.py requirements/extra.txt python ./requirements/adjust_versions.py requirements/examples.txt pip install --requirement requirements/devel.txt --find-links https://download.pytorch.org/whl/nightly/torch_nightly.html - pip install pytest-random-order + # set a per-test timeout of 2.5 minutes to fail sooner. this aids with hanging tests + pip install pytest-timeout pip list - name: Pull checkpoints from S3 @@ -42,8 +43,7 @@ jobs: - name: Tests run: | - # NOTE: run coverage on tests does not propagate failure status for Win, https://github.com/nedbat/coveragepy/issues/1003 - coverage run --source pytorch_lightning -m pytest --random-order-seed=1 pytorch_lightning tests -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-torch${{ matrix.pytorch-version }}.xml + coverage run --source pytorch_lightning -m pytest --timeout 150 pytorch_lightning tests -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-torch${{ matrix.pytorch-version }}.xml shell: bash -l {0} - name: Upload pytest results diff --git a/requirements/adjust_versions.py b/requirements/adjust_versions.py index 8295a726e7873..2ec7a177e0824 100644 --- a/requirements/adjust_versions.py +++ b/requirements/adjust_versions.py @@ -83,8 +83,9 @@ def test(): else: requirements_path, torch_version = sys.argv[1], None - with open(requirements_path, "r+") as fp: + with open(requirements_path) as fp: requirements = fp.read() - requirements = main(requirements, torch_version) - print(requirements) # on purpose - to debug + requirements = main(requirements, torch_version) + print(requirements) # on purpose - to 
debug + with open(requirements_path, "w") as fp: fp.write(requirements) From 00d367545d797abdfd2c768a252120a9e2999e7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 26 Nov 2021 18:13:14 +0100 Subject: [PATCH 058/123] Rename special to standalone (#10779) --- .azure-pipelines/gpu-tests.yml | 4 +- .../test_accelerator_connector.py | 4 +- tests/accelerators/test_ddp.py | 2 +- tests/accelerators/test_multi_nodes_gpu.py | 4 +- tests/callbacks/test_pruning.py | 2 +- tests/callbacks/test_stochastic_weight_avg.py | 2 +- tests/callbacks/test_tqdm_progress_bar.py | 2 +- .../test_checkpoint_callback_frequency.py | 2 +- tests/conftest.py | 8 +-- tests/core/test_metric_result_integration.py | 2 +- tests/helpers/runif.py | 12 ++-- tests/lite/test_lite.py | 2 +- tests/lite/test_parity.py | 2 +- tests/models/test_hooks.py | 4 +- tests/models/test_sync_batchnorm.py | 2 +- .../environments/torch_elastic_deadlock.py | 2 +- tests/plugins/test_amp_plugins.py | 2 +- ..._ddp_fully_sharded_with_full_state_dict.py | 6 +- tests/plugins/test_ddp_plugin.py | 4 +- .../plugins/test_ddp_plugin_with_comm_hook.py | 10 ++-- tests/plugins/test_deepspeed_plugin.py | 58 +++++++++---------- tests/plugins/test_sharded_plugin.py | 6 +- tests/profiler/test_profiler.py | 6 +- .../{special_tests.sh => standalone_tests.sh} | 14 ++--- .../logging_/test_train_loop_logging.py | 2 +- .../optimization/test_manual_optimization.py | 4 +- tests/trainer/optimization/test_optimizers.py | 2 +- tests/trainer/test_trainer.py | 6 +- tests/utilities/test_all_gather_grad.py | 4 +- .../test_deepspeed_collate_checkpoint.py | 2 +- tests/utilities/test_meta.py | 2 +- tests/utilities/test_warnings.py | 4 +- 32 files changed, 94 insertions(+), 94 deletions(-) rename tests/{special_tests.sh => standalone_tests.sh} (82%) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index 71332a840fdb0..8752e8584439a 100644 --- a/.azure-pipelines/gpu-tests.yml +++ 
b/.azure-pipelines/gpu-tests.yml @@ -72,10 +72,10 @@ jobs: displayName: 'Testing: standard' - bash: | - bash tests/special_tests.sh + bash tests/standalone_tests.sh env: PL_USE_MOCKED_MNIST: "1" - displayName: 'Testing: special' + displayName: 'Testing: standalone' - bash: | python -m coverage report diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 0ac22f5f204ba..aa0c184a72980 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -323,8 +323,8 @@ def on_fit_start(self, trainer, pl_module): trainer.fit(model) -@RunIf(special=True) -def test_accelerator_choice_ddp_cpu_and_plugin(tmpdir): +@RunIf(skip_windows=True, standalone=True) +def test_accelerator_choice_ddp_cpu_and_strategy(tmpdir): """Test that accelerator="ddp_cpu" can work together with an instance of DDPPlugin.""" _test_accelerator_choice_ddp_cpu_and_plugin(tmpdir, ddp_plugin_class=DDPPlugin) diff --git a/tests/accelerators/test_ddp.py b/tests/accelerators/test_ddp.py index 1982e967c21ea..db2f388971c12 100644 --- a/tests/accelerators/test_ddp.py +++ b/tests/accelerators/test_ddp.py @@ -108,7 +108,7 @@ def setup(self, stage: Optional[str] = None) -> None: trainer.fit(model) -@RunIf(min_gpus=2, min_torch="1.8.1", special=True) +@RunIf(min_gpus=2, min_torch="1.8.1", standalone=True) @pytest.mark.parametrize("precision", (16, 32)) def test_ddp_wrapper(tmpdir, precision): """Test parameters to ignore are carried over for DDP.""" diff --git a/tests/accelerators/test_multi_nodes_gpu.py b/tests/accelerators/test_multi_nodes_gpu.py index 0df49a41b0fd0..09f632746b1dd 100644 --- a/tests/accelerators/test_multi_nodes_gpu.py +++ b/tests/accelerators/test_multi_nodes_gpu.py @@ -31,7 +31,7 @@ # TODO(Borda): When multi-node tests are re-enabled (.github/workflows/ci_test-mnodes.yml) # use an environment variable `PL_RUNNING_MULTINODE_TESTS` and set `RunIf(multinode=True)` 
@pytest.mark.skip("Multi-node testing is currently disabled") -@RunIf(special=True) +@RunIf(standalone=True) def test_logging_sync_dist_true_ddp(tmpdir): """Tests to ensure that the sync_dist flag works with CPU (should just return the original value)""" fake_result = 1 @@ -68,7 +68,7 @@ def validation_step(self, batch, batch_idx): # TODO(Borda): When multi-node tests are re-enabled (.github/workflows/ci_test-mnodes.yml) # use an environment variable `PL_RUNNING_MULTINODE_TESTS` and set `RunIf(multinode=True)` @pytest.mark.skip("Multi-node testing is currently disabled") -@RunIf(special=True) +@RunIf(standalone=True) def test__validation_step__log(tmpdir): """Tests that validation_step can log.""" diff --git a/tests/callbacks/test_pruning.py b/tests/callbacks/test_pruning.py index ec4dcddf777c0..f63892df94310 100644 --- a/tests/callbacks/test_pruning.py +++ b/tests/callbacks/test_pruning.py @@ -160,7 +160,7 @@ def test_pruning_callback( ) -@RunIf(special=True, min_gpus=2) +@RunIf(standalone=True, min_gpus=2) @pytest.mark.parametrize("parameters_to_prune", (False, True)) @pytest.mark.parametrize("use_global_unstructured", (False, True)) def test_pruning_callback_ddp(tmpdir, parameters_to_prune, use_global_unstructured): diff --git a/tests/callbacks/test_stochastic_weight_avg.py b/tests/callbacks/test_stochastic_weight_avg.py index 4a0f154928adb..910d6443d4def 100644 --- a/tests/callbacks/test_stochastic_weight_avg.py +++ b/tests/callbacks/test_stochastic_weight_avg.py @@ -138,7 +138,7 @@ def train_with_swa( assert trainer.lightning_module == model -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) def test_swa_callback_ddp(tmpdir): train_with_swa(tmpdir, strategy="ddp", gpus=2) diff --git a/tests/callbacks/test_tqdm_progress_bar.py b/tests/callbacks/test_tqdm_progress_bar.py index d25c263443168..9b80fabf800a7 100644 --- a/tests/callbacks/test_tqdm_progress_bar.py +++ b/tests/callbacks/test_tqdm_progress_bar.py @@ -521,7 +521,7 @@ def 
test_tqdm_progress_bar_can_be_pickled(): pickle.dumps(bar) -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) @pytest.mark.parametrize( ["total_train_samples", "train_batch_size", "total_val_samples", "val_batch_size", "val_check_interval"], [(8, 4, 2, 1, 0.2), (8, 4, 2, 1, 0.5)], diff --git a/tests/checkpointing/test_checkpoint_callback_frequency.py b/tests/checkpointing/test_checkpoint_callback_frequency.py index fd5c76b2faef7..2c14c7de29b9c 100644 --- a/tests/checkpointing/test_checkpoint_callback_frequency.py +++ b/tests/checkpointing/test_checkpoint_callback_frequency.py @@ -87,7 +87,7 @@ def training_step(self, batch, batch_idx): @mock.patch("torch.save") -@RunIf(special=True, min_gpus=2) +@RunIf(standalone=True, min_gpus=2) @pytest.mark.parametrize(["k", "epochs", "val_check_interval", "expected"], [(1, 1, 1.0, 1), (2, 2, 0.3, 4)]) def test_top_k_ddp(save_mock, tmpdir, k, epochs, val_check_interval, expected): class TestModel(BoringModel): diff --git a/tests/conftest.py b/tests/conftest.py index 2bb5715ce9ee9..3c1efcde4deb2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -159,13 +159,13 @@ def single_process_pg(): def pytest_collection_modifyitems(items): - if os.getenv("PL_RUNNING_SPECIAL_TESTS", "0") != "1": + if os.getenv("PL_RUN_STANDALONE_TESTS", "0") != "1": return - # filter out non-special tests + # filter out non-standalone tests items[:] = [ item for item in items for marker in item.own_markers - # has `@RunIf(special=True)` - if marker.name == "skipif" and marker.kwargs.get("special") + # has `@RunIf(standalone=True)` + if marker.name == "skipif" and marker.kwargs.get("standalone") ] diff --git a/tests/core/test_metric_result_integration.py b/tests/core/test_metric_result_integration.py index 6b2d965c5add5..0138e0d26359d 100644 --- a/tests/core/test_metric_result_integration.py +++ b/tests/core/test_metric_result_integration.py @@ -482,7 +482,7 @@ def test_result_collection_reload_1_gpu_ddp(tmpdir): 
result_collection_reload(default_root_dir=tmpdir, strategy="ddp", gpus=1) -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) @pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_7, reason="Requires at least PyTorch 1.7") def test_result_collection_reload_2_gpus(tmpdir): diff --git a/tests/helpers/runif.py b/tests/helpers/runif.py index 5cdf422cf4fdb..4ad6942aa160a 100644 --- a/tests/helpers/runif.py +++ b/tests/helpers/runif.py @@ -65,7 +65,7 @@ def __new__( horovod: bool = False, horovod_nccl: bool = False, skip_windows: bool = False, - special: bool = False, + standalone: bool = False, fairscale: bool = False, fairscale_fully_sharded: bool = False, deepspeed: bool = False, @@ -87,7 +87,7 @@ def __new__( horovod: if Horovod is installed horovod_nccl: if Horovod is installed with NCCL support skip_windows: skip test for Windows platform (typically for some limited torch functionality) - special: running in special mode, outside pytest suit + standalone: Mark the test as standalone, our CI will run it in a separate process. 
fairscale: if `fairscale` module is required to run the test fairscale_fully_sharded: if `fairscale` fully sharded module is required to run the test deepspeed: if `deepspeed` module is required to run the test @@ -146,12 +146,12 @@ def __new__( conditions.append(not _HOROVOD_NCCL_AVAILABLE) reasons.append("Horovod with NCCL") - if special: - env_flag = os.getenv("PL_RUNNING_SPECIAL_TESTS", "0") + if standalone: + env_flag = os.getenv("PL_RUN_STANDALONE_TESTS", "0") conditions.append(env_flag != "1") - reasons.append("Special execution") + reasons.append("Standalone execution") # used in tests/conftest.py::pytest_collection_modifyitems - kwargs["special"] = True + kwargs["standalone"] = True if fairscale: conditions.append(not _FAIRSCALE_AVAILABLE) diff --git a/tests/lite/test_lite.py b/tests/lite/test_lite.py index bd69cf359473e..97046c71bedbd 100644 --- a/tests/lite/test_lite.py +++ b/tests/lite/test_lite.py @@ -380,7 +380,7 @@ def test_autocast(): lite._precision_plugin.forward_context().__exit__.assert_called() -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_multiple_models(): class Lite(LightningLite): def run(self): diff --git a/tests/lite/test_parity.py b/tests/lite/test_parity.py index bec9339ec8e2f..d4d0ca6e5e9c7 100644 --- a/tests/lite/test_parity.py +++ b/tests/lite/test_parity.py @@ -190,7 +190,7 @@ def test_boring_lite_model_ddp_spawn(precision, strategy, devices, accelerator, assert torch.equal(w_pure.cpu(), w_lite.cpu()) -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) @pytest.mark.parametrize( "precision, strategy, devices, accelerator", [ diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index c710aaf952458..b43dfe386b1ec 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -167,7 +167,7 @@ def transfer_batch_to_device(self, batch, device, dataloader_idx): assert torch.allclose(batch_gpu.targets.cpu(), 
torch.ones(5, 1, dtype=torch.long) * 2) -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) def test_transfer_batch_hook_ddp(tmpdir): """Test custom data are properly moved to the right device using ddp.""" @@ -422,7 +422,7 @@ def _predict_batch(trainer, model, batches): return out -@RunIf(deepspeed=True, min_gpus=1, special=True) +@RunIf(deepspeed=True, min_gpus=1, standalone=True) @pytest.mark.parametrize("automatic_optimization", (True, False)) def test_trainer_model_hook_system_fit_deepspeed(tmpdir, automatic_optimization): _run_trainer_model_hook_system_fit( diff --git a/tests/models/test_sync_batchnorm.py b/tests/models/test_sync_batchnorm.py index 67880bec4e474..86c4a5af68b91 100644 --- a/tests/models/test_sync_batchnorm.py +++ b/tests/models/test_sync_batchnorm.py @@ -67,7 +67,7 @@ def configure_optimizers(self): # TODO: Fatal Python error: Bus error @pytest.mark.skip(reason="Fatal Python error: Bus error") -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) def test_sync_batchnorm_ddp(tmpdir): seed_everything(234) set_random_main_port() diff --git a/tests/plugins/environments/torch_elastic_deadlock.py b/tests/plugins/environments/torch_elastic_deadlock.py index ead433200c304..f8a64ba632991 100644 --- a/tests/plugins/environments/torch_elastic_deadlock.py +++ b/tests/plugins/environments/torch_elastic_deadlock.py @@ -7,7 +7,7 @@ from pytorch_lightning.utilities.exceptions import DeadlockDetectedException from tests.helpers.boring_model import BoringModel -if os.getenv("PL_RUNNING_SPECIAL_TESTS", "0") == "1" and os.getenv("PL_RECONCILE_PROCESS", "0") == "1": +if os.getenv("PL_RUN_STANDALONE_TESTS", "0") == "1" and os.getenv("PL_RECONCILE_PROCESS", "0") == "1": class CustomException(Exception): pass diff --git a/tests/plugins/test_amp_plugins.py b/tests/plugins/test_amp_plugins.py index c482e8a83d7b6..3c10bf8495aeb 100644 --- a/tests/plugins/test_amp_plugins.py +++ b/tests/plugins/test_amp_plugins.py @@ -190,7 +190,7 @@ 
def configure_optimizers(self): trainer.fit(model) -@RunIf(min_gpus=2, amp_apex=True, special=True) +@RunIf(min_gpus=2, amp_apex=True, standalone=True) @pytest.mark.parametrize("amp_level", ["O2"]) def test_amp_apex_ddp_fit(amp_level, tmpdir): class CustomBoringModel(BoringModel): diff --git a/tests/plugins/test_ddp_fully_sharded_with_full_state_dict.py b/tests/plugins/test_ddp_fully_sharded_with_full_state_dict.py index 1468c7f4a4137..4b68667bbed6a 100644 --- a/tests/plugins/test_ddp_fully_sharded_with_full_state_dict.py +++ b/tests/plugins/test_ddp_fully_sharded_with_full_state_dict.py @@ -89,7 +89,7 @@ def _assert_layer_fsdp_instance(self) -> None: assert self.layer.module[2].reshard_after_forward is True -@RunIf(min_gpus=1, skip_windows=True, fairscale_fully_sharded=True, special=True) +@RunIf(min_gpus=1, skip_windows=True, fairscale_fully_sharded=True, standalone=True) def test_fully_sharded_plugin_checkpoint(tmpdir): """Test to ensure that checkpoint is saved correctly when using a single GPU, and all stages can be run.""" @@ -98,7 +98,7 @@ def test_fully_sharded_plugin_checkpoint(tmpdir): _run_multiple_stages(trainer, model, os.path.join(tmpdir, "last.ckpt")) -@RunIf(min_gpus=2, skip_windows=True, fairscale_fully_sharded=True, special=True) +@RunIf(min_gpus=2, skip_windows=True, fairscale_fully_sharded=True, standalone=True) def test_fully_sharded_plugin_checkpoint_multi_gpus(tmpdir): """Test to ensure that checkpoint is saved correctly when using multiple GPUs, and all stages can be run.""" @@ -136,7 +136,7 @@ def _run_multiple_stages(trainer, model, model_path: Optional[str] = None): trainer.test(ckpt_path=model_path) -@RunIf(min_gpus=1, skip_windows=True, fairscale_fully_sharded=True, special=True) +@RunIf(min_gpus=1, skip_windows=True, fairscale_fully_sharded=True, standalone=True) def test_fsdp_gradient_clipping_raises(tmpdir): """Test to ensure that an exception is raised when clipping gradients by value with FSDP.""" model = BoringModel() diff --git 
a/tests/plugins/test_ddp_plugin.py b/tests/plugins/test_ddp_plugin.py index 78ae931330307..1aaf89d052686 100644 --- a/tests/plugins/test_ddp_plugin.py +++ b/tests/plugins/test_ddp_plugin.py @@ -33,7 +33,7 @@ def on_train_start(self) -> None: self.start_cuda_memory = torch.cuda.memory_allocated() -@RunIf(skip_windows=True, min_gpus=2, special=True) +@RunIf(skip_windows=True, min_gpus=2, standalone=True) def test_ddp_with_2_gpus(): """Tests if device is set correctely when training and after teardown for DDPPlugin.""" trainer = Trainer(gpus=2, strategy="ddp", fast_dev_run=True) @@ -64,7 +64,7 @@ def on_train_start(self): self.trainer.training_type_plugin.barrier("barrier after model is wrapped") -@RunIf(min_gpus=4, special=True) +@RunIf(min_gpus=4, standalone=True) @mock.patch("torch.distributed.barrier") def test_ddp_barrier_non_consecutive_device_ids(barrier_mock, tmpdir): """Test correct usage of barriers when device ids do not start at 0 or are not consecutive.""" diff --git a/tests/plugins/test_ddp_plugin_with_comm_hook.py b/tests/plugins/test_ddp_plugin_with_comm_hook.py index 6497b39ffa516..49a637098f279 100644 --- a/tests/plugins/test_ddp_plugin_with_comm_hook.py +++ b/tests/plugins/test_ddp_plugin_with_comm_hook.py @@ -26,7 +26,7 @@ import torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook as post_localSGD -@RunIf(skip_windows=True, min_torch="1.9.0", min_gpus=2, special=True) +@RunIf(skip_windows=True, min_torch="1.9.0", min_gpus=2, standalone=True) def test_ddp_fp16_compress_comm_hook(tmpdir): """Test for DDP FP16 compress hook.""" model = BoringModel() @@ -46,7 +46,7 @@ def test_ddp_fp16_compress_comm_hook(tmpdir): assert trainer.state.finished, f"Training failed with {trainer.state}" -@RunIf(skip_windows=True, min_torch="1.9.0", min_gpus=2, special=True) +@RunIf(skip_windows=True, min_torch="1.9.0", min_gpus=2, standalone=True) def test_ddp_sgd_comm_hook(tmpdir): """Test for DDP FP16 compress hook.""" model = BoringModel() @@ -70,7 +70,7 @@ 
def test_ddp_sgd_comm_hook(tmpdir): assert trainer.state.finished, f"Training failed with {trainer.state}" -@RunIf(skip_windows=True, min_torch="1.9.0", min_gpus=2, special=True) +@RunIf(skip_windows=True, min_torch="1.9.0", min_gpus=2, standalone=True) def test_ddp_fp16_compress_wrap_sgd_comm_hook(tmpdir): """Test for DDP FP16 compress wrapper for SGD hook.""" model = BoringModel() @@ -95,7 +95,7 @@ def test_ddp_fp16_compress_wrap_sgd_comm_hook(tmpdir): assert trainer.state.finished, f"Training failed with {trainer.state}" -@RunIf(skip_windows=True, min_torch="1.9.0", min_gpus=2, special=True) +@RunIf(skip_windows=True, min_torch="1.9.0", min_gpus=2, standalone=True) def test_ddp_spawn_fp16_compress_comm_hook(tmpdir): """Test for DDP Spawn FP16 compress hook.""" model = BoringModel() @@ -112,7 +112,7 @@ def test_ddp_spawn_fp16_compress_comm_hook(tmpdir): assert trainer.state.finished, f"Training failed with {trainer.state}" -@RunIf(skip_windows=True, min_torch="1.10.0", min_gpus=2, special=True) +@RunIf(skip_windows=True, min_torch="1.10.0", min_gpus=2, standalone=True) def test_ddp_post_local_sgd_comm_hook(tmpdir): """Test for DDP post-localSGD hook.""" model = BoringModel() diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index c5b71e908795b..cfb12369da1c4 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -203,7 +203,7 @@ def test_deepspeed_defaults(tmpdir): assert isinstance(plugin.config["zero_optimization"], dict) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_warn_deepspeed_ignored(tmpdir): class TestModel(BoringModel): def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args, **kwargs) -> None: @@ -259,7 +259,7 @@ def setup(self, trainer, pl_module, stage: Optional[str] = None) -> None: trainer.fit(model) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, 
deepspeed=True, standalone=True) def test_deepspeed_run_configure_optimizers(tmpdir): """Test end to end that deepspeed works with defaults (without ZeRO as that requires compilation), whilst using configure_optimizers for optimizers and schedulers.""" @@ -296,7 +296,7 @@ def configure_optimizers(self): _assert_save_model_is_equal(model, tmpdir, trainer) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_config(tmpdir, deepspeed_zero_config): """Test to ensure deepspeed works correctly when passed a DeepSpeed config object including optimizers/schedulers and saves the model weights to load correctly.""" @@ -324,7 +324,7 @@ def on_train_start(self, trainer, pl_module) -> None: trainer.test(model) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_custom_precision_params(tmpdir): """Ensure if we modify the FP16 parameters via the DeepSpeedPlugin, the deepspeed config contains these changes.""" @@ -386,7 +386,7 @@ def on_before_accelerator_backend_setup(self, trainer, pl_module) -> None: trainer.fit(model) -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_multigpu(tmpdir): """Test to ensure that DeepSpeed with multiple GPUs works and deepspeed distributed is initialized correctly.""" @@ -402,14 +402,14 @@ def test_deepspeed_multigpu(tmpdir): _assert_save_model_is_equal(model, tmpdir, trainer) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_fp32_works(tmpdir): model = BoringModel() trainer = Trainer(default_root_dir=tmpdir, gpus=1, strategy="deepspeed_stage_3", fast_dev_run=True) trainer.fit(model) -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_stage_3_save_warning(tmpdir): """Test to ensure that DeepSpeed Stage 
3 gives a warning when saving on rank zero.""" model = BoringModel() @@ -429,7 +429,7 @@ def test_deepspeed_stage_3_save_warning(tmpdir): trainer.save_checkpoint(checkpoint_path) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_multigpu_single_file(tmpdir): """Test to ensure that DeepSpeed loads from a single file checkpoint.""" model = BoringModel() @@ -538,7 +538,7 @@ def training_step(self, batch, batch_idx): opt.step() -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_multigpu_stage_3(tmpdir, deepspeed_config): """Test to ensure ZeRO Stage 3 works with a parallel model.""" model = ModelParallelBoringModel() @@ -551,7 +551,7 @@ def test_deepspeed_multigpu_stage_3(tmpdir, deepspeed_config): _assert_save_model_is_equal(model, tmpdir, trainer) -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_multigpu_stage_3_manual_optimization(tmpdir, deepspeed_config): """Test to ensure ZeRO Stage 3 works with a parallel model.""" model = ModelParallelBoringModelManualOptim() @@ -600,14 +600,14 @@ def run_checkpoint_test(tmpdir: str, automatic_optimization: bool = True, accumu assert results[0]["test_acc"] > 0.7 -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir): """Test to ensure with Stage 3 and multiple GPUs that we can save/load a model resuming from a checkpoint, and see convergence.""" run_checkpoint_test(tmpdir) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_multigpu_stage_3_warns_resume_training(tmpdir): """Test to ensure with Stage 3 and multiple GPUs that we can resume from training, throwing a warning that the optimizer state and scheduler states cannot be restored.""" @@ 
-634,7 +634,7 @@ def test_deepspeed_multigpu_stage_3_warns_resume_training(tmpdir): trainer.fit(model, datamodule=dm, ckpt_path=checkpoint_path) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_multigpu_stage_3_resume_training(tmpdir): """Test to ensure with Stage 3 and multiple GPUs that we can resume training.""" initial_model = ModelParallelClassificationModel() @@ -688,19 +688,19 @@ def on_train_batch_start( trainer.fit(model, datamodule=dm, ckpt_path=ck.best_model_path) -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_multigpu_stage_3_checkpointing_full_weights_manual(tmpdir): """Test to ensure with Stage 3 and multiple GPUs that we can save/load a model resuming from a checkpoint, where we save the full weights to one file.""" run_checkpoint_test(tmpdir, automatic_optimization=False, accumulate_grad_batches=1) -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir): _deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer=False) -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_multigpu_stage_2_accumulated_grad_batches_offload_optimizer(tmpdir): _deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer=True) @@ -741,7 +741,7 @@ def on_train_batch_start(self, trainer, pl_module: LightningModule, batch: Any, assert verification_callback.on_train_batch_start_called -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_multigpu_test(tmpdir): """Test to ensure we can use DeepSpeed with just test using ZeRO Stage 3.""" model = ModelParallelBoringModel() @@ -751,7 +751,7 @@ def test_deepspeed_multigpu_test(tmpdir): 
trainer.test(model) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_multigpu_partial_partition_parameters(tmpdir): """Test to ensure that a module that defines a layer inside the ``__init__`` and ``configure_sharded_model`` correctly converts all parameters to float16 when ``precision=16`` and runs successfully.""" @@ -778,7 +778,7 @@ def on_train_epoch_start(self) -> None: trainer.fit(model) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_multigpu_test_rnn(tmpdir): """Test to ensure that turning off explicit partitioning of the entire module for ZeRO Stage 3 works when training with certain layers which will crash with explicit partitioning.""" @@ -849,7 +849,7 @@ def _assert_save_model_is_equal(model, tmpdir, trainer): assert torch.equal(orig_param, saved_model_param) -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_multigpu_no_schedulers(tmpdir): """Test to ensure ZeRO Stage 3 works with a parallel model and no schedulers.""" model = ModelParallelBoringModelNoSchedulers() @@ -861,7 +861,7 @@ def test_deepspeed_multigpu_no_schedulers(tmpdir): _assert_save_model_is_equal(model, tmpdir, trainer) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_skip_backward_raises(tmpdir): class TestModel(BoringModel): def training_step(self, batch, batch_idx): @@ -873,7 +873,7 @@ def training_step(self, batch, batch_idx): trainer.fit(model) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_setup_train_dataloader(tmpdir): """Test DeepSpeed works when setup is required to call in the DataModule.""" @@ -911,7 +911,7 @@ def test_dataloader(self): @mock.patch("torch.optim.lr_scheduler.StepLR.step", autospec=True) 
-@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_scheduler_step_count(mock_step): """Test to ensure that the scheduler is called the correct amount of times during training when scheduler is set to step.""" @@ -919,7 +919,7 @@ def test_deepspeed_scheduler_step_count(mock_step): @mock.patch("torch.optim.lr_scheduler.StepLR.step", autospec=True) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_scheduler_step_count_epoch(mock_step): """Test to ensure that the scheduler is called the correct amount of times during training when scheduler is set to epoch.""" @@ -954,7 +954,7 @@ def configure_optimizers(self): assert mock_step.call_count == 1 + (max_epoch * limit_train_batches) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_configure_gradient_clipping(tmpdir): """Test to ensure that a warning is raised when `LightningModule.configure_gradient_clipping` is overridden in case of deepspeed.""" @@ -975,7 +975,7 @@ def configure_gradient_clipping(self, optimizer, optimizer_idx, gradient_clip_va trainer.fit(model) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_gradient_clip_by_value(tmpdir): """Test to ensure that an exception is raised when using `gradient_clip_algorithm='value'`.""" model = BoringModel() @@ -989,7 +989,7 @@ def test_deepspeed_gradient_clip_by_value(tmpdir): trainer.fit(model) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_different_accumulate_grad_batches_fails(tmpdir): model = BoringModel() trainer = Trainer(default_root_dir=tmpdir, accumulate_grad_batches={1: 2}, gpus=1, strategy="deepspeed") @@ -999,7 +999,7 @@ def test_different_accumulate_grad_batches_fails(tmpdir): trainer.fit(model) 
-@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_specific_gpu_device_id(tmpdir): class TestCallback(Callback): def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: @@ -1036,7 +1036,7 @@ def on_test_batch_start( trainer.test(model) -@RunIf(min_gpus=2, deepspeed=True, special=True, min_torch="1.10.0") +@RunIf(min_gpus=2, deepspeed=True, standalone=True, min_torch="1.10.0") def test_deepspeed_with_meta_device(tmpdir): with init_meta_context(): model = BoringModel() diff --git a/tests/plugins/test_sharded_plugin.py b/tests/plugins/test_sharded_plugin.py index e80b5d9f7621e..8a55633fb143e 100644 --- a/tests/plugins/test_sharded_plugin.py +++ b/tests/plugins/test_sharded_plugin.py @@ -175,7 +175,7 @@ def test_ddp_sharded_plugin_fit_ckpt_path_gpu_to_cpu(tmpdir): trainer.fit(model, ckpt_path=checkpoint_path) -@RunIf(skip_windows=True, special=True, fairscale=True) +@RunIf(skip_windows=True, standalone=True, fairscale=True) @pytest.mark.parametrize("trainer_kwargs", (dict(num_processes=2), pytest.param(dict(gpus=2), marks=RunIf(min_gpus=2)))) def test_ddp_sharded_plugin_test_multigpu(tmpdir, trainer_kwargs): """Test to ensure we can use validate and test without fit.""" @@ -201,7 +201,7 @@ def training_step(self, batch, batch_idx): return {"loss": loss} -@RunIf(skip_windows=True, special=True, fairscale=True, min_gpus=2) +@RunIf(skip_windows=True, standalone=True, fairscale=True, min_gpus=2) def test_ddp_sharded_plugin_manual_optimization_spawn(tmpdir): # todo (sean): this test has been split out as running both tests using parametrize causes "Address in use" model = ManualBoringModel() @@ -209,7 +209,7 @@ def test_ddp_sharded_plugin_manual_optimization_spawn(tmpdir): trainer.fit(model) -@RunIf(skip_windows=True, special=True, fairscale=True, min_gpus=2) +@RunIf(skip_windows=True, standalone=True, fairscale=True, min_gpus=2) def test_ddp_sharded_plugin_manual_optimization(tmpdir): 
model = ManualBoringModel() trainer = Trainer(default_root_dir=tmpdir, strategy="ddp_sharded", fast_dev_run=2, gpus=2) diff --git a/tests/profiler/test_profiler.py b/tests/profiler/test_profiler.py index f9a4727334a4f..708062ab64490 100644 --- a/tests/profiler/test_profiler.py +++ b/tests/profiler/test_profiler.py @@ -293,7 +293,7 @@ def test_advanced_profiler_cprofile_deepcopy(tmpdir): trainer.fit(model) -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) def test_pytorch_profiler_trainer_ddp(tmpdir, pytorch_profiler): """Ensure that the profiler can be given to the training and default step are properly recorded.""" model = BoringModel() @@ -331,7 +331,7 @@ def test_pytorch_profiler_trainer_ddp(tmpdir, pytorch_profiler): assert any(f"{local_rank}-validation_step" in f for f in files) -@RunIf(special=True) +@RunIf(standalone=True) @pytest.mark.parametrize("fast_dev_run", [1, 2, 3, 4, 5]) @pytest.mark.parametrize("boring_model_cls", [ManualOptimBoringModel, BoringModel]) def test_pytorch_profiler_trainer_fit(fast_dev_run, boring_model_cls, tmpdir): @@ -427,7 +427,7 @@ def look_for_trace(trace_dir): assert look_for_trace(tmpdir) -@RunIf(min_gpus=1, special=True) +@RunIf(min_gpus=1, standalone=True) def test_pytorch_profiler_nested_emit_nvtx(tmpdir): """This test check emit_nvtx is correctly supported.""" profiler = PyTorchProfiler(use_cuda=True, emit_nvtx=True) diff --git a/tests/special_tests.sh b/tests/standalone_tests.sh similarity index 82% rename from tests/special_tests.sh rename to tests/standalone_tests.sh index 27abaa6cc62e3..49c608d53cfa1 100755 --- a/tests/special_tests.sh +++ b/tests/standalone_tests.sh @@ -15,12 +15,12 @@ set -e # this environment variable allows special tests to run -export PL_RUNNING_SPECIAL_TESTS=1 +export PL_RUN_STANDALONE_TESTS=1 # python arguments defaults='-m coverage run --source pytorch_lightning --append -m pytest --capture=no' -# find tests marked as `@RunIf(special=True)`. 
done manually instead of with pytest because it is faster -grep_output=$(grep --recursive --word-regexp 'tests' --regexp 'special=True' --include '*.py' --exclude 'tests/conftest.py') +# find tests marked as `@RunIf(standalone=True)`. done manually instead of with pytest because it is faster +grep_output=$(grep --recursive --word-regexp 'tests' --regexp 'standalone=True' --include '*.py' --exclude 'tests/conftest.py') # file paths, remove duplicates files=$(echo "$grep_output" | cut -f1 -d: | sort | uniq) @@ -47,10 +47,10 @@ for i in "${!parametrizations_arr[@]}"; do continue fi - # SPECIAL_PATTERN allows filtering the tests to run when debugging. - # use as `SPECIAL_PATTERN="foo_bar" ./special_tests.sh` to run only those + # STANDALONE_PATTERN allows filtering the tests to run when debugging. + # use as `STANDALONE_PATTERN="foo_bar" ./standalone_tests.sh` to run only those # test with `foo_bar` in their name - if [[ $parametrization != *$SPECIAL_PATTERN* ]]; then + if [[ $parametrization != *STANDALONE_PATTERN* ]]; then report+="Skipped\t$parametrization\n" continue fi @@ -74,7 +74,7 @@ fi # TODO: enable when CI uses torch>=1.9 # test deadlock is properly handled with TorchElastic. 
-# LOGS=$(PL_RUNNING_SPECIAL_TESTS=1 PL_RECONCILE_PROCESS=1 python -m torch.distributed.run --nproc_per_node=2 --max_restarts 0 -m coverage run --source pytorch_lightning -a tests/plugins/environments/torch_elastic_deadlock.py | grep "SUCCEEDED") +# LOGS=$(PL_RUN_STANDALONE_TESTS=1 PL_RECONCILE_PROCESS=1 python -m torch.distributed.run --nproc_per_node=2 --max_restarts 0 -m coverage run --source pytorch_lightning -a tests/plugins/environments/torch_elastic_deadlock.py | grep "SUCCEEDED") # if [ -z "$LOGS" ]; then # exit 1 # fi diff --git a/tests/trainer/logging_/test_train_loop_logging.py b/tests/trainer/logging_/test_train_loop_logging.py index 2950404e03f75..139714acc97bc 100644 --- a/tests/trainer/logging_/test_train_loop_logging.py +++ b/tests/trainer/logging_/test_train_loop_logging.py @@ -434,7 +434,7 @@ def test_logging_sync_dist_true(tmpdir, devices): assert metrics["bar_3"] == 2 + int(use_multiple_devices) -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) def test_logging_sync_dist_true_ddp(tmpdir): """Tests to ensure that the sync_dist flag works with ddp.""" diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index 5c86fd6343002..82acfb8b08f8c 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -840,7 +840,7 @@ def train_manual_optimization(tmpdir, strategy, model_cls=TesManualOptimizationD assert not torch.equal(param.cpu().data, param_copy.data) -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) def test_step_with_optimizer_closure_with_different_frequencies_ddp(tmpdir): """Tests that `step` works with optimizer_closure and different accumulated_gradient frequency.""" @@ -910,7 +910,7 @@ def dis_closure(): opt_dis.zero_grad() -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) def 
test_step_with_optimizer_closure_with_different_frequencies_ddp_with_toggle_model(tmpdir): train_manual_optimization(tmpdir, "ddp", model_cls=TestManualOptimizationDDPModelToggleModel) diff --git a/tests/trainer/optimization/test_optimizers.py b/tests/trainer/optimization/test_optimizers.py index b2d88becb1ec7..4a99b3318f06f 100644 --- a/tests/trainer/optimization/test_optimizers.py +++ b/tests/trainer/optimization/test_optimizers.py @@ -537,7 +537,7 @@ def configure_optimizers(self): trainer.fit(model) -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) def test_optimizer_state_on_device(tmpdir): """Test that optimizers that create state initially at instantiation still end up with the state on the GPU.""" diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 7c2c6e9b55e9e..ee71616524944 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1452,7 +1452,7 @@ def test_trainer_predict_cpu(tmpdir, datamodule, enable_progress_bar): predict(tmpdir, datamodule=datamodule, enable_progress_bar=enable_progress_bar) -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) @pytest.mark.parametrize( "kwargs", [ @@ -1461,7 +1461,7 @@ def test_trainer_predict_cpu(tmpdir, datamodule, enable_progress_bar): {"strategy": "ddp", "devices": 2}, ], ) -def test_trainer_predict_special(tmpdir, kwargs): +def test_trainer_predict_standalone(tmpdir, kwargs): predict(tmpdir, accelerator="gpu", **kwargs) @@ -1889,7 +1889,7 @@ class CustomException(Exception): pass -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) def test_ddp_terminate_when_deadlock_is_detected(tmpdir): """Test that DDP kills the remaining processes when only one rank is throwing an exception.""" diff --git a/tests/utilities/test_all_gather_grad.py b/tests/utilities/test_all_gather_grad.py index 2ed42b0b0f21a..63c5c2cfe90fe 100644 --- a/tests/utilities/test_all_gather_grad.py +++ 
b/tests/utilities/test_all_gather_grad.py @@ -47,7 +47,7 @@ def test_all_gather_ddp_spawn(): torch.multiprocessing.spawn(_test_all_gather_ddp, args=(world_size,), nprocs=world_size) -@RunIf(min_gpus=2, skip_windows=True, special=True) +@RunIf(min_gpus=2, skip_windows=True, standalone=True) def test_all_gather_collection(tmpdir): class TestModel(BoringModel): @@ -98,7 +98,7 @@ def training_epoch_end(self, outputs) -> None: assert model.training_epoch_end_called -@RunIf(min_gpus=2, skip_windows=True, special=True) +@RunIf(min_gpus=2, skip_windows=True, standalone=True) def test_all_gather_sync_grads(tmpdir): class TestModel(BoringModel): diff --git a/tests/utilities/test_deepspeed_collate_checkpoint.py b/tests/utilities/test_deepspeed_collate_checkpoint.py index e85557b4e6056..0f36ada39227d 100644 --- a/tests/utilities/test_deepspeed_collate_checkpoint.py +++ b/tests/utilities/test_deepspeed_collate_checkpoint.py @@ -22,7 +22,7 @@ from tests.helpers.runif import RunIf -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_collate_checkpoint(tmpdir): """Test to ensure that with DeepSpeed Stage 3 we can collate the sharded checkpoints into a single file.""" model = BoringModel() diff --git a/tests/utilities/test_meta.py b/tests/utilities/test_meta.py index 581b949d9167f..1f386ac1ce0fe 100644 --- a/tests/utilities/test_meta.py +++ b/tests/utilities/test_meta.py @@ -31,7 +31,7 @@ def __init__(self, num_layers: int): self.layer = nn.Sequential(*[nn.Linear(1, 1) for _ in range(self.hparams.num_layers)]) -@RunIf(special=True, min_torch="1.10.0") +@RunIf(standalone=True, min_torch="1.10.0") def test_init_meta_context(): with init_meta_context(): diff --git a/tests/utilities/test_warnings.py b/tests/utilities/test_warnings.py index d1222672b7595..6189562d9e190 100644 --- a/tests/utilities/test_warnings.py +++ b/tests/utilities/test_warnings.py @@ -21,8 +21,8 @@ from pytorch_lightning.utilities.warnings import 
_warn, rank_zero_deprecation, rank_zero_warn, WarningCache -running_special = os.getenv("PL_RUNNING_SPECIAL_TESTS", "0") == "1" -if running_special: +standalone = os.getenv("PL_RUN_STANDALONE_TESTS", "0") == "1" +if standalone: stderr = StringIO() # recording From 551f05e7658815ac19a56a0201baf0fe1165618d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 25 Nov 2021 15:24:29 +0100 Subject: [PATCH 059/123] Merge pull request #10738 from PyTorchLightning/1.5.x-drop-torch-1.6 Sync CI configuration into 1.5.x branch --- requirements/adjust_versions.py | 7 +++---- requirements/extra.txt | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/requirements/adjust_versions.py b/requirements/adjust_versions.py index 2ec7a177e0824..8295a726e7873 100644 --- a/requirements/adjust_versions.py +++ b/requirements/adjust_versions.py @@ -83,9 +83,8 @@ def test(): else: requirements_path, torch_version = sys.argv[1], None - with open(requirements_path) as fp: + with open(requirements_path, "r+") as fp: requirements = fp.read() - requirements = main(requirements, torch_version) - print(requirements) # on purpose - to debug - with open(requirements_path, "w") as fp: + requirements = main(requirements, torch_version) + print(requirements) # on purpose - to debug fp.write(requirements) diff --git a/requirements/extra.txt b/requirements/extra.txt index babaffca6280d..6abf3089b8506 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -5,6 +5,6 @@ horovod>=0.21.2 # no need to install with [pytorch] as pytorch is already insta torchtext>=0.8.* omegaconf>=2.0.5 hydra-core>=1.0.5 -jsonargparse[signatures]>=4.0.4 +jsonargparse[signatures]>=4.0.0 gcsfs>=2021.5.0 rich>=10.2.2 From 20bef8327f52248a02dfc6c013afb90089d01519 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Tue, 30 Nov 2021 18:07:29 +0530 Subject: [PATCH 060/123] Update changelog --- CHANGELOG.md | 10 ---------- requirements/extra.txt | 2 +- 2 files changed, 1 insertion(+), 11 
deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0e45a8676d4dd..53f0105102fdb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,22 +10,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed - Fixed support for `--key.help=class` with the `LightningCLI` ([#10767](https://github.com/PyTorchLightning/pytorch-lightning/pull/10767)) - - - Fixed `_compare_version` for python packages ([#10762](https://github.com/PyTorchLightning/pytorch-lightning/pull/10762)) - - - Fixed TensorBoardLogger `SummaryWriter` not close before spawning the processes ([#10777](https://github.com/PyTorchLightning/pytorch-lightning/pull/10777)) - - - Fixed a consolidation error in Lite when attempting to save the state dict of a sharded optimizer ([#10746](https://github.com/PyTorchLightning/pytorch-lightning/pull/10746)) - - - Fixed the default logging level for batch hooks associated with training from `on_step=False, on_epoch=True` to `on_step=True, on_epoch=False` ([#10756](https://github.com/PyTorchLightning/pytorch-lightning/pull/10756)) - - ### Removed - Removed PyTorch 1.6 support ([#10367](https://github.com/PyTorchLightning/pytorch-lightning/pull/10367), [#10738](https://github.com/PyTorchLightning/pytorch-lightning/pull/10738)) diff --git a/requirements/extra.txt b/requirements/extra.txt index 6abf3089b8506..babaffca6280d 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -5,6 +5,6 @@ horovod>=0.21.2 # no need to install with [pytorch] as pytorch is already insta torchtext>=0.8.* omegaconf>=2.0.5 hydra-core>=1.0.5 -jsonargparse[signatures]>=4.0.0 +jsonargparse[signatures]>=4.0.4 gcsfs>=2021.5.0 rich>=10.2.2 From c935ef7dde151cae0d302f65601ecf2e1ef1f910 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Wed, 1 Dec 2021 18:22:28 +0530 Subject: [PATCH 061/123] update version --- pytorch_lightning/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/__about__.py 
b/pytorch_lightning/__about__.py index 1fa67b89ab062..15d57a2617fed 100644 --- a/pytorch_lightning/__about__.py +++ b/pytorch_lightning/__about__.py @@ -1,7 +1,7 @@ import time _this_year = time.strftime("%Y") -__version__ = "1.5.4" +__version__ = "1.5.5" __author__ = "William Falcon et al." __author_email__ = "waf2107@columbia.edu" __license__ = "Apache-2.0" From 8c1788cd04ed1d5136b39da7065c102db7d89966 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Wed, 1 Dec 2021 02:17:05 +0530 Subject: [PATCH 062/123] Disable batch_size extraction for torchmetric instances (#10815) Co-authored-by: Carlos Mocholi --- CHANGELOG.md | 7 ++- .../connectors/logger_connector/result.py | 16 +++--- .../trainer/logging_/test_logger_connector.py | 50 ++++++++++++++++++- .../logging_/test_train_loop_logging.py | 34 ------------- 4 files changed, 64 insertions(+), 43 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 53f0105102fdb..a6e8c77f17d33 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). +## [1.5.4] - 2021-12-07 + +### Fixed + +- Disabled batch_size extraction for torchmetric instances because they accumulate the metrics internally ([#10815](https://github.com/PyTorchLightning/pytorch-lightning/pull/10815)) + ## [1.5.4] - 2021-11-30 @@ -15,7 +21,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed a consolidation error in Lite when attempting to save the state dict of a sharded optimizer ([#10746](https://github.com/PyTorchLightning/pytorch-lightning/pull/10746)) - Fixed the default logging level for batch hooks associated with training from `on_step=False, on_epoch=True` to `on_step=True, on_epoch=False` ([#10756](https://github.com/PyTorchLightning/pytorch-lightning/pull/10756)) - ### Removed - Removed PyTorch 1.6 support ([#10367](https://github.com/PyTorchLightning/pytorch-lightning/pull/10367), [#10738](https://github.com/PyTorchLightning/pytorch-lightning/pull/10738)) diff --git a/pytorch_lightning/trainer/connectors/logger_connector/result.py b/pytorch_lightning/trainer/connectors/logger_connector/result.py index 1b82baf0440c9..428830e663e1a 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -335,12 +335,13 @@ class ResultMetricCollection(dict): with the same metadata. 
""" - def __init__(self, *args: Any) -> None: - super().__init__(*args) - @property def meta(self) -> _Metadata: - return list(self.values())[0].meta + return next(iter(self.values())).meta + + @property + def has_tensor(self) -> bool: + return any(v.is_tensor for v in self.values()) def __getstate__(self, drop_value: bool = False) -> dict: def getstate(item: ResultMetric) -> dict: @@ -400,7 +401,7 @@ def append_fn(v: ResultMetric) -> None: apply_to_collection(list(self.values()), ResultMetric, append_fn) return o - def _extract_batch_size(self, batch_size: Optional[int], meta: _Metadata) -> int: + def _extract_batch_size(self, value: _METRIC_COLLECTION, batch_size: Optional[int], meta: _Metadata) -> int: # check if we have extracted the batch size already if batch_size is None: batch_size = self.batch_size @@ -409,7 +410,8 @@ def _extract_batch_size(self, batch_size: Optional[int], meta: _Metadata) -> int return batch_size batch_size = 1 - if self.batch is not None and meta.on_epoch and meta.is_mean_reduction: + is_tensor = value.is_tensor if isinstance(value, ResultMetric) else value.has_tensor + if self.batch is not None and is_tensor and meta.on_epoch and meta.is_mean_reduction: try: batch_size = extract_batch_size(self.batch) self.batch_size = batch_size @@ -477,7 +479,7 @@ def log( f"You called `self.log({name}, ...)` twice in `{fx}` with different arguments. 
This is not allowed" ) - batch_size = self._extract_batch_size(batch_size, meta) + batch_size = self._extract_batch_size(self[key], batch_size, meta) self.update_metrics(key, value, batch_size) def register_key(self, key: str, meta: _Metadata, value: _METRIC_COLLECTION) -> None: diff --git a/tests/trainer/logging_/test_logger_connector.py b/tests/trainer/logging_/test_logger_connector.py index 7dae25df68d2b..ef36211eb3b70 100644 --- a/tests/trainer/logging_/test_logger_connector.py +++ b/tests/trainer/logging_/test_logger_connector.py @@ -17,7 +17,7 @@ import pytest import torch from torch.utils.data import DataLoader -from torchmetrics import Accuracy, AveragePrecision +from torchmetrics import Accuracy, AveragePrecision, MeanAbsoluteError, MeanSquaredError from pytorch_lightning import LightningModule from pytorch_lightning.callbacks.base import Callback @@ -637,3 +637,51 @@ def training_step(self, batch, batch_idx): # should not get overridden if logged manually assert trainer.logged_metrics == {"epoch": -1} + + +def test_result_collection_batch_size_extraction(): + fx_name = "training_step" + log_val = torch.tensor(7.0) + + results = ResultCollection(training=True, device="cpu") + results.batch = torch.randn(1, 4) + train_mse = MeanSquaredError() + train_mse(torch.randn(4, 5), torch.randn(4, 5)) + results.log(fx_name, "train_logs", {"mse": train_mse, "log_val": log_val}, on_step=False, on_epoch=True) + assert results.batch_size == 1 + assert isinstance(results["training_step.train_logs"]["mse"].value, MeanSquaredError) + assert results["training_step.train_logs"]["log_val"].value == log_val + + results = ResultCollection(training=True, device="cpu") + results.batch = torch.randn(1, 4) + results.log(fx_name, "train_log", log_val, on_step=False, on_epoch=True) + assert results.batch_size == 1 + assert results["training_step.train_log"].value == log_val + assert results["training_step.train_log"].cumulated_batch_size == 1 + + +def 
test_result_collection_no_batch_size_extraction(): + results = ResultCollection(training=True, device="cpu") + results.batch = torch.randn(1, 4) + fx_name = "training_step" + batch_size = 10 + log_val = torch.tensor(7.0) + + train_mae = MeanAbsoluteError() + train_mae(torch.randn(4, 5), torch.randn(4, 5)) + train_mse = MeanSquaredError() + train_mse(torch.randn(4, 5), torch.randn(4, 5)) + results.log(fx_name, "step_log_val", log_val, on_step=True, on_epoch=False) + results.log(fx_name, "epoch_log_val", log_val, on_step=False, on_epoch=True, batch_size=batch_size) + results.log(fx_name, "epoch_sum_log_val", log_val, on_step=True, on_epoch=True, reduce_fx="sum") + results.log(fx_name, "train_mae", train_mae, on_step=True, on_epoch=False) + results.log(fx_name, "train_mse", {"mse": train_mse}, on_step=True, on_epoch=False) + + assert results.batch_size is None + assert isinstance(results["training_step.train_mse"]["mse"].value, MeanSquaredError) + assert isinstance(results["training_step.train_mae"].value, MeanAbsoluteError) + assert results["training_step.step_log_val"].value == log_val + assert results["training_step.step_log_val"].cumulated_batch_size == 0 + assert results["training_step.epoch_log_val"].value == log_val * batch_size + assert results["training_step.epoch_log_val"].cumulated_batch_size == batch_size + assert results["training_step.epoch_sum_log_val"].value == log_val diff --git a/tests/trainer/logging_/test_train_loop_logging.py b/tests/trainer/logging_/test_train_loop_logging.py index 139714acc97bc..2ad2585f0fe02 100644 --- a/tests/trainer/logging_/test_train_loop_logging.py +++ b/tests/trainer/logging_/test_train_loop_logging.py @@ -27,7 +27,6 @@ from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, TQDMProgressBar from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.deprecated_api import no_warning_call from tests.helpers.boring_model 
import BoringModel, RandomDataset, RandomDictDataset from tests.helpers.runif import RunIf @@ -746,36 +745,3 @@ def validation_epoch_end(self, *_) -> None: train_data = DataLoader(RandomDataset(32, 64), batch_size=2) val_data = DataLoader(RandomDataset(32, 64), batch_size=2) trainer.fit(model, train_dataloaders=train_data, val_dataloaders=val_data) - - -def test_no_batch_size_extraction_with_specifying_explictly(tmpdir): - batch_size = BoringModel().train_dataloader().batch_size + 1 - fast_dev_run = 2 - log_val = 7 - - class CustomBoringModel(BoringModel): - def on_before_batch_transfer(self, batch, *args, **kwargs): - # This is an ambiguous batch which have multiple potential batch sizes - if self.trainer.training: - batch = {"batch1": torch.randn(batch_size, 10), "batch2": batch} - return batch - - def training_step(self, batch, batch_idx): - self.log("step_log_val", log_val, on_epoch=False) - self.log("epoch_log_val", log_val, batch_size=batch_size, on_step=False, on_epoch=True) - self.log("epoch_sum_log_val", log_val, on_epoch=True, reduce_fx="sum") - return super().training_step(batch["batch2"], batch_idx) - - def on_train_epoch_end(self, *args, **kwargs): - results = self.trainer._results - assert results["training_step.step_log_val"].value == log_val - assert results["training_step.step_log_val"].cumulated_batch_size == 0 - assert results["training_step.epoch_log_val"].value == log_val * batch_size * fast_dev_run - assert results["training_step.epoch_log_val"].cumulated_batch_size == batch_size * fast_dev_run - assert results["training_step.epoch_sum_log_val"].value == log_val * fast_dev_run - - model = CustomBoringModel() - trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=fast_dev_run) - - with no_warning_call(match="Trying to infer the `batch_size`"): - trainer.fit(model) From 439bea42aa00adc6636e1c5fecb1d3ce8644421a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 1 Dec 2021 09:48:37 +0100 Subject: [PATCH 063/123] Fix 
selection of standalone tests (#10857) Co-authored-by: Carlos Mocholi --- tests/standalone_tests.sh | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/tests/standalone_tests.sh b/tests/standalone_tests.sh index 49c608d53cfa1..7b7dd361ab0b1 100755 --- a/tests/standalone_tests.sh +++ b/tests/standalone_tests.sh @@ -28,9 +28,9 @@ files=$(echo "$grep_output" | cut -f1 -d: | sort | uniq) # get the list of parametrizations. we need to call them separately. the last two lines are removed. # note: if there's a syntax error, this will fail with some garbled output if [[ "$OSTYPE" == "darwin"* ]]; then - parametrizations=$(pytest $files --collect-only --quiet | tail -r | sed -e '1,3d' | tail -r) + parametrizations=$(pytest $files --collect-only --quiet "$@" | tail -r | sed -e '1,3d' | tail -r) else - parametrizations=$(pytest $files --collect-only --quiet | head -n -2) + parametrizations=$(pytest $files --collect-only --quiet "$@" | head -n -2) fi parametrizations_arr=($parametrizations) @@ -47,14 +47,6 @@ for i in "${!parametrizations_arr[@]}"; do continue fi - # STANDALONE_PATTERN allows filtering the tests to run when debugging. 
- # use as `STANDALONE_PATTERN="foo_bar" ./standalone_tests.sh` to run only those - # test with `foo_bar` in their name - if [[ $parametrization != *STANDALONE_PATTERN* ]]; then - report+="Skipped\t$parametrization\n" - continue - fi - # run the test echo "Running ${parametrization}" python ${defaults} "${parametrization}" From 9d6b1b6753cc29f1e5e550e28b9a92bdfd12f86e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 1 Dec 2021 15:39:22 +0100 Subject: [PATCH 064/123] Restore signals on teardown (#10611) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- CHANGELOG.md | 3 +- .../trainer/connectors/signal_connector.py | 63 ++++++++++++++++--- pytorch_lightning/trainer/trainer.py | 1 + tests/conftest.py | 21 ++++++- .../connectors/test_signal_connector.py | 22 +++++-- 5 files changed, 95 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a6e8c77f17d33..cd7c617cdf96e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Fixed -- Disabled batch_size extraction for torchmetric instances because they accumulate the metrics internally ([#10815](https://github.com/PyTorchLightning/pytorch-lightning/pull/10815)) +- Disabled batch_size extraction for torchmetric instances because they accumulate the metrics internally ([#10815](https://github.com/PyTorchLightning/pytorch-lightning/pull/10815)) +- Fixed an issue with `SignalConnector` not restoring the default signal handlers on teardown when running on SLURM or with fault-tolerant training enabled ([#10611](https://github.com/PyTorchLightning/pytorch-lightning/pull/10611)) ## [1.5.4] - 2021-11-30 diff --git a/pytorch_lightning/trainer/connectors/signal_connector.py b/pytorch_lightning/trainer/connectors/signal_connector.py index 7344f076c3972..07290edd47d3e 100644 --- a/pytorch_lightning/trainer/connectors/signal_connector.py +++ b/pytorch_lightning/trainer/connectors/signal_connector.py @@ -6,16 +6,18 @@ from signal import Signals from subprocess import call from types import FrameType, FunctionType -from typing import Callable, List, Union +from typing import Any, Callable, Dict, List, Set, Union import pytorch_lightning as pl -from pytorch_lightning.utilities.imports import _fault_tolerant_training +from pytorch_lightning.utilities.imports import _fault_tolerant_training, _IS_WINDOWS log = logging.getLogger(__name__) +_SIGNAL_HANDLER_DICT = Dict[Signals, Union[Callable[[Signals, FrameType], Any], int, None]] + class HandlersCompose: - def __init__(self, signal_handlers: Union[List[Callable], Callable]): + def __init__(self, signal_handlers: Union[List[Callable], Callable]) -> None: if not isinstance(signal_handlers, list): signal_handlers = [signal_handlers] self.signal_handlers = signal_handlers @@ -26,11 +28,14 @@ def __call__(self, signum: Signals, frame: FrameType) -> None: class SignalConnector: - def __init__(self, trainer: "pl.Trainer"): + def __init__(self, trainer: "pl.Trainer") -> None: self.trainer = trainer 
self.trainer._terminate_gracefully = False + self._original_handlers: _SIGNAL_HANDLER_DICT = {} def register_signal_handlers(self) -> None: + self._original_handlers = self._get_current_signal_handlers() + sigusr1_handlers: List[Callable] = [] sigterm_handlers: List[Callable] = [] @@ -43,7 +48,7 @@ def register_signal_handlers(self) -> None: sigterm_handlers.append(self.sigterm_handler_fn) # signal.SIGUSR1 doesn't seem available on windows - if not self._is_on_windows(): + if not _IS_WINDOWS: if sigusr1_handlers and not self._has_already_handler(signal.SIGUSR1): self._register_signal(signal.SIGUSR1, HandlersCompose(sigusr1_handlers)) @@ -87,6 +92,43 @@ def fault_tolerant_sigusr1_handler_fn(self, signum: Signals, frame: FrameType) - def sigterm_handler_fn(self, signum: Signals, frame: FrameType) -> None: log.info("bypassing sigterm") + def teardown(self) -> None: + """Restores the signals that were previously configured before :class:`SignalConnector` replaced them.""" + for signum, handler in self._original_handlers.items(): + if handler is not None: + signal.signal(signum, handler) + self._original_handlers = {} + + @staticmethod + def _get_current_signal_handlers() -> _SIGNAL_HANDLER_DICT: + """Collects the currently assigned signal handlers.""" + valid_signals = SignalConnector._valid_signals() + if not _IS_WINDOWS: + # SIGKILL and SIGSTOP are not allowed to be modified by the user + valid_signals -= {signal.SIGKILL, signal.SIGSTOP} + return {signum: signal.getsignal(signum) for signum in valid_signals} + + @staticmethod + def _valid_signals() -> Set[Signals]: + """Returns all valid signals supported on the current platform. + + Behaves identically to :func:`signal.valid_signals` in Python 3.8+ and implements the equivalent behavior for + older Python versions. 
+ """ + if sys.version_info >= (3, 8): + return signal.valid_signals() + elif _IS_WINDOWS: + # supported signals on Windows: https://docs.python.org/3/library/signal.html#signal.signal + return { + signal.SIGABRT, + signal.SIGFPE, + signal.SIGILL, + signal.SIGINT, + signal.SIGSEGV, + signal.SIGTERM, + } + return set(signal.Signals) + def _is_on_slurm(self) -> bool: # see if we're using slurm (not interactive) on_slurm = False @@ -100,10 +142,8 @@ def _is_on_slurm(self) -> bool: return on_slurm - def _is_on_windows(self) -> bool: - return sys.platform == "win32" - - def _has_already_handler(self, signum: Signals) -> bool: + @staticmethod + def _has_already_handler(signum: Signals) -> bool: try: return isinstance(signal.getsignal(signum), FunctionType) except AttributeError: @@ -113,3 +153,8 @@ def _has_already_handler(self, signum: Signals) -> bool: def _register_signal(signum: Signals, handlers: HandlersCompose) -> None: if threading.current_thread() is threading.main_thread(): signal.signal(signum, handlers) + + def __getstate__(self) -> Dict: + state = self.__dict__.copy() + state["_original_handlers"] = {} + return state diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index b4a3f97025701..2449f6b35bce4 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1264,6 +1264,7 @@ def _post_dispatch(self): self._data_connector.teardown() self._active_loop.teardown() self.logger_connector.teardown() + self.signal_connector.teardown() def _dispatch(self): if self.evaluating: diff --git a/tests/conftest.py b/tests/conftest.py index 3c1efcde4deb2..772061d8bbd3d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import os +import signal import sys import threading from functools import partial @@ -22,7 +23,8 @@ import torch.distributed from pytorch_lightning.plugins.environments.lightning_environment import find_free_network_port -from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_7, _TORCH_GREATER_EQUAL_1_8 +from pytorch_lightning.trainer.connectors.signal_connector import SignalConnector +from pytorch_lightning.utilities.imports import _IS_WINDOWS, _TORCH_GREATER_EQUAL_1_7, _TORCH_GREATER_EQUAL_1_8 from tests import _PATH_DATASETS @@ -81,6 +83,23 @@ def restore_env_variables(): assert not leaked_vars, f"test is leaking environment variable(s): {set(leaked_vars)}" +@pytest.fixture(scope="function", autouse=True) +def restore_signal_handlers(): + """Ensures that signal handlers get restored before the next test runs. + + This is a safety net for tests that don't run Trainer's teardown. + """ + valid_signals = SignalConnector._valid_signals() + if not _IS_WINDOWS: + # SIGKILL and SIGSTOP are not allowed to be modified by the user + valid_signals -= {signal.SIGKILL, signal.SIGSTOP} + handlers = {signum: signal.getsignal(signum) for signum in valid_signals} + yield + for signum, handler in handlers.items(): + if handler is not None: + signal.signal(signum, handler) + + @pytest.fixture(scope="function", autouse=True) def teardown_process_group(): """Ensures that the distributed process group gets closed before the next test runs.""" diff --git a/tests/trainer/connectors/test_signal_connector.py b/tests/trainer/connectors/test_signal_connector.py index e89365e2e2056..44657fd4d2d94 100644 --- a/tests/trainer/connectors/test_signal_connector.py +++ b/tests/trainer/connectors/test_signal_connector.py @@ -20,19 +20,33 @@ import pytest from pytorch_lightning import Trainer +from pytorch_lightning.plugins.environments import SLURMEnvironment +from pytorch_lightning.trainer.connectors.signal_connector import SignalConnector from 
pytorch_lightning.utilities.exceptions import ExitGracefullyException from tests.helpers import BoringModel from tests.helpers.runif import RunIf +@RunIf(skip_windows=True) +@mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) +def test_signal_handlers_restored_in_teardown(): + """Test that the SignalConnector restores the previously configured handler on teardown.""" + assert signal.getsignal(signal.SIGTERM) is signal.SIG_DFL + + trainer = Trainer(plugins=SLURMEnvironment()) + connector = SignalConnector(trainer) + connector.register_signal_handlers() + + assert signal.getsignal(signal.SIGUSR1) is not signal.SIG_DFL + connector.teardown() + assert signal.getsignal(signal.SIGUSR1) is signal.SIG_DFL + + @pytest.mark.parametrize("register_handler", [False, True]) @pytest.mark.parametrize("terminate_gracefully", [False, True]) -@RunIf(min_torch="1.7.0", skip_windows=True) +@RunIf(skip_windows=True) def test_fault_tolerant_sig_handler(register_handler, terminate_gracefully, tmpdir): - # hack to reset the signal - signal.signal(signal.SIGUSR1, 0) - if register_handler: def handler(*_): From e0a1f55659e3bb54b6ee6def5c9b5242f84c585c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 1 Dec 2021 16:04:31 +0100 Subject: [PATCH 065/123] Fix `SignalConnector._has_already_handler` check for callable type (#10483) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli Co-authored-by: thomas chaton Co-authored-by: Carlos Mocholi --- CHANGELOG.md | 1 + .../trainer/connectors/signal_connector.py | 7 ++---- .../connectors/test_signal_connector.py | 24 +++++++++++++++++++ 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cd7c617cdf96e..4f99d812d4d6c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Disabled batch_size extraction for torchmetric instances because they accumulate the metrics internally ([#10815](https://github.com/PyTorchLightning/pytorch-lightning/pull/10815)) - Fixed an issue with `SignalConnector` not restoring the default signal handlers on teardown when running on SLURM or with fault-tolerant training enabled ([#10611](https://github.com/PyTorchLightning/pytorch-lightning/pull/10611)) +- Fixed `SignalConnector._has_already_handler` check for callable type ([#10483](https://github.com/PyTorchLightning/pytorch-lightning/pull/10483)) ## [1.5.4] - 2021-11-30 diff --git a/pytorch_lightning/trainer/connectors/signal_connector.py b/pytorch_lightning/trainer/connectors/signal_connector.py index 07290edd47d3e..27b96f1e4cbcf 100644 --- a/pytorch_lightning/trainer/connectors/signal_connector.py +++ b/pytorch_lightning/trainer/connectors/signal_connector.py @@ -5,7 +5,7 @@ import threading from signal import Signals from subprocess import call -from types import FrameType, FunctionType +from types import FrameType from typing import Any, Callable, Dict, List, Set, Union import pytorch_lightning as pl @@ -144,10 +144,7 @@ def _is_on_slurm(self) -> bool: @staticmethod def _has_already_handler(signum: Signals) -> bool: - try: - return isinstance(signal.getsignal(signum), FunctionType) - except AttributeError: - return False + return signal.getsignal(signum) is not signal.SIG_DFL @staticmethod def _register_signal(signum: Signals, handlers: HandlersCompose) -> None: diff --git a/tests/trainer/connectors/test_signal_connector.py b/tests/trainer/connectors/test_signal_connector.py index 44657fd4d2d94..46ad48cc6276d 100644 --- a/tests/trainer/connectors/test_signal_connector.py +++ b/tests/trainer/connectors/test_signal_connector.py @@ -88,3 +88,27 @@ def test_signal_connector_in_thread(): with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor: for future in concurrent.futures.as_completed([executor.submit(_registering_signals)]): assert 
future.exception() is None + + +def signal_handler(): + pass + + +class SignalHandlers: + def signal_handler(self): + pass + + +@pytest.mark.parametrize( + ["handler", "expected_return"], + [ + (signal.Handlers.SIG_IGN, True), + (signal.Handlers.SIG_DFL, False), + (signal_handler, True), + (SignalHandlers().signal_handler, True), + ], +) +def test_has_already_handler(handler, expected_return): + """Test that the SignalConnector detects whether a signal handler is already attached.""" + with mock.patch("pytorch_lightning.trainer.connectors.signal_connector.signal.getsignal", return_value=handler): + assert SignalConnector._has_already_handler(signal.SIGTERM) is expected_return From 11012672433f35477f404ca6bc0d513add79087e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 1 Dec 2021 16:20:03 +0100 Subject: [PATCH 066/123] Skip hanging spawn tests (#10838) Co-authored-by: Carlos Mocholi --- .../plugins/training_type/ddp_spawn.py | 17 ---------------- .../plugins/training_type/tpu_spawn.py | 20 +++++++++++++++++++ tests/helpers/runif.py | 11 ++++++++++ tests/loggers/test_all.py | 2 +- tests/loggers/test_tensorboard.py | 15 -------------- tests/plugins/test_tpu_spawn.py | 16 +++++++++++++++ tests/utilities/test_all_gather_grad.py | 2 +- 7 files changed, 49 insertions(+), 34 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index 0b503db64e0a4..ff5159f739cdc 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -25,7 +25,6 @@ from torch.nn.parallel.distributed import DistributedDataParallel import pytorch_lightning as pl -from pytorch_lightning.loggers import LoggerCollection, TensorBoardLogger from pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.overrides.distributed import prepare_for_backward from pytorch_lightning.overrides.torch_distributed import 
broadcast_object_list @@ -171,17 +170,14 @@ def get_mp_spawn_kwargs(self, trainer: Optional["pl.Trainer"] = None) -> Dict[st return {"nprocs": self.num_processes} def start_training(self, trainer: "pl.Trainer") -> None: - self._clean_logger(trainer) self.spawn(self.new_process, trainer, self.mp_queue, return_result=False) # reset optimizers, since main process is never used for training and thus does not have a valid optim state trainer.optimizers = [] def start_evaluating(self, trainer: "pl.Trainer") -> None: - self._clean_logger(trainer) self.spawn(self.new_process, trainer, self.mp_queue, return_result=False) def start_predicting(self, trainer: "pl.Trainer") -> None: - self._clean_logger(trainer) self.spawn(self.new_process, trainer, self.mp_queue, return_result=False) def spawn(self, function: Callable, *args: Any, return_result: bool = True, **kwargs: Any) -> Optional[Any]: @@ -444,16 +440,3 @@ def teardown(self) -> None: self.lightning_module.cpu() # clean up memory torch.cuda.empty_cache() - - @staticmethod - def _clean_logger(trainer: "pl.Trainer") -> None: - loggers = trainer.logger._logger_iterable if isinstance(trainer.logger, LoggerCollection) else [trainer.logger] - for logger in loggers: - if isinstance(logger, TensorBoardLogger) and logger._experiment is not None: - rank_zero_warn( - "When using `ddp_spawn`, the `TensorBoardLogger` experiment should be `None`. Setting it to `None`." - ) - # the experiment class of `TensorBoard` holds a multiprocessing queue which can make ours hang. - # we want to make sure these are closed before we spawn our own threads. - # assuming nothing else references the experiment object, python should instantly `__del__` it. 
- logger._experiment = None diff --git a/pytorch_lightning/plugins/training_type/tpu_spawn.py b/pytorch_lightning/plugins/training_type/tpu_spawn.py index 4fa0cfda6a859..92bd0f06735d8 100644 --- a/pytorch_lightning/plugins/training_type/tpu_spawn.py +++ b/pytorch_lightning/plugins/training_type/tpu_spawn.py @@ -24,6 +24,7 @@ from torch.utils.data import DataLoader import pytorch_lightning as pl +from pytorch_lightning.loggers import LoggerCollection, TensorBoardLogger from pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.io.xla_plugin import XLACheckpointIO @@ -289,8 +290,17 @@ def start_training(self, trainer: "pl.Trainer") -> None: # todo: precision pluging is call in accelerator setup and should be moved if "XLA_USE_BF16" in os.environ: del os.environ["XLA_USE_BF16"] + self._clean_logger(trainer) return super().start_training(trainer) + def start_evaluating(self, trainer: "pl.Trainer") -> None: + self._clean_logger(trainer) + return super().start_evaluating(trainer) + + def start_predicting(self, trainer: "pl.Trainer") -> None: + self._clean_logger(trainer) + return super().start_predicting(trainer) + def training_step(self, *args, **kwargs): return self.model(*args, **kwargs) @@ -366,3 +376,13 @@ def checkpoint_io(self) -> CheckpointIO: @checkpoint_io.setter def checkpoint_io(self, plugin: CheckpointIO) -> None: raise MisconfigurationException("TPU Spawn Plugin currently does not support custom checkpoint plugins.") + + @staticmethod + def _clean_logger(trainer: "pl.Trainer") -> None: + loggers = trainer.logger._logger_iterable if isinstance(trainer.logger, LoggerCollection) else [trainer.logger] + for logger in loggers: + if isinstance(logger, TensorBoardLogger) and logger._experiment is not None: + # the experiment class of `TensorBoard` holds a multiprocessing queue which can make ours hang. 
+ # we want to make sure these are closed before we spawn our own threads. + # assuming nothing else references the experiment object, python should instantly `__del__` it. + logger._experiment = None diff --git a/tests/helpers/runif.py b/tests/helpers/runif.py index 4ad6942aa160a..07bd6438da125 100644 --- a/tests/helpers/runif.py +++ b/tests/helpers/runif.py @@ -71,6 +71,7 @@ def __new__( deepspeed: bool = False, rich: bool = False, skip_49370: bool = False, + skip_hanging_spawn: bool = False, **kwargs, ): """ @@ -93,6 +94,7 @@ def __new__( deepspeed: if `deepspeed` module is required to run the test rich: if `rich` module is required to run the test skip_49370: Skip the test as it's impacted by https://github.com/pytorch/pytorch/issues/49370. + skip_hanging_spawn: Skip the test as it's impacted by hanging loggers on spawn. kwargs: native pytest.mark.skipif keyword arguments """ conditions = [] @@ -178,6 +180,15 @@ def __new__( conditions.append(ge_3_9 and old_torch) reasons.append("Impacted by https://github.com/pytorch/pytorch/issues/49370") + if skip_hanging_spawn: + # strategy=ddp_spawn, accelerator=cpu, python>=3.8, torch<1.9 does not work + py_version = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}" + ge_3_8 = Version(py_version) >= Version("3.8") + torch_version = get_distribution("torch").version + old_torch = Version(torch_version) < Version("1.9") + conditions.append(ge_3_8 and old_torch) + reasons.append("Impacted by hanging DDP spawn") + reasons = [rs for cond, rs in zip(conditions, reasons) if cond] return pytest.mark.skipif( *args, condition=any(conditions), reason=f"Requires: [{' + '.join(reasons)}]", **kwargs diff --git a/tests/loggers/test_all.py b/tests/loggers/test_all.py index 370b24431b088..d66e77b4cea34 100644 --- a/tests/loggers/test_all.py +++ b/tests/loggers/test_all.py @@ -321,7 +321,7 @@ def on_train_batch_start(self, trainer, pl_module, batch, batch_idx): assert 
pl_module.logger.experiment.something(foo="bar") is None -@RunIf(skip_windows=True, skip_49370=True) +@RunIf(skip_windows=True, skip_49370=True, skip_hanging_spawn=True) @pytest.mark.parametrize("logger_class", [CometLogger, CSVLogger, MLFlowLogger, TensorBoardLogger, TestTubeLogger]) def test_logger_created_on_rank_zero_only(tmpdir, monkeypatch, logger_class): """Test that loggers get replaced by dummy loggers on global rank > 0.""" diff --git a/tests/loggers/test_tensorboard.py b/tests/loggers/test_tensorboard.py index 0a99c058ef941..02a809aa2ab30 100644 --- a/tests/loggers/test_tensorboard.py +++ b/tests/loggers/test_tensorboard.py @@ -25,7 +25,6 @@ from pytorch_lightning import Trainer from pytorch_lightning.loggers import TensorBoardLogger -from pytorch_lightning.loggers.base import LoggerCollection from pytorch_lightning.utilities.imports import _compare_version from tests.helpers import BoringModel @@ -333,17 +332,3 @@ def test_tensorboard_missing_folder_warning(tmpdir, caplog): assert logger.version == 0 assert "Missing logger folder:" in caplog.text - - -@pytest.mark.parametrize("use_list", [False, True]) -def test_tensorboard_ddp_spawn_cleanup(use_list, tmpdir): - tensorboard_logger = TensorBoardLogger(save_dir=tmpdir) - assert tensorboard_logger._experiment is None - tensorboard_logger.experiment # this property access will create the experiment - assert tensorboard_logger._experiment is not None - logger = [tensorboard_logger] if use_list else tensorboard_logger - trainer = Trainer(strategy="ddp_spawn", devices=2, accelerator="auto", logger=logger) - trainer.training_type_plugin._clean_logger(trainer) - if use_list: - assert isinstance(trainer.logger, LoggerCollection) - assert tensorboard_logger._experiment is None diff --git a/tests/plugins/test_tpu_spawn.py b/tests/plugins/test_tpu_spawn.py index 3f4ff354e39bb..5f4abf560d6a6 100644 --- a/tests/plugins/test_tpu_spawn.py +++ b/tests/plugins/test_tpu_spawn.py @@ -20,6 +20,7 @@ from torch.utils.data 
import DataLoader from pytorch_lightning import Trainer +from pytorch_lightning.loggers import LoggerCollection, TensorBoardLogger from pytorch_lightning.plugins.training_type import TPUSpawnPlugin from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers.boring_model import BoringModel, RandomDataset @@ -102,3 +103,18 @@ def test_model_tpu_one_core(): model = BoringModelTPU() trainer.fit(model) assert "PT_XLA_DEBUG" not in os.environ + + +@RunIf(tpu=True) +@pytest.mark.parametrize("use_list", [False, True]) +def test_tensorboard_ddp_spawn_cleanup(use_list, tmpdir): + tensorboard_logger = TensorBoardLogger(save_dir=tmpdir) + assert tensorboard_logger._experiment is None + tensorboard_logger.experiment # this property access will create the experiment + assert tensorboard_logger._experiment is not None + logger = [tensorboard_logger] if use_list else tensorboard_logger + trainer = Trainer(strategy="ddp_spawn", accelerator="tpu", devices="auto", logger=logger) + trainer.training_type_plugin._clean_logger(trainer) + if use_list: + assert isinstance(trainer.logger, LoggerCollection) + assert tensorboard_logger._experiment is None diff --git a/tests/utilities/test_all_gather_grad.py b/tests/utilities/test_all_gather_grad.py index 63c5c2cfe90fe..01ffd12a0ca62 100644 --- a/tests/utilities/test_all_gather_grad.py +++ b/tests/utilities/test_all_gather_grad.py @@ -41,7 +41,7 @@ def _test_all_gather_ddp(rank, world_size): assert torch.allclose(grad2, tensor2.grad) -@RunIf(skip_windows=True, skip_49370=True) +@RunIf(skip_windows=True, skip_49370=True, skip_hanging_spawn=True) def test_all_gather_ddp_spawn(): world_size = 3 torch.multiprocessing.spawn(_test_all_gather_ddp, args=(world_size,), nprocs=world_size) From 9b895de4a05cb50c41ce95a87221cb645e578b3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 1 Dec 2021 16:43:34 +0100 Subject: [PATCH 067/123] Raise exception if rich is less than 10.2.2 (#10839) --- 
CHANGELOG.md | 3 ++- pytorch_lightning/callbacks/progress/rich_progress.py | 8 +++++--- tests/callbacks/test_rich_progress_bar.py | 2 +- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4f99d812d4d6c..9167332653928 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,13 +4,14 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). -## [1.5.4] - 2021-12-07 +## [1.5.5] - 2021-12-07 ### Fixed - Disabled batch_size extraction for torchmetric instances because they accumulate the metrics internally ([#10815](https://github.com/PyTorchLightning/pytorch-lightning/pull/10815)) - Fixed an issue with `SignalConnector` not restoring the default signal handlers on teardown when running on SLURM or with fault-tolerant training enabled ([#10611](https://github.com/PyTorchLightning/pytorch-lightning/pull/10611)) - Fixed `SignalConnector._has_already_handler` check for callable type ([#10483](https://github.com/PyTorchLightning/pytorch-lightning/pull/10483)) +- Improved exception message if `rich` version is less than `10.2.2` ([#10839](https://github.com/PyTorchLightning/pytorch-lightning/pull/10839)) ## [1.5.4] - 2021-11-30 diff --git a/pytorch_lightning/callbacks/progress/rich_progress.py b/pytorch_lightning/callbacks/progress/rich_progress.py index 3c70bfe735f95..9cb2a91dde3af 100644 --- a/pytorch_lightning/callbacks/progress/rich_progress.py +++ b/pytorch_lightning/callbacks/progress/rich_progress.py @@ -17,7 +17,8 @@ from typing import Any, Optional, Union from pytorch_lightning.callbacks.progress.base import ProgressBarBase -from pytorch_lightning.utilities import _RICH_AVAILABLE +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.imports import _RICH_AVAILABLE Task, Style = None, None if _RICH_AVAILABLE: @@ -222,9 +223,10 @@ def __init__( theme: RichProgressBarTheme = 
RichProgressBarTheme(), ) -> None: if not _RICH_AVAILABLE: - raise ModuleNotFoundError( - "`RichProgressBar` requires `rich` to be installed. Install it by running `pip install -U rich`." + raise MisconfigurationException( + "`RichProgressBar` requires `rich` >= 10.2.2. Install it by running `pip install -U rich`." ) + super().__init__() self._refresh_rate_per_second: int = refresh_rate_per_second self._leave: bool = leave diff --git a/tests/callbacks/test_rich_progress_bar.py b/tests/callbacks/test_rich_progress_bar.py index 8f3f20630b5c0..f4dc5421539e8 100644 --- a/tests/callbacks/test_rich_progress_bar.py +++ b/tests/callbacks/test_rich_progress_bar.py @@ -85,7 +85,7 @@ def predict_dataloader(self): def test_rich_progress_bar_import_error(): if not _RICH_AVAILABLE: - with pytest.raises(ImportError, match="`RichProgressBar` requires `rich` to be installed."): + with pytest.raises(ImportError, match="`RichProgressBar` requires `rich` >= 10.2.2."): Trainer(callbacks=RichProgressBar()) From 5820711006077faf8ddf23cac0b4dba467d20b9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Jankowski?= Date: Wed, 1 Dec 2021 14:58:54 +0100 Subject: [PATCH 068/123] Fixed uploading best model checkpoint in NeptuneLogger (#10369) --- CHANGELOG.md | 3 +++ pytorch_lightning/loggers/neptune.py | 18 ++++++++++----- tests/loggers/test_neptune.py | 33 +++++++++++++++++----------- 3 files changed, 35 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9167332653928..2e2b40460b0c4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Improved exception message if `rich` version is less than `10.2.2` ([#10839](https://github.com/PyTorchLightning/pytorch-lightning/pull/10839)) +- Fixed uploading best model checkpoint in NeptuneLogger ([#10369](https://github.com/PyTorchLightning/pytorch-lightning/pull/10369)) + + ## [1.5.4] - 2021-11-30 ### Fixed diff --git a/pytorch_lightning/loggers/neptune.py b/pytorch_lightning/loggers/neptune.py index f7c611ed787ce..21fbd4a79ea0e 100644 --- a/pytorch_lightning/loggers/neptune.py +++ b/pytorch_lightning/loggers/neptune.py @@ -523,6 +523,16 @@ def after_save_checkpoint(self, checkpoint_callback: "ReferenceType[ModelCheckpo file_names.add(model_name) self.experiment[f"{checkpoints_namespace}/{model_name}"].upload(key) + # log best model path and checkpoint + if checkpoint_callback.best_model_path: + self.experiment[ + self._construct_path_with_prefix("model/best_model_path") + ] = checkpoint_callback.best_model_path + + model_name = self._get_full_model_name(checkpoint_callback.best_model_path, checkpoint_callback) + file_names.add(model_name) + self.experiment[f"{checkpoints_namespace}/{model_name}"].upload(checkpoint_callback.best_model_path) + # remove old models logged to experiment if they are not part of best k models at this point if self.experiment.exists(checkpoints_namespace): exp_structure = self.experiment.get_structure() @@ -531,11 +541,7 @@ def after_save_checkpoint(self, checkpoint_callback: "ReferenceType[ModelCheckpo for file_to_drop in list(uploaded_model_names - file_names): del self.experiment[f"{checkpoints_namespace}/{file_to_drop}"] - # log best model path and best model score - if checkpoint_callback.best_model_path: - self.experiment[ - self._construct_path_with_prefix("model/best_model_path") - ] = checkpoint_callback.best_model_path + # log best model score if checkpoint_callback.best_model_score: self.experiment[self._construct_path_with_prefix("model/best_model_score")] = ( 
checkpoint_callback.best_model_score.cpu().detach().numpy() @@ -544,7 +550,7 @@ def after_save_checkpoint(self, checkpoint_callback: "ReferenceType[ModelCheckpo @staticmethod def _get_full_model_name(model_path: str, checkpoint_callback: "ReferenceType[ModelCheckpoint]") -> str: """Returns model name which is string `modle_path` appended to `checkpoint_callback.dirpath`.""" - expected_model_path = f"{checkpoint_callback.dirpath}/" + expected_model_path = f"{checkpoint_callback.dirpath}{os.path.sep}" if not model_path.startswith(expected_model_path): raise ValueError(f"{model_path} was expected to start with {expected_model_path}.") return model_path[len(expected_model_path) :] diff --git a/tests/loggers/test_neptune.py b/tests/loggers/test_neptune.py index 6eec5fffcf5b5..6238b408c1e6e 100644 --- a/tests/loggers/test_neptune.py +++ b/tests/loggers/test_neptune.py @@ -276,14 +276,15 @@ def test_after_save_checkpoint(self, neptune): logger, run_instance_mock, run_attr_mock = self._get_logger_with_mocks( api_key="test", project="project", **prefix ) + models_root_dir = os.path.join("path", "to", "models") cb_mock = MagicMock( - dirpath="path/to/models", - last_model_path="path/to/models/last", + dirpath=models_root_dir, + last_model_path=os.path.join(models_root_dir, "last"), best_k_models={ - "path/to/models/model1": None, - "path/to/models/model2/with/slashes": None, + f"{os.path.join(models_root_dir, 'model1')}": None, + f"{os.path.join(models_root_dir, 'model2/with/slashes')}": None, }, - best_model_path="path/to/models/best_model", + best_model_path=os.path.join(models_root_dir, "best_model"), best_model_score=None, ) @@ -292,19 +293,21 @@ def test_after_save_checkpoint(self, neptune): # then: self.assertEqual(run_instance_mock.__setitem__.call_count, 1) - self.assertEqual(run_instance_mock.__getitem__.call_count, 3) - self.assertEqual(run_attr_mock.upload.call_count, 3) + self.assertEqual(run_instance_mock.__getitem__.call_count, 4) + 
self.assertEqual(run_attr_mock.upload.call_count, 4) run_instance_mock.__setitem__.assert_called_once_with( - f"{model_key_prefix}/best_model_path", "path/to/models/best_model" + f"{model_key_prefix}/best_model_path", os.path.join(models_root_dir, "best_model") ) run_instance_mock.__getitem__.assert_any_call(f"{model_key_prefix}/checkpoints/last") run_instance_mock.__getitem__.assert_any_call(f"{model_key_prefix}/checkpoints/model1") run_instance_mock.__getitem__.assert_any_call(f"{model_key_prefix}/checkpoints/model2/with/slashes") + run_instance_mock.__getitem__.assert_any_call(f"{model_key_prefix}/checkpoints/best_model") run_attr_mock.upload.assert_has_calls( [ - call("path/to/models/last"), - call("path/to/models/model1"), - call("path/to/models/model2/with/slashes"), + call(os.path.join(models_root_dir, "last")), + call(os.path.join(models_root_dir, "model1")), + call(os.path.join(models_root_dir, "model2/with/slashes")), + call(os.path.join(models_root_dir, "best_model")), ] ) @@ -394,8 +397,12 @@ def test__get_full_model_name(self): # given: SimpleCheckpoint = namedtuple("SimpleCheckpoint", ["dirpath"]) test_input_data = [ - ("key.ext", "foo/bar/key.ext", SimpleCheckpoint(dirpath="foo/bar")), - ("key/in/parts.ext", "foo/bar/key/in/parts.ext", SimpleCheckpoint(dirpath="foo/bar")), + ("key.ext", os.path.join("foo", "bar", "key.ext"), SimpleCheckpoint(dirpath=os.path.join("foo", "bar"))), + ( + "key/in/parts.ext", + os.path.join("foo", "bar", "key/in/parts.ext"), + SimpleCheckpoint(dirpath=os.path.join("foo", "bar")), + ), ] # expect: From c38fbad7d433215efa88cb95384ee2a201b7e2d8 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Thu, 2 Dec 2021 14:22:49 +0530 Subject: [PATCH 069/123] Fix schedule reset logic in pytorch profiler (#10837) --- CHANGELOG.md | 9 ++++++ .../loops/dataloader/evaluation_loop.py | 7 +++-- .../loops/dataloader/prediction_loop.py | 5 +--- pytorch_lightning/profiler/pytorch.py | 25 ++++++++++++---- pytorch_lightning/trainer/data_loading.py 
| 17 +++++++---- tests/profiler/test_profiler.py | 30 ++++++++++++++++++- 6 files changed, 74 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2e2b40460b0c4..c64be1885d322 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,15 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed uploading best model checkpoint in NeptuneLogger ([#10369](https://github.com/PyTorchLightning/pytorch-lightning/pull/10369)) +- Fixed early schedule reset logic in PyTorch profiler that was causing data leak ([#10837](https://github.com/PyTorchLightning/pytorch-lightning/pull/10837)) + + +- + + +- + + ## [1.5.4] - 2021-11-30 ### Fixed diff --git a/pytorch_lightning/loops/dataloader/evaluation_loop.py b/pytorch_lightning/loops/dataloader/evaluation_loop.py index 6140bd60d6a7f..b3291fcc21890 100644 --- a/pytorch_lightning/loops/dataloader/evaluation_loop.py +++ b/pytorch_lightning/loops/dataloader/evaluation_loop.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Any, List, Optional, Sequence, Union +from typing import Any, List, Sequence from deprecate.utils import void from torch.utils.data.dataloader import DataLoader @@ -32,7 +32,8 @@ def __init__(self): self.epoch_loop = EvaluationEpochLoop() self._results = ResultCollection(training=False) - self._max_batches: Optional[Union[int, Sequence[int]]] = None + self._outputs: List[EPOCH_OUTPUT] = [] + self._max_batches: List[int] = [] self._has_run: bool = False @property @@ -147,7 +148,7 @@ def teardown(self) -> None: self._results.cpu() self.epoch_loop.teardown() - def _get_max_batches(self) -> List[Union[int, float]]: + def _get_max_batches(self) -> List[int]: """Returns the max number of batches for each dataloader.""" if self.trainer.testing: max_batches = self.trainer.num_test_batches diff --git a/pytorch_lightning/loops/dataloader/prediction_loop.py b/pytorch_lightning/loops/dataloader/prediction_loop.py index cf40316312107..58fee7743c1e5 100644 --- a/pytorch_lightning/loops/dataloader/prediction_loop.py +++ b/pytorch_lightning/loops/dataloader/prediction_loop.py @@ -53,10 +53,7 @@ def num_dataloaders(self) -> int: @property def max_batches(self) -> List[int]: """The max number of batches this loop will run for each dataloader.""" - max_batches = self.trainer.num_predict_batches - if isinstance(max_batches, int): - max_batches = [max_batches] * len(self.dataloaders) - return max_batches + return self.trainer.num_predict_batches @property def dataloaders(self) -> Sequence[DataLoader]: diff --git a/pytorch_lightning/profiler/pytorch.py b/pytorch_lightning/profiler/pytorch.py index 58f4a18895498..92bb9965dac4a 100644 --- a/pytorch_lightning/profiler/pytorch.py +++ b/pytorch_lightning/profiler/pytorch.py @@ -335,9 +335,24 @@ def _init_kineto(self, profiler_kwargs: Any) -> None: with_stack = profiler_kwargs.get("with_stack", False) or self._export_to_flame_graph self._profiler_kwargs["with_stack"] = with_stack + @property + def _total_steps(self) -> 
int: + trainer = self._lightning_module.trainer + if self._schedule.is_training: + return trainer.num_training_batches + if self._schedule._current_action == "validation_step": + return sum(trainer.num_val_batches) + sum(trainer.num_sanity_val_batches) + if self._schedule._current_action == "test_step": + return sum(trainer.num_test_batches) + if self._schedule._current_action == "predict_step": + return sum(trainer.num_predict_batches) + def _should_override_schedule(self) -> bool: - return (self._lightning_module is not None and self._lightning_module.trainer.limit_train_batches < 5) and ( - self._schedule is not None and self._schedule._schedule == self._default_schedule() + return ( + self._lightning_module is not None + and self._schedule is not None + and self._total_steps < 5 + and self._schedule._schedule == self._default_schedule() ) @staticmethod @@ -410,6 +425,9 @@ def stop(self, action_name: str) -> None: action_name in self.STEP_FUNCTIONS or action_name.startswith(self.STEP_FUNCTION_PREFIX) ): + if self._schedule is not None: + self._schedule.pre_step(action_name) + # the default schedule requires a minimum of 5 steps to properly work: `wait=1, warmup=1, active=3`. # otherwise, this will raise a `segmentation fault`. 
if self._should_override_schedule(): @@ -420,9 +438,6 @@ def stop(self, action_name: str) -> None: self._schedule = None self.profiler.schedule = torch.profiler.profiler._default_schedule_fn - if self._schedule is not None: - self._schedule.pre_step(action_name) - def on_trace_ready(profiler): if self.dirpath is not None: if self._export_to_chrome: diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 19cc3c4a38371..b82f9dc319165 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -52,13 +52,18 @@ class TrainerDataLoadingMixin(ABC): val_check_interval: float tpu_local_core_rank: int train_dataloader: DataLoader - num_training_batches: Union[int, float] - val_check_batch: float - val_dataloaders: Optional[List[DataLoader]] - num_val_batches: List[Union[int, float]] - test_dataloaders: Optional[List[DataLoader]] - num_test_batches: List[Union[int, float]] limit_train_batches: Union[int, float] + num_training_batches: int + val_check_batch: float + val_dataloaders: List[DataLoader] + limit_val_batches: Union[int, float] + num_val_batches: List[int] + test_dataloaders: List[DataLoader] + limit_test_batches: Union[int, float] + num_test_batches: List[int] + predict_dataloaders: List[DataLoader] + limit_predict_batches: Union[int, float] + num_predict_batches: List[int] log_every_n_steps: int overfit_batches: Union[int, float] distributed_sampler_kwargs: dict diff --git a/tests/profiler/test_profiler.py b/tests/profiler/test_profiler.py index 708062ab64490..15cf40e924c68 100644 --- a/tests/profiler/test_profiler.py +++ b/tests/profiler/test_profiler.py @@ -25,7 +25,7 @@ from pytorch_lightning.loggers.base import LoggerCollection from pytorch_lightning.loggers.tensorboard import TensorBoardLogger from pytorch_lightning.profiler import AdvancedProfiler, PassThroughProfiler, PyTorchProfiler, SimpleProfiler -from pytorch_lightning.profiler.pytorch import 
RegisterRecordFunction +from pytorch_lightning.profiler.pytorch import RegisterRecordFunction, warning_cache from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_7 from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _KINETO_AVAILABLE @@ -523,3 +523,31 @@ def test_trainer_profiler_incorrect_str_arg(): match=r"When passing string value for the `profiler` parameter of `Trainer`, it can only be one of.*", ): Trainer(profiler="unknown_profiler") + + +@pytest.mark.skipif(not _KINETO_AVAILABLE, reason="Requires PyTorch Profiler Kineto") +@pytest.mark.parametrize( + ["trainer_config", "trainer_fn"], + [ + ({"limit_train_batches": 4, "limit_val_batches": 7}, "fit"), + ({"limit_train_batches": 7, "limit_val_batches": 4, "num_sanity_val_steps": 0}, "fit"), + ( + { + "limit_train_batches": 7, + "limit_val_batches": 2, + }, + "fit", + ), + ({"limit_val_batches": 4}, "validate"), + ({"limit_test_batches": 4}, "test"), + ({"limit_predict_batches": 4}, "predict"), + ], +) +def test_pytorch_profiler_raises_warning_for_limited_steps(tmpdir, trainer_config, trainer_fn): + model = BoringModel() + trainer = Trainer(default_root_dir=tmpdir, profiler="pytorch", max_epochs=1, **trainer_config) + warning_cache.clear() + with pytest.warns(UserWarning, match="not enough steps to properly record traces"): + getattr(trainer, trainer_fn)(model) + assert trainer.profiler._schedule is None + warning_cache.clear() From c1184dcf342b47b25522ec37b48783ff80289083 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 2 Dec 2021 12:07:51 +0100 Subject: [PATCH 070/123] Fix retrieval of batch indices when dataloader num_workers > 0 (#10870) Co-authored-by: Rohit Gupta --- CHANGELOG.md | 11 +- .../loops/epoch/prediction_epoch_loop.py | 38 +++---- pytorch_lightning/overrides/distributed.py | 26 ++++- tests/callbacks/test_prediction_writer.py | 102 +++++++++++++----- 
tests/deprecated_api/test_remove_1-7.py | 11 ++ tests/overrides/test_distributed.py | 4 +- 6 files changed, 129 insertions(+), 63 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c64be1885d322..57b3ed7e97a96 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,18 +12,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed an issue with `SignalConnector` not restoring the default signal handlers on teardown when running on SLURM or with fault-tolerant training enabled ([#10611](https://github.com/PyTorchLightning/pytorch-lightning/pull/10611)) - Fixed `SignalConnector._has_already_handler` check for callable type ([#10483](https://github.com/PyTorchLightning/pytorch-lightning/pull/10483)) - Improved exception message if `rich` version is less than `10.2.2` ([#10839](https://github.com/PyTorchLightning/pytorch-lightning/pull/10839)) - - - Fixed uploading best model checkpoint in NeptuneLogger ([#10369](https://github.com/PyTorchLightning/pytorch-lightning/pull/10369)) - - - Fixed early schedule reset logic in PyTorch profiler that was causing data leak ([#10837](https://github.com/PyTorchLightning/pytorch-lightning/pull/10837)) - - -- - - -- +- Fixed a bug that caused incorrect batch indices to be passed to the `BasePredictionWriter` hooks when using a dataloader with `num_workers > 0` ([#10870](https://github.com/PyTorchLightning/pytorch-lightning/pull/10870)) ## [1.5.4] - 2021-11-30 diff --git a/pytorch_lightning/loops/epoch/prediction_epoch_loop.py b/pytorch_lightning/loops/epoch/prediction_epoch_loop.py index 58e65233dfe81..e5fa46fe05836 100644 --- a/pytorch_lightning/loops/epoch/prediction_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/prediction_epoch_loop.py @@ -26,7 +26,7 @@ def __init__(self) -> None: self._dl_max_batches: Optional[int] = None self._num_dataloaders: Optional[int] = None self._warning_cache = WarningCache() - self._all_batch_indices: List[int] = [] + self._seen_batch_indices: List[List[int]] = 
[] @property def done(self) -> bool: @@ -44,8 +44,8 @@ def connect(self, **kwargs: "Loop") -> None: def reset(self) -> None: """Resets the loops internal state.""" - self._all_batch_indices: List[int] = [] - self.predictions: List[Any] = [] + self._seen_batch_indices = [] + self.predictions = [] self.batch_progress.reset_on_run() def on_run_start( @@ -68,6 +68,7 @@ def on_run_start( void(dataloader_iter, dataloader_idx) self._dl_max_batches = dl_max_batches self._num_dataloaders = num_dataloaders + self._seen_batch_indices = self._get_batch_indices(dataloader_idx) self.return_predictions = return_predictions def advance( @@ -88,6 +89,10 @@ def advance( return_predictions: whether to return the obtained predictions """ batch_idx, batch = next(dataloader_iter) + self._seen_batch_indices = self._get_batch_indices(dataloader_idx) + # we need to truncate the list of batch indicies due to prefetching in the dataloader and Lightning + self._seen_batch_indices = self._seen_batch_indices[: (self.batch_progress.current.completed + 1)] + if batch is None: raise StopIteration @@ -99,13 +104,10 @@ def advance( with self.trainer.profiler.profile("predict_step"): self._predict_step(batch, batch_idx, dataloader_idx) - def on_run_end(self) -> Tuple[List[Any], List[int]]: + def on_run_end(self) -> Tuple[List[Any], List[List[int]]]: """Returns the predictions and the corresponding batch indices.""" - predictions = self.predictions - all_batch_indices = self._all_batch_indices - # free memory - self.predictions = [] - self._all_batch_indices = [] + predictions, all_batch_indices = self.predictions, self._seen_batch_indices + self.predictions, self._seen_batch_indices = [], [] # free memory return predictions, all_batch_indices def _predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int) -> None: @@ -121,7 +123,7 @@ def _predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int) -> None step_kwargs = self._build_kwargs(batch, batch_idx, dataloader_idx) # extract 
batch_indices and store them - self._store_batch_indices(dataloader_idx) + self.current_batch_indices = self._seen_batch_indices[batch_idx] if self._seen_batch_indices else [] model_ref = self.trainer.lightning_module @@ -160,12 +162,12 @@ def _build_kwargs(self, batch: Any, batch_idx: int, dataloader_idx: int) -> Dict step_kwargs["dataloader_idx"] = dataloader_idx return step_kwargs - def _store_batch_indices(self, dataloader_idx: int) -> None: - """Stores the batch indices if the predictions should be stored.""" + def _get_batch_indices(self, dataloader_idx: int) -> List[List[int]]: + """Returns a reference to the seen batch indices if the dataloader has a batch sampler wrapped by our + :class:`~pytorch_lightning.overrides.distributed.IndexBatchSamplerWrapper`.""" batch_sampler = self.trainer.predict_dataloaders[dataloader_idx].batch_sampler - if isinstance(batch_sampler, IndexBatchSamplerWrapper): - self.current_batch_indices = batch_sampler.batch_indices - if self.should_store_predictions: - self._all_batch_indices.append(batch_sampler.batch_indices) - else: - warning_cache.warn("Lightning couldn't infer the indices fetched for your dataloader.") + if isinstance(batch_sampler, IndexBatchSamplerWrapper) and self.should_store_predictions: + return batch_sampler.seen_batch_indices + + warning_cache.warn("Lightning couldn't infer the indices fetched for your dataloader.") + return [] diff --git a/pytorch_lightning/overrides/distributed.py b/pytorch_lightning/overrides/distributed.py index 0cf392dd44775..835d7f87040c1 100644 --- a/pytorch_lightning/overrides/distributed.py +++ b/pytorch_lightning/overrides/distributed.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import itertools -from typing import Any, Iterator, List, Optional +from typing import Any, Iterator, List import torch from torch.nn.parallel import DistributedDataParallel @@ -20,6 +20,7 @@ import pytorch_lightning as pl from pytorch_lightning.overrides.base import _LightningModuleWrapperBase +from pytorch_lightning.utilities import rank_zero_deprecation class LightningDistributedModule(_LightningModuleWrapperBase): @@ -119,12 +120,31 @@ class IndexBatchSamplerWrapper: """This class is used to wrap a :class:`torch.utils.data.BatchSampler` and capture its indices.""" def __init__(self, sampler: BatchSampler) -> None: + self.seen_batch_indices: List[List[int]] = [] self._sampler = sampler - self.batch_indices: Optional[List[int]] = None + self._batch_indices: List[int] = [] + + @property + def batch_indices(self) -> List[int]: + rank_zero_deprecation( + "The attribute `IndexBatchSamplerWrapper.batch_indices` was deprecated in v1.5 and will be removed in" + " v1.7. Access the full list `seen_batch_indices` instead." + ) + return self._batch_indices + + @batch_indices.setter + def batch_indices(self, indices: List[int]) -> None: + rank_zero_deprecation( + "The attribute `IndexBatchSamplerWrapper.batch_indices` was deprecated in v1.5 and will be removed in" + " v1.7. Access the full list `seen_batch_indices` instead." + ) + self._batch_indices = indices def __iter__(self) -> Iterator[List[int]]: + self.seen_batch_indices = [] for batch in self._sampler: - self.batch_indices = batch + self._batch_indices = batch + self.seen_batch_indices.append(batch) yield batch def __len__(self) -> int: diff --git a/tests/callbacks/test_prediction_writer.py b/tests/callbacks/test_prediction_writer.py index 75e0dbd31ec79..2cd3738ca875f 100644 --- a/tests/callbacks/test_prediction_writer.py +++ b/tests/callbacks/test_prediction_writer.py @@ -11,54 +11,98 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. +from unittest.mock import ANY, call, Mock import pytest +from torch.utils.data import DataLoader from pytorch_lightning import Trainer from pytorch_lightning.callbacks import BasePredictionWriter from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.helpers import BoringModel +from tests.helpers import BoringModel, RandomDataset +from tests.helpers.runif import RunIf -def test_prediction_writer(tmpdir): - class CustomPredictionWriter(BasePredictionWriter): - def __init__(self, writer_interval: str): - super().__init__(writer_interval) +class DummyPredictionWriter(BasePredictionWriter): + def write_on_batch_end(self, *args, **kwargs): + pass - self.write_on_batch_end_called = False - self.write_on_epoch_end_called = False + def write_on_epoch_end(self, *args, **kwargs): + pass - def write_on_batch_end(self, *args, **kwargs): - self.write_on_batch_end_called = True - - def write_on_epoch_end(self, *args, **kwargs): - self.write_on_epoch_end_called = True +def test_prediction_writer_invalid_write_interval(): + """Test that configuring an unknown interval name raises an error.""" with pytest.raises(MisconfigurationException, match=r"`write_interval` should be one of \['batch"): - CustomPredictionWriter("something") + DummyPredictionWriter("something") + + +def test_prediction_writer_hook_call_intervals(tmpdir): + """Test that the `write_on_batch_end` and `write_on_epoch_end` hooks get invoked based on the defined + interval.""" + DummyPredictionWriter.write_on_batch_end = Mock() + DummyPredictionWriter.write_on_epoch_end = Mock() + + dataloader = DataLoader(RandomDataset(32, 64)) model = BoringModel() - cb = CustomPredictionWriter("batch_and_epoch") + cb = DummyPredictionWriter("batch_and_epoch") trainer = Trainer(limit_predict_batches=4, callbacks=cb) - results = trainer.predict(model, dataloaders=model.train_dataloader()) + 
results = trainer.predict(model, dataloaders=dataloader) assert len(results) == 4 - assert cb.write_on_batch_end_called - assert cb.write_on_epoch_end_called + assert cb.write_on_batch_end.call_count == 4 + assert cb.write_on_epoch_end.call_count == 1 - cb = CustomPredictionWriter("batch_and_epoch") + DummyPredictionWriter.write_on_batch_end.reset_mock() + DummyPredictionWriter.write_on_epoch_end.reset_mock() + + cb = DummyPredictionWriter("batch_and_epoch") trainer = Trainer(limit_predict_batches=4, callbacks=cb) - trainer.predict(model, dataloaders=model.train_dataloader(), return_predictions=False) - assert cb.write_on_batch_end_called - assert cb.write_on_epoch_end_called + trainer.predict(model, dataloaders=dataloader, return_predictions=False) + assert cb.write_on_batch_end.call_count == 4 + assert cb.write_on_epoch_end.call_count == 1 + + DummyPredictionWriter.write_on_batch_end.reset_mock() + DummyPredictionWriter.write_on_epoch_end.reset_mock() - cb = CustomPredictionWriter("batch") + cb = DummyPredictionWriter("batch") trainer = Trainer(limit_predict_batches=4, callbacks=cb) - trainer.predict(model, dataloaders=model.train_dataloader(), return_predictions=False) - assert cb.write_on_batch_end_called - assert not cb.write_on_epoch_end_called + trainer.predict(model, dataloaders=dataloader, return_predictions=False) + assert cb.write_on_batch_end.call_count == 4 + assert cb.write_on_epoch_end.call_count == 0 + + DummyPredictionWriter.write_on_batch_end.reset_mock() + DummyPredictionWriter.write_on_epoch_end.reset_mock() - cb = CustomPredictionWriter("epoch") + cb = DummyPredictionWriter("epoch") trainer = Trainer(limit_predict_batches=4, callbacks=cb) - trainer.predict(model, dataloaders=model.train_dataloader(), return_predictions=False) - assert not cb.write_on_batch_end_called - assert cb.write_on_epoch_end_called + trainer.predict(model, dataloaders=dataloader, return_predictions=False) + assert cb.write_on_batch_end.call_count == 0 + assert 
cb.write_on_epoch_end.call_count == 1 + + +@pytest.mark.parametrize("num_workers", [0, pytest.param(2, marks=RunIf(slow=True))]) +def test_prediction_writer_batch_indices(tmpdir, num_workers): + DummyPredictionWriter.write_on_batch_end = Mock() + DummyPredictionWriter.write_on_epoch_end = Mock() + + dataloader = DataLoader(RandomDataset(32, 64), batch_size=4, num_workers=num_workers) + model = BoringModel() + writer = DummyPredictionWriter("batch_and_epoch") + trainer = Trainer(limit_predict_batches=4, callbacks=writer) + trainer.predict(model, dataloaders=dataloader) + + writer.write_on_batch_end.assert_has_calls( + [ + call(trainer, model, ANY, [0, 1, 2, 3], ANY, 0, 0), + call(trainer, model, ANY, [4, 5, 6, 7], ANY, 1, 0), + call(trainer, model, ANY, [8, 9, 10, 11], ANY, 2, 0), + call(trainer, model, ANY, [12, 13, 14, 15], ANY, 3, 0), + ] + ) + + writer.write_on_epoch_end.assert_has_calls( + [ + call(trainer, model, ANY, [[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]]), + ] + ) diff --git a/tests/deprecated_api/test_remove_1-7.py b/tests/deprecated_api/test_remove_1-7.py index e0e51575f5f22..62ec4d8d5490a 100644 --- a/tests/deprecated_api/test_remove_1-7.py +++ b/tests/deprecated_api/test_remove_1-7.py @@ -13,6 +13,7 @@ # limitations under the License. 
"""Test deprecated functionality which will be removed in v1.7.0.""" from unittest import mock +from unittest.mock import Mock import pytest @@ -22,6 +23,7 @@ from pytorch_lightning.callbacks.progress import ProgressBar from pytorch_lightning.callbacks.xla_stats_monitor import XLAStatsMonitor from pytorch_lightning.loggers import LoggerCollection, TestTubeLogger +from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper from tests.callbacks.test_callbacks import OldStatefulCallback from tests.deprecated_api import _soft_unimport_module from tests.helpers import BoringModel @@ -448,3 +450,12 @@ def test_v1_7_0_deprecate_lr_sch_names(tmpdir): with pytest.deprecated_call(match="`LearningRateMonitor.lr_sch_names` has been deprecated in v1.5"): assert lr_monitor.lr_sch_names == ["lr-SGD"] + + +def test_v1_7_0_index_batch_sampler_wrapper_batch_indices(): + sampler = IndexBatchSamplerWrapper(Mock()) + with pytest.deprecated_call(match="was deprecated in v1.5 and will be removed in v1.7"): + _ = sampler.batch_indices + + with pytest.deprecated_call(match="was deprecated in v1.5 and will be removed in v1.7"): + sampler.batch_indices = [] diff --git a/tests/overrides/test_distributed.py b/tests/overrides/test_distributed.py index c8d982bd733fe..e425859fe34df 100644 --- a/tests/overrides/test_distributed.py +++ b/tests/overrides/test_distributed.py @@ -54,9 +54,7 @@ def test_index_batch_sampler(tmpdir): assert batch_sampler.batch_size == index_batch_sampler.batch_size assert batch_sampler.drop_last == index_batch_sampler.drop_last assert batch_sampler.sampler is sampler - - for batch in index_batch_sampler: - assert index_batch_sampler.batch_indices == batch + assert list(index_batch_sampler) == index_batch_sampler.seen_batch_indices def test_index_batch_sampler_methods(): From e4f965629a22dc6e7e5fbb65992975aaf8df43e9 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Fri, 3 Dec 2021 20:04:46 +0530 Subject: [PATCH 071/123] Fix filtration logic for eval 
results with multiple dataloaders (#10810) Co-authored-by: Carlos Mocholi --- CHANGELOG.md | 2 + pytorch_lightning/core/lightning.py | 3 +- .../logger_connector/logger_connector.py | 21 +++--- .../connectors/logger_connector/result.py | 20 ++++-- pytorch_lightning/trainer/trainer.py | 2 +- tests/plugins/test_ddp_spawn_plugin.py | 2 +- .../logging_/test_eval_loop_logging.py | 69 +++++++++++-------- 7 files changed, 72 insertions(+), 47 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 57b3ed7e97a96..3439774a0368c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Disabled batch_size extraction for torchmetric instances because they accumulate the metrics internally ([#10815](https://github.com/PyTorchLightning/pytorch-lightning/pull/10815)) - Fixed an issue with `SignalConnector` not restoring the default signal handlers on teardown when running on SLURM or with fault-tolerant training enabled ([#10611](https://github.com/PyTorchLightning/pytorch-lightning/pull/10611)) - Fixed `SignalConnector._has_already_handler` check for callable type ([#10483](https://github.com/PyTorchLightning/pytorch-lightning/pull/10483)) +- Fixed an issue to return the results for each dataloader separately instead of duplicating them for each ([#10810](https://github.com/PyTorchLightning/pytorch-lightning/pull/10810)) - Improved exception message if `rich` version is less than `10.2.2` ([#10839](https://github.com/PyTorchLightning/pytorch-lightning/pull/10839)) - Fixed uploading best model checkpoint in NeptuneLogger ([#10369](https://github.com/PyTorchLightning/pytorch-lightning/pull/10369)) - Fixed early schedule reset logic in PyTorch profiler that was causing data leak ([#10837](https://github.com/PyTorchLightning/pytorch-lightning/pull/10837)) @@ -27,6 +28,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed a consolidation error in Lite when attempting to save the state dict of a sharded optimizer ([#10746](https://github.com/PyTorchLightning/pytorch-lightning/pull/10746)) - Fixed the default logging level for batch hooks associated with training from `on_step=False, on_epoch=True` to `on_step=True, on_epoch=False` ([#10756](https://github.com/PyTorchLightning/pytorch-lightning/pull/10756)) + ### Removed - Removed PyTorch 1.6 support ([#10367](https://github.com/PyTorchLightning/pytorch-lightning/pull/10367), [#10738](https://github.com/PyTorchLightning/pytorch-lightning/pull/10738)) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 21d442d0d60ba..4270d910933c5 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -486,7 +486,8 @@ def log( on_epoch=on_epoch, reduce_fx=reduce_fx, enable_graph=enable_graph, - dataloader_idx=(self._current_dataloader_idx if add_dataloader_idx else None), + add_dataloader_idx=add_dataloader_idx, + dataloader_idx=self._current_dataloader_idx, batch_size=batch_size, sync_dist=sync_dist and distributed_available(), sync_dist_fn=self.trainer.training_type_plugin.reduce or sync_ddp, diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index d970d98c602bc..f574eebecd5f7 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -154,21 +154,20 @@ def update_eval_step_metrics(self) -> None: # increment the step even if nothing was logged self._increment_eval_log_step() - @staticmethod - def _filter_metrics_for_dataloader( - dl_idx: int, metrics: _OUT_DICT, metric_prefix: str = "dataloader_idx" - ) -> _OUT_DICT: - return {k: v for k, v in metrics.items() if metric_prefix not in k or k.endswith(f"{metric_prefix}_{dl_idx}")} - - def 
_prepare_eval_loop_results(self, metrics: _OUT_DICT) -> None: + def _prepare_eval_loop_results(self) -> None: if self.trainer.sanity_checking: return + on_step = not self._epoch_end_reached num_dataloaders = self.trainer._evaluation_loop.num_dataloaders has_been_initialized = len(self.eval_loop_results) == num_dataloaders - for dl_idx in range(self.trainer._evaluation_loop.num_dataloaders): - # remove callback metrics that don't belong to this dataloader - callback_metrics = self._filter_metrics_for_dataloader(dl_idx, metrics) + assert self.trainer._evaluation_loop._results is not None + for dl_idx in range(num_dataloaders): + metrics = self.trainer._evaluation_loop._results.metrics( + on_step, dataloader_idx=dl_idx if num_dataloaders > 1 else None + ) + callback_metrics = metrics["callback"] + if has_been_initialized: self.eval_loop_results[dl_idx].update(callback_metrics) else: @@ -182,7 +181,7 @@ def update_eval_epoch_metrics(self) -> List[_OUT_DICT]: # log all the metrics as a single dict self.log_metrics(metrics["log"]) - self._prepare_eval_loop_results(metrics["callback"]) + self._prepare_eval_loop_results() # log results of evaluation if ( diff --git a/pytorch_lightning/trainer/connectors/logger_connector/result.py b/pytorch_lightning/trainer/connectors/logger_connector/result.py index 428830e663e1a..8a3307a373998 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -113,6 +113,7 @@ class _Metadata: on_epoch: bool = True reduce_fx: Callable = torch.mean enable_graph: bool = False + add_dataloader_idx: bool = True dataloader_idx: Optional[int] = None metric_attribute: Optional[str] = None _sync: Optional[_Sync] = None @@ -434,6 +435,7 @@ def log( sync_dist: bool = False, sync_dist_fn: Callable = _Sync.no_op, sync_dist_group: Optional[Any] = None, + add_dataloader_idx: bool = True, dataloader_idx: Optional[int] = None, batch_size: Optional[int] = None, 
metric_attribute: Optional[str] = None, @@ -451,7 +453,7 @@ def log( # storage key key = f"{fx}.{name}" # add dataloader_suffix to both key and fx - if dataloader_idx is not None: + if add_dataloader_idx and dataloader_idx is not None: key += f".{dataloader_idx}" fx += f".{dataloader_idx}" @@ -464,6 +466,7 @@ def log( on_epoch=on_epoch, reduce_fx=reduce_fx, enable_graph=enable_graph, + add_dataloader_idx=add_dataloader_idx, dataloader_idx=dataloader_idx, metric_attribute=metric_attribute, ) @@ -522,24 +525,29 @@ def _get_cache(result_metric: ResultMetric, on_step: bool) -> Optional[torch.Ten return cache.detach() return cache - def valid_items(self) -> Generator: + def valid_items(self, dataloader_idx: Optional[int] = None) -> Generator: """This function is used to iterate over current valid metrics.""" - return ((k, v) for k, v in self.items() if not (isinstance(v, ResultMetric) and v.has_reset)) + return ( + (k, v) + for k, v in self.items() + if not (isinstance(v, ResultMetric) and v.has_reset) and (dataloader_idx in (None, v.meta.dataloader_idx)) + ) def _forked_name(self, result_metric: ResultMetric, on_step: bool) -> Tuple[str, str]: name = result_metric.meta.name forked_name = result_metric.meta.forked_name(on_step) + add_dataloader_idx = result_metric.meta.add_dataloader_idx dl_idx = result_metric.meta.dataloader_idx - if dl_idx is not None: + if add_dataloader_idx and dl_idx is not None: dataloader_suffix = self.DATALOADER_SUFFIX.format(dl_idx) name += dataloader_suffix forked_name += dataloader_suffix return name, forked_name - def metrics(self, on_step: bool) -> _METRICS: + def metrics(self, on_step: bool, dataloader_idx: Optional[int] = None) -> _METRICS: metrics = _METRICS(callback={}, log={}, pbar={}) - for _, result_metric in self.valid_items(): + for _, result_metric in self.valid_items(dataloader_idx): # extract forward_cache or computed from the ResultMetric. 
ignore when the output is None value = apply_to_collection(result_metric, ResultMetric, self._get_cache, on_step, include_none=False) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 2449f6b35bce4..2ab6b0d85ec0d 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1396,7 +1396,7 @@ def __set_ckpt_path(self, ckpt_path: Optional[str], model_provided: bool, model_ " The best model of the previous `fit` call will be used." f" You can pass `{fn}(ckpt_path='best')` to use and best model" " checkpoint and avoid this warning or" - " `ckpt_path=trainer.model_checkpoint.last_model_path` to use the last model." + " `ckpt_path=trainer.checkpoint_callback.last_model_path` to use the last model." ) ckpt_path = "best" diff --git a/tests/plugins/test_ddp_spawn_plugin.py b/tests/plugins/test_ddp_spawn_plugin.py index c5e5f7ccda748..f8a96f5ef496e 100644 --- a/tests/plugins/test_ddp_spawn_plugin.py +++ b/tests/plugins/test_ddp_spawn_plugin.py @@ -128,7 +128,7 @@ def on_predict_start(self) -> None: assert isinstance(self.trainer.model, LightningModule) -@RunIf(skip_windows=True, skip_49370=True) +@RunIf(skip_windows=True, skip_49370=True, skip_hanging_spawn=True) def test_ddp_spawn_configure_ddp(tmpdir): """Tests with ddp spawn plugin.""" trainer = Trainer(default_root_dir=tmpdir, num_processes=2, strategy="ddp_spawn", fast_dev_run=True) diff --git a/tests/trainer/logging_/test_eval_loop_logging.py b/tests/trainer/logging_/test_eval_loop_logging.py index 433730d06111c..c205ed8c6af48 100644 --- a/tests/trainer/logging_/test_eval_loop_logging.py +++ b/tests/trainer/logging_/test_eval_loop_logging.py @@ -23,7 +23,6 @@ from pytorch_lightning import callbacks, Trainer from pytorch_lightning.loggers import TensorBoardLogger -from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers 
import BoringModel, RandomDataset from tests.helpers.runif import RunIf @@ -676,32 +675,6 @@ def val_dataloader(self): trainer.fit(model) -@pytest.mark.parametrize( - ["kwargs", "expected"], - [ - ({"dl_idx": 0, "metrics": {"acc": 123}}, {"acc": 123}), - ( - {"dl_idx": 0, "metrics": {"acc/dataloader_idx_0": 123, "acc/dataloader_idx_1": 321}}, - {"acc/dataloader_idx_0": 123}, - ), - ( - {"dl_idx": 10, "metrics": {"acc/dataloader_idx_1": 123, "acc/dataloader_idx_10": 321}}, - {"acc/dataloader_idx_10": 321}, - ), - ( - {"dl_idx": 3, "metrics": {"top_3_acc/dataloader_idx_0": 123, "top_3_acc/dataloader_idx_3": 321}}, - {"top_3_acc/dataloader_idx_3": 321}, - ), - # theoretical case, as `/dataloader_idx_3` would have been added - ({"dl_idx": 3, "metrics": {"top_3_acc": 123}}, {"top_3_acc": 123}), - ], -) -def test_filter_metrics_for_dataloader(kwargs, expected): - """Logged metrics should only include metrics from the concerned dataloader.""" - actual = LoggerConnector._filter_metrics_for_dataloader(**kwargs) - assert actual == expected - - @RunIf(min_gpus=1) def test_evaluation_move_metrics_to_cpu_and_outputs(tmpdir): class TestModel(BoringModel): @@ -723,3 +696,45 @@ def validation_epoch_end(self, outputs): model = TestModel() trainer = Trainer(default_root_dir=tmpdir, limit_val_batches=2, move_metrics_to_cpu=True, gpus=1) trainer.validate(model, verbose=False) + + +def test_logging_results_with_no_dataloader_idx(tmpdir): + num_dataloaders = 2 + log_common_same_val = {"test_log_common": 789} + log_common_diff_val = "test_log_common_diff_value" + log_key_no_dl_idx = "test_log_no_dl_idx_{}" + log_key_dl0 = {"test_log_a_class": 123} + log_key_dl1 = {"test_log_b_class": 456} + + class CustomBoringModel(BoringModel): + def test_step(self, batch, batch_idx, dataloader_idx): + self.log_dict(log_common_same_val) + self.log(log_common_diff_val, dataloader_idx + 1) + self.log( + log_key_no_dl_idx.format(dataloader_idx), + 321 * (dataloader_idx + 1), + add_dataloader_idx=False, + 
) + self.log_dict(log_key_dl0 if dataloader_idx == 0 else log_key_dl1, add_dataloader_idx=False) + + def test_dataloader(self): + return [torch.utils.data.DataLoader(RandomDataset(32, 64)) for _ in range(num_dataloaders)] + + model = CustomBoringModel() + model.test_epoch_end = None + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=1) + results = trainer.test(model) + + assert len(results) == num_dataloaders + assert results[0] == { + "test_log_common/dataloader_idx_0": 789.0, + "test_log_common_diff_value/dataloader_idx_0": 1.0, + "test_log_no_dl_idx_0": 321, + "test_log_a_class": 123.0, + } + assert results[1] == { + "test_log_common/dataloader_idx_1": 789.0, + "test_log_common_diff_value/dataloader_idx_1": 2.0, + "test_log_no_dl_idx_1": 321 * 2, + "test_log_b_class": 456.0, + } From c604fdffd651e2fc18e013cbd5776a4c5324b614 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 3 Dec 2021 17:37:40 +0100 Subject: [PATCH 072/123] Simplify some ddp-spawn tests #10921 --- tests/plugins/test_sharded_plugin.py | 45 ++++++++-------------------- tests/trainer/test_trainer.py | 12 +++----- 2 files changed, 16 insertions(+), 41 deletions(-) diff --git a/tests/plugins/test_sharded_plugin.py b/tests/plugins/test_sharded_plugin.py index 8a55633fb143e..588ee8096250b 100644 --- a/tests/plugins/test_sharded_plugin.py +++ b/tests/plugins/test_sharded_plugin.py @@ -5,7 +5,6 @@ import torch from pytorch_lightning import LightningModule, Trainer -from pytorch_lightning.callbacks import Callback from pytorch_lightning.plugins import DDPShardedPlugin, DDPSpawnShardedPlugin from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE @@ -31,43 +30,23 @@ def test_ddp_sharded_precision_16_clip_gradients(mock_oss_clip_grad_norm, clip_v @RunIf(fairscale=True) -@pytest.mark.parametrize(["strategy"], [("ddp_sharded",), ("ddp_sharded_spawn",)]) -def test_sharded_ddp_choice(tmpdir, strategy): 
+@pytest.mark.parametrize( + "strategy,expected", [("ddp_sharded", DDPShardedPlugin), ("ddp_sharded_spawn", DDPSpawnShardedPlugin)] +) +def test_sharded_ddp_choice(tmpdir, strategy, expected): """Test to ensure that plugin is correctly chosen.""" - - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - if strategy == "ddp_sharded": - assert isinstance(trainer.accelerator.training_type_plugin, DDPShardedPlugin) - elif strategy == "ddp_sharded_spawn": - assert isinstance(trainer.accelerator.training_type_plugin, DDPSpawnShardedPlugin) - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, strategy=strategy, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) + trainer = Trainer(fast_dev_run=True, strategy=strategy) + assert isinstance(trainer.accelerator.training_type_plugin, expected) @RunIf(min_gpus=1, fairscale=True) -@pytest.mark.parametrize(["strategy"], [("ddp_sharded",), ("ddp_sharded_spawn",)]) -def test_ddp_choice_sharded_amp(tmpdir, strategy): +@pytest.mark.parametrize( + "strategy,expected", [("ddp_sharded", DDPShardedPlugin), ("ddp_sharded_spawn", DDPSpawnShardedPlugin)] +) +def test_ddp_choice_sharded_amp(tmpdir, strategy, expected): """Test to ensure that plugin native amp plugin is correctly chosen when using sharded.""" - - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - if strategy == "ddp_sharded": - assert isinstance(trainer.accelerator.training_type_plugin, DDPShardedPlugin) - elif strategy == "ddp_sharded_spawn": - assert isinstance(trainer.accelerator.training_type_plugin, DDPSpawnShardedPlugin) - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, gpus=1, precision=16, strategy=strategy, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) + trainer = Trainer(fast_dev_run=True, gpus=1, precision=16, strategy=strategy) + assert isinstance(trainer.accelerator.training_type_plugin, expected) 
@RunIf(skip_windows=True, fairscale=True) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index ee71616524944..65006a98b30e8 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1502,14 +1502,10 @@ def write_on_batch_end(self, trainer, pl_module, prediction, batch_indices, *arg def test_spawn_predict_return_predictions(_, __, accelerator): """Test that `return_predictions=True` raise a MisconfigurationException with spawn training type plugins.""" model = BoringModel() - - def run(expected_plugin, **trainer_kwargs): - trainer = Trainer(**trainer_kwargs, fast_dev_run=True) - assert isinstance(trainer.training_type_plugin, expected_plugin) - with pytest.raises(MisconfigurationException, match="`return_predictions` should be set to `False`"): - trainer.predict(model, dataloaders=model.train_dataloader(), return_predictions=True) - - run(DDPSpawnPlugin, accelerator=accelerator, strategy="ddp_spawn", devices=2) + trainer = Trainer(accelerator=accelerator, strategy="ddp_spawn", devices=2, fast_dev_run=True) + assert isinstance(trainer.training_type_plugin, DDPSpawnPlugin) + with pytest.raises(MisconfigurationException, match="`return_predictions` should be set to `False`"): + trainer.predict(model, dataloaders=model.train_dataloader(), return_predictions=True) @pytest.mark.parametrize("return_predictions", [None, False, True]) From d2e791e22386cf1dae0a9ccad690da212538d3c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 3 Dec 2021 21:11:20 +0100 Subject: [PATCH 073/123] Resolve: 'DummyExperiment' object does not support item assignment (#10917) Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- CHANGELOG.md | 2 ++ pytorch_lightning/loggers/base.py | 3 +++ tests/loggers/test_base.py | 7 +++++++ 3 files changed, 12 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md 
index 3439774a0368c..486778b3620c3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed uploading best model checkpoint in NeptuneLogger ([#10369](https://github.com/PyTorchLightning/pytorch-lightning/pull/10369)) - Fixed early schedule reset logic in PyTorch profiler that was causing data leak ([#10837](https://github.com/PyTorchLightning/pytorch-lightning/pull/10837)) - Fixed a bug that caused incorrect batch indices to be passed to the `BasePredictionWriter` hooks when using a dataloader with `num_workers > 0` ([#10870](https://github.com/PyTorchLightning/pytorch-lightning/pull/10870)) +- Fixed an issue with item assignment on the logger on rank > 0 for those who support it ([#10917](https://github.com/PyTorchLightning/pytorch-lightning/pull/10917)) + ## [1.5.4] - 2021-11-30 diff --git a/pytorch_lightning/loggers/base.py b/pytorch_lightning/loggers/base.py index e5ccae435d8c9..0698a409762b4 100644 --- a/pytorch_lightning/loggers/base.py +++ b/pytorch_lightning/loggers/base.py @@ -474,6 +474,9 @@ def __getitem__(self, idx) -> "DummyExperiment": # enables self.logger.experiment[0].add_image(...) return self + def __setitem__(self, *args, **kwargs) -> None: + pass + class DummyLogger(LightningLoggerBase): """Dummy logger for internal use. 
diff --git a/tests/loggers/test_base.py b/tests/loggers/test_base.py index d6b753c0439ee..f4a91f63b50c4 100644 --- a/tests/loggers/test_base.py +++ b/tests/loggers/test_base.py @@ -241,6 +241,13 @@ def test_dummylogger_noop_method_calls(): logger.log_metrics("1", 2, three="three") +def test_dummyexperiment_support_item_assignment(): + """Test that the DummyExperiment supports item assignment.""" + experiment = DummyExperiment() + experiment["variable"] = "value" + assert experiment["variable"] != "value" # this is only a stateless mock experiment + + def test_np_sanitization(): class CustomParamsLogger(CustomLogger): def __init__(self): From f245ea7398299dde713b0dd5e83d2322c9dc0180 Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Mon, 6 Dec 2021 12:01:38 +0530 Subject: [PATCH 074/123] Don't import torch_xla.debug for torch-xla<1.8 (#10836) --- CHANGELOG.md | 2 +- pytorch_lightning/profiler/xla.py | 9 +++++++-- tests/profiler/test_xla_profiler.py | 6 ++++-- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 486778b3620c3..0f3cb340e18ec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,7 +17,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed early schedule reset logic in PyTorch profiler that was causing data leak ([#10837](https://github.com/PyTorchLightning/pytorch-lightning/pull/10837)) - Fixed a bug that caused incorrect batch indices to be passed to the `BasePredictionWriter` hooks when using a dataloader with `num_workers > 0` ([#10870](https://github.com/PyTorchLightning/pytorch-lightning/pull/10870)) - Fixed an issue with item assignment on the logger on rank > 0 for those who support it ([#10917](https://github.com/PyTorchLightning/pytorch-lightning/pull/10917)) - +- Fixed importing `torch_xla.debug` for `torch-xla<1.8` ([#10836](https://github.com/PyTorchLightning/pytorch-lightning/pull/10836)) ## [1.5.4] - 2021-11-30 diff --git a/pytorch_lightning/profiler/xla.py b/pytorch_lightning/profiler/xla.py index e30f06f84e952..c89685bcad0be 100644 --- a/pytorch_lightning/profiler/xla.py +++ b/pytorch_lightning/profiler/xla.py @@ -42,9 +42,10 @@ from typing import Dict from pytorch_lightning.profiler.base import BaseProfiler -from pytorch_lightning.utilities import _TPU_AVAILABLE +from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_8, _TPU_AVAILABLE +from pytorch_lightning.utilities.exceptions import MisconfigurationException -if _TPU_AVAILABLE: +if _TPU_AVAILABLE and _TORCH_GREATER_EQUAL_1_8: import torch_xla.debug.profiler as xp log = logging.getLogger(__name__) @@ -65,6 +66,10 @@ class XLAProfiler(BaseProfiler): def __init__(self, port: int = 9012) -> None: """This Profiler will help you debug and optimize training workload performance for your models using Cloud TPU performance tools.""" + if not _TPU_AVAILABLE: + raise MisconfigurationException("`XLAProfiler` is only supported on TPUs") + if not _TORCH_GREATER_EQUAL_1_8: + raise MisconfigurationException("`XLAProfiler` is only supported with `torch-xla >= 1.8`") super().__init__(dirpath=None, filename=None) self.port = port self._recording_map: Dict = {} diff --git a/tests/profiler/test_xla_profiler.py 
b/tests/profiler/test_xla_profiler.py index 2afbf69a6d0b0..7f460ea11d322 100644 --- a/tests/profiler/test_xla_profiler.py +++ b/tests/profiler/test_xla_profiler.py @@ -18,14 +18,16 @@ from pytorch_lightning import Trainer from pytorch_lightning.profiler import XLAProfiler -from pytorch_lightning.utilities import _TPU_AVAILABLE +from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_8, _TPU_AVAILABLE from tests.helpers import BoringModel from tests.helpers.runif import RunIf if _TPU_AVAILABLE: - import torch_xla.debug.profiler as xp import torch_xla.utils.utils as xu + if _TORCH_GREATER_EQUAL_1_8: + import torch_xla.debug.profiler as xp + @RunIf(tpu=True) def test_xla_profiler_instance(tmpdir): From 2cbf762cfb55d2d7e6b77b9d4bb23979be49549b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 5 Dec 2021 07:55:47 +0100 Subject: [PATCH 075/123] triger ci only with pull request (#10932) --- .circleci/config.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index ecebb1e9d94b5..8758310be9b6b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -5,6 +5,19 @@ orbs: go: circleci/go@1.3.0 codecov: codecov/codecov@1.1.0 +trigger: + tags: + include: + - '*' + branches: + include: + - "master" + - "release/*" + - "refs/tags/*" +pr: + - "master" + - "release/*" + # Workflow Steps: # 1. Checkout # 2. Install GO From 50b89dbcdabb16a2e8ebf3d2d8af44b8735032e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 7 Dec 2021 12:06:21 +0100 Subject: [PATCH 076/123] Fix spawn plugins not deleting temp checkpoint (#10935) --- CHANGELOG.md | 2 ++ pytorch_lightning/plugins/training_type/ddp_spawn.py | 1 + 2 files changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0f3cb340e18ec..a6c08e7875e3f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed an issue with item assignment on the logger on rank > 0 for those who support it ([#10917](https://github.com/PyTorchLightning/pytorch-lightning/pull/10917)) - Fixed importing `torch_xla.debug` for `torch-xla<1.8` ([#10836](https://github.com/PyTorchLightning/pytorch-lightning/pull/10836)) +- Fixed an issue with `DDPSpawnPlugin` and related plugins leaving a temporary checkpoint behind ([#10934](https://github.com/PyTorchLightning/pytorch-lightning/pull/10934)) + ## [1.5.4] - 2021-11-30 diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index ff5159f739cdc..4e787e77e668a 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -333,6 +333,7 @@ def __recover_child_process_weights(self, best_path, last_path): if last_path is not None and self.lightning_module.trainer.state.fn == TrainerFn.FITTING: ckpt = pl_load(last_path, map_location=lambda storage, loc: storage) self.lightning_module.load_state_dict(ckpt) + self.checkpoint_io.remove_checkpoint(last_path) def barrier(self, *args, **kwargs) -> None: if not distributed_available(): From f5931816d8b26447b226d14f63497f53f41a9941 Mon Sep 17 00:00:00 2001 From: four4fish <88516121+four4fish@users.noreply.github.com> Date: Mon, 6 Dec 2021 13:48:31 -0800 Subject: [PATCH 077/123] fix TypeError cause failure in singal_connector teardown (#10961) --- CHANGELOG.md | 3 +++ pytorch_lightning/trainer/connectors/signal_connector.py | 2 +- tests/trainer/connectors/test_signal_connector.py | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a6c08e7875e3f..9be1f32e4cbb9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed an issue with `DDPSpawnPlugin` and related plugins leaving a temporary checkpoint behind ([#10934](https://github.com/PyTorchLightning/pytorch-lightning/pull/10934)) +- Fixed TypeError cause failure in `singal_connector` `teardown` method by adding None check ([#10961](https://github.com/PyTorchLightning/pytorch-lightning/pull/10961)) + + ## [1.5.4] - 2021-11-30 ### Fixed diff --git a/pytorch_lightning/trainer/connectors/signal_connector.py b/pytorch_lightning/trainer/connectors/signal_connector.py index 27b96f1e4cbcf..2020aab7bd48f 100644 --- a/pytorch_lightning/trainer/connectors/signal_connector.py +++ b/pytorch_lightning/trainer/connectors/signal_connector.py @@ -144,7 +144,7 @@ def _is_on_slurm(self) -> bool: @staticmethod def _has_already_handler(signum: Signals) -> bool: - return signal.getsignal(signum) is not signal.SIG_DFL + return signal.getsignal(signum) not in (None, signal.SIG_DFL) @staticmethod def _register_signal(signum: Signals, handlers: HandlersCompose) -> None: diff --git a/tests/trainer/connectors/test_signal_connector.py b/tests/trainer/connectors/test_signal_connector.py index 46ad48cc6276d..c5990be94e2cb 100644 --- a/tests/trainer/connectors/test_signal_connector.py +++ b/tests/trainer/connectors/test_signal_connector.py @@ -102,6 +102,7 @@ def signal_handler(self): @pytest.mark.parametrize( ["handler", "expected_return"], [ + (None, False), (signal.Handlers.SIG_IGN, True), (signal.Handlers.SIG_DFL, False), (signal_handler, True), From 361894f094c44bc7c14cf67ae191b6d3915562b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 7 Dec 2021 14:26:47 +0100 Subject: [PATCH 078/123] 1.5.5 release --- CHANGELOG.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9be1f32e4cbb9..b6634152da0a5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,11 +18,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed a bug that caused incorrect batch indices to be passed to the `BasePredictionWriter` hooks when using a dataloader with `num_workers > 0` ([#10870](https://github.com/PyTorchLightning/pytorch-lightning/pull/10870)) - Fixed an issue with item assignment on the logger on rank > 0 for those who support it ([#10917](https://github.com/PyTorchLightning/pytorch-lightning/pull/10917)) - Fixed importing `torch_xla.debug` for `torch-xla<1.8` ([#10836](https://github.com/PyTorchLightning/pytorch-lightning/pull/10836)) - - Fixed an issue with `DDPSpawnPlugin` and related plugins leaving a temporary checkpoint behind ([#10934](https://github.com/PyTorchLightning/pytorch-lightning/pull/10934)) - - -- Fixed TypeError cause failure in `singal_connector` `teardown` method by adding None check ([#10961](https://github.com/PyTorchLightning/pytorch-lightning/pull/10961)) +- Fixed a `TypeError` occurring in the `SignalConnector.teardown()` method ([#10961](https://github.com/PyTorchLightning/pytorch-lightning/pull/10961)) ## [1.5.4] - 2021-11-30 From 6963a2f6a7831f0aa5008a69758b98261b44086b Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Tue, 7 Dec 2021 21:23:59 +0530 Subject: [PATCH 079/123] update chlog --- CHANGELOG.md | 10 ++++++++++ pytorch_lightning/__about__.py | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b6634152da0a5..6d2957158f825 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,16 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
+## [1.5.6] - 2021-12-14 + +### Fixed + +- + + +- + + ## [1.5.5] - 2021-12-07 ### Fixed diff --git a/pytorch_lightning/__about__.py b/pytorch_lightning/__about__.py index 15d57a2617fed..d107d5d77c424 100644 --- a/pytorch_lightning/__about__.py +++ b/pytorch_lightning/__about__.py @@ -1,7 +1,7 @@ import time _this_year = time.strftime("%Y") -__version__ = "1.5.5" +__version__ = "1.5.6" __author__ = "William Falcon et al." __author_email__ = "waf2107@columbia.edu" __license__ = "Apache-2.0" From 1f6392344a8ae3758419131619485772362f25fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 14 Dec 2021 20:41:32 +0100 Subject: [PATCH 080/123] [DeepSpeed] fix flag forwarding in DeepSpeedPlugin (#10899) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Sean Naren Co-authored-by: ananthsub Co-authored-by: Adrian Wälchli --- CHANGELOG.md | 2 +- .../plugins/training_type/deepspeed.py | 4 +-- tests/plugins/test_deepspeed_plugin.py | 30 +++++++++++++++++++ 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6d2957158f825..39eb1b56933c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Fixed -- +- Fixed a bug where the DeepSpeedPlugin arguments `cpu_checkpointing` and `contiguous_memory_optimization` were not being forwarded to deepspeed correctly ([#10874](https://github.com/PyTorchLightning/pytorch-lightning/issues/10874)) - diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 13f518d53ec00..4b08c8dc8b039 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -538,8 +538,8 @@ def _set_deepspeed_activation_checkpointing(self): deepspeed.checkpointing.configure( mpu_=None, partition_activations=checkpoint_config.get("partition_activations"), - contiguous_checkpointing=checkpoint_config.get("contiguous_checkpointing"), - checkpoint_in_cpu=checkpoint_config.get("checkpoint_in_cpu"), + contiguous_checkpointing=checkpoint_config.get("contiguous_memory_optimization"), + checkpoint_in_cpu=checkpoint_config.get("cpu_checkpointing"), profile=checkpoint_config.get("profile"), ) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index cfb12369da1c4..bb65c61e057fd 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -361,6 +361,36 @@ def test_deepspeed_custom_activation_checkpointing_params(tmpdir): assert checkpoint_config["synchronize_checkpoint_boundary"] +@RunIf(min_gpus=1, deepspeed=True, standalone=True) +def test_deepspeed_custom_activation_checkpointing_params_forwarded(tmpdir): + """Ensure if we modify the activation checkpointing parameters, we pass these to + deepspeed.checkpointing.configure correctly.""" + ds = DeepSpeedPlugin( + partition_activations=True, + cpu_checkpointing=True, + contiguous_memory_optimization=True, + synchronize_checkpoint_boundary=True, + ) + + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + enable_progress_bar=False, + fast_dev_run=1, + strategy=ds, + 
precision=16, + gpus=1, + ) + with mock.patch( + "deepspeed.checkpointing.configure", wraps=deepspeed.checkpointing.configure + ) as deepspeed_checkpointing_configure: + trainer.fit(model) + + deepspeed_checkpointing_configure.assert_called_with( + mpu_=None, partition_activations=True, contiguous_checkpointing=True, checkpoint_in_cpu=True, profile=None + ) + + @RunIf(min_gpus=1, deepspeed=True) def test_deepspeed_assert_config_zero_offload_disabled(tmpdir, deepspeed_zero_config): """Ensure if we use a config and turn off offload_optimizer, that this is set to False within the config.""" From 93cda2491fb4a35f20c497265eea9abf34fcd428 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 14 Dec 2021 20:47:25 +0100 Subject: [PATCH 081/123] Removed duplicated file extension when uploading model checkpoints with NeptuneLogger (#11015) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Rafał Jankowski --- CHANGELOG.md | 4 +--- pytorch_lightning/loggers/neptune.py | 5 ++++- tests/loggers/test_neptune.py | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 39eb1b56933c1..9212a8fb86c1c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,9 +9,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Fixed - Fixed a bug where the DeepSpeedPlugin arguments `cpu_checkpointing` and `contiguous_memory_optimization` were not being forwarded to deepspeed correctly ([#10874](https://github.com/PyTorchLightning/pytorch-lightning/issues/10874)) - - -- +- Fixed an issue with `NeptuneLogger` causing checkpoints to be uploaded with a duplicated file extension ([#11015](https://github.com/PyTorchLightning/pytorch-lightning/issues/11015)) ## [1.5.5] - 2021-12-07 diff --git a/pytorch_lightning/loggers/neptune.py b/pytorch_lightning/loggers/neptune.py index 21fbd4a79ea0e..97876afe65368 100644 --- a/pytorch_lightning/loggers/neptune.py +++ b/pytorch_lightning/loggers/neptune.py @@ -553,7 +553,10 @@ def _get_full_model_name(model_path: str, checkpoint_callback: "ReferenceType[Mo expected_model_path = f"{checkpoint_callback.dirpath}{os.path.sep}" if not model_path.startswith(expected_model_path): raise ValueError(f"{model_path} was expected to start with {expected_model_path}.") - return model_path[len(expected_model_path) :] + # Remove extension from filepath + filepath, _ = os.path.splitext(model_path[len(expected_model_path) :]) + + return filepath @classmethod def _get_full_model_names_from_exp_structure(cls, exp_structure: dict, namespace: str) -> Set[str]: diff --git a/tests/loggers/test_neptune.py b/tests/loggers/test_neptune.py index 6238b408c1e6e..cb7ef9c5156cc 100644 --- a/tests/loggers/test_neptune.py +++ b/tests/loggers/test_neptune.py @@ -397,9 +397,9 @@ def test__get_full_model_name(self): # given: SimpleCheckpoint = namedtuple("SimpleCheckpoint", ["dirpath"]) test_input_data = [ - ("key.ext", os.path.join("foo", "bar", "key.ext"), SimpleCheckpoint(dirpath=os.path.join("foo", "bar"))), + ("key", os.path.join("foo", "bar", "key.ext"), SimpleCheckpoint(dirpath=os.path.join("foo", "bar"))), ( - "key/in/parts.ext", + "key/in/parts", os.path.join("foo", "bar", "key/in/parts.ext"), SimpleCheckpoint(dirpath=os.path.join("foo", "bar")), ), From 
ccfd1d8fc50dce24368b92906362328158866198 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Wed, 15 Dec 2021 00:11:29 +0530 Subject: [PATCH 082/123] Fix support for logging within callbacks returned from `LightningModule` (#10991) Co-authored-by: thomas chaton Co-authored-by: Carlos Mocholi --- CHANGELOG.md | 12 ++++++++++++ pytorch_lightning/trainer/configuration_validator.py | 6 +++++- .../trainer/connectors/callback_connector.py | 7 ++++--- pytorch_lightning/trainer/trainer.py | 11 +++++------ tests/models/test_hooks.py | 8 ++++---- 5 files changed, 30 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9212a8fb86c1c..a0649eab800bb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,19 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed - Fixed a bug where the DeepSpeedPlugin arguments `cpu_checkpointing` and `contiguous_memory_optimization` were not being forwarded to deepspeed correctly ([#10874](https://github.com/PyTorchLightning/pytorch-lightning/issues/10874)) + + - Fixed an issue with `NeptuneLogger` causing checkpoints to be uploaded with a duplicated file extension ([#11015](https://github.com/PyTorchLightning/pytorch-lightning/issues/11015)) +======= + + +- Fixed support for logging within callbacks returned from `LightningModule` ([#10991](https://github.com/PyTorchLightning/pytorch-lightning/pull/10991)) + + +- + + +- ## [1.5.5] - 2021-12-07 diff --git a/pytorch_lightning/trainer/configuration_validator.py b/pytorch_lightning/trainer/configuration_validator.py index c44529c539b92..ed247ac94feac 100644 --- a/pytorch_lightning/trainer/configuration_validator.py +++ b/pytorch_lightning/trainer/configuration_validator.py @@ -19,7 +19,7 @@ from pytorch_lightning.utilities.warnings import rank_zero_deprecation, rank_zero_warn -def verify_loop_configurations(trainer: "pl.Trainer", model: "pl.LightningModule") -> None: +def verify_loop_configurations(trainer: "pl.Trainer") -> None: r""" 
Checks that the model is configured correctly before the run is started. @@ -28,6 +28,10 @@ def verify_loop_configurations(trainer: "pl.Trainer", model: "pl.LightningModule model: The model to check the configuration. """ + model = trainer.lightning_module + + if trainer.state.fn is None: + raise ValueError("Unexpected: Trainer state fn must be set before validating loop configuration.") if trainer.state.fn in (TrainerFn.FITTING, TrainerFn.TUNING): __verify_train_val_loop_configuration(trainer, model) __verify_manual_optimization_support(trainer, model) diff --git a/pytorch_lightning/trainer/connectors/callback_connector.py b/pytorch_lightning/trainer/connectors/callback_connector.py index 4d41734ed90e6..c683a294d0440 100644 --- a/pytorch_lightning/trainer/connectors/callback_connector.py +++ b/pytorch_lightning/trainer/connectors/callback_connector.py @@ -249,10 +249,11 @@ def _configure_timer_callback(self, max_time: Optional[Union[str, timedelta, Dic def _trainer_has_checkpoint_callbacks(self): return len(self.trainer.checkpoint_callbacks) > 0 - def attach_model_logging_functions(self, model): + def _attach_model_logging_functions(self): + lightning_module = self.trainer.lightning_module for callback in self.trainer.callbacks: - callback.log = model.log - callback.log_dict = model.log_dict + callback.log = lightning_module.log + callback.log_dict = lightning_module.log_dict def _attach_model_callbacks(self) -> None: """Attaches the callbacks defined in the model. 
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 2ab6b0d85ec0d..18be005715aca 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1114,17 +1114,16 @@ def _run( if hasattr(model, "hparams"): parsing.clean_namespace(model.hparams) - verify_loop_configurations(self, model) - - # attach model log function to callback - self._callback_connector.attach_model_logging_functions(model) - # attach model to the training type plugin self.training_type_plugin.connect(model) + self._callback_connector._attach_model_callbacks() + self._callback_connector._attach_model_logging_functions() + + verify_loop_configurations(self) + # hook self._data_connector.prepare_data() - self._callback_connector._attach_model_callbacks() # ---------------------------- # SET UP TRAINING diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index b43dfe386b1ec..51c041fadd85e 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -499,8 +499,8 @@ def training_step(self, batch, batch_idx): expected = [ dict(name="Callback.on_init_start", args=(trainer,)), dict(name="Callback.on_init_end", args=(trainer,)), - dict(name="prepare_data"), dict(name="configure_callbacks"), + dict(name="prepare_data"), dict(name="Callback.on_before_accelerator_backend_setup", args=(trainer, model)), # DeepSpeed needs the batch size to figure out throughput logging *([dict(name="train_dataloader")] if kwargs.get("strategy") == "deepspeed" else []), @@ -618,8 +618,8 @@ def test_trainer_model_hook_system_fit_no_val_and_resume(tmpdir): expected = [ dict(name="Callback.on_init_start", args=(trainer,)), dict(name="Callback.on_init_end", args=(trainer,)), - dict(name="prepare_data"), dict(name="configure_callbacks"), + dict(name="prepare_data"), dict(name="Callback.on_before_accelerator_backend_setup", args=(trainer, model)), dict(name="Callback.setup", args=(trainer, model), kwargs=dict(stage="fit")), 
dict(name="setup", kwargs=dict(stage="fit")), @@ -716,8 +716,8 @@ def test_trainer_model_hook_system_eval(tmpdir, batches, verb, noun, dataloader, expected = [ dict(name="Callback.on_init_start", args=(trainer,)), dict(name="Callback.on_init_end", args=(trainer,)), - dict(name="prepare_data"), dict(name="configure_callbacks"), + dict(name="prepare_data"), dict(name="Callback.on_before_accelerator_backend_setup", args=(trainer, model)), dict(name="Callback.setup", args=(trainer, model), kwargs=dict(stage=verb)), dict(name="setup", kwargs=dict(stage=verb)), @@ -748,8 +748,8 @@ def test_trainer_model_hook_system_predict(tmpdir): expected = [ dict(name="Callback.on_init_start", args=(trainer,)), dict(name="Callback.on_init_end", args=(trainer,)), - dict(name="prepare_data"), dict(name="configure_callbacks"), + dict(name="prepare_data"), dict(name="Callback.on_before_accelerator_backend_setup", args=(trainer, model)), dict(name="Callback.setup", args=(trainer, model), kwargs=dict(stage="predict")), dict(name="setup", kwargs=dict(stage="predict")), From 82d7d5018fd7c933a9909088a46f68db86db02df Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Tue, 14 Dec 2021 23:35:42 +0530 Subject: [PATCH 083/123] Fix the num_batches value in warning (#10980) --- pytorch_lightning/trainer/data_loading.py | 6 +++--- tests/trainer/test_data_loading.py | 18 +++++++++++++++++- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index b82f9dc319165..c52705fc07c59 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -177,7 +177,7 @@ def _resolve_sampler(self, dataloader: DataLoader, shuffle: bool, mode: Optional if self._requires_distributed_sampler(dataloader): if not isinstance(dataloader.sampler, (SequentialSampler, RandomSampler)): raise MisconfigurationException( - "You seem to have configured a sampler in your DataLoader. 
This will be replaced " + "You seem to have configured a sampler in your DataLoader. This will be replaced" " by `DistributedSampler` since `replace_sampler_ddp` is True and you are using" " distributed training. Either remove the sampler from your DataLoader or set" " `replace_sampler_ddp=False` if you want to use your custom sampler." @@ -478,7 +478,7 @@ def _reset_eval_dataloader( module = model or self.lightning_module or self.datamodule if len(dataloaders) != 0: for i, dataloader in enumerate(dataloaders): - num_batches = ( + orig_num_batches = num_batches = ( len(dataloader) if has_len_all_ranks(dataloader, self.training_type_plugin, module) else float("inf") @@ -504,7 +504,7 @@ def _reset_eval_dataloader( min_pct = 1.0 / len(dataloader) raise MisconfigurationException( f"you requested to check {limit_eval_batches} of the `{mode.dataloader_prefix}_dataloader` but" - f" {limit_eval_batches}*{num_batches} < 1. Please increase the" + f" {limit_eval_batches} * {orig_num_batches} < 1. Please increase the" f" `limit_{mode.dataloader_prefix}_batches` flag. 
Try at least" f" `limit_{mode.dataloader_prefix}_batches={min_pct}`" ) diff --git a/tests/trainer/test_data_loading.py b/tests/trainer/test_data_loading.py index c04c6f0f6ea41..d7bf9c987127d 100644 --- a/tests/trainer/test_data_loading.py +++ b/tests/trainer/test_data_loading.py @@ -20,6 +20,7 @@ from torch.utils.data.sampler import BatchSampler, Sampler, SequentialSampler from pytorch_lightning import Trainer +from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities.enums import DistributedType from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers import BoringModel, RandomDataset @@ -279,7 +280,7 @@ class CustomSampler(Sampler): # Should raise an error if existing sampler is being replaced dataloader = CustomDataLoader(dataset, sampler=CustomSampler(dataset)) - with pytest.raises(MisconfigurationException, match="will be replaced by `DistributedSampler`"): + with pytest.raises(MisconfigurationException, match="will be replaced by `DistributedSampler`"): trainer.prepare_dataloader(dataloader, shuffle=True) @@ -348,3 +349,18 @@ def test_pre_made_batches(): loader = DataLoader(RandomDataset(32, 10), batch_size=None) trainer = Trainer(fast_dev_run=1) trainer.predict(LoaderTestModel(), loader) + + +def test_error_raised_with_float_limited_eval_batches(): + """Test that an error is raised if there are not enough batches when passed with float value of + limit_eval_batches.""" + model = BoringModel() + dl_size = len(model.val_dataloader()) + limit_val_batches = 1 / (dl_size + 2) + trainer = Trainer(limit_val_batches=limit_val_batches) + trainer._data_connector.attach_data(model) + with pytest.raises( + MisconfigurationException, + match=fr"{limit_val_batches} \* {dl_size} < 1. 
Please increase the `limit_val_batches`", + ): + trainer._reset_eval_dataloader(RunningStage.VALIDATING, model) From c014dd155d814e3c5ac32911be2cf78640bb4961 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Tue, 14 Dec 2021 20:43:23 +0530 Subject: [PATCH 084/123] Fix support for `CombinedLoader` while checking for warning raised with eval dataloaders (#10994) --- CHANGELOG.md | 3 +++ pytorch_lightning/trainer/data_loading.py | 21 ++++++++++++++++--- pytorch_lightning/trainer/supporters.py | 8 ++++---- tests/trainer/test_data_loading.py | 25 +++++++++++++++++++++++ 4 files changed, 50 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a0649eab800bb..47d23d2de0ef3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed support for logging within callbacks returned from `LightningModule` ([#10991](https://github.com/PyTorchLightning/pytorch-lightning/pull/10991)) +- Fixed support for `CombinedLoader` while checking for warning raised with eval dataloaders ([#10994](https://github.com/PyTorchLightning/pytorch-lightning/pull/10994)) + + - diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index c52705fc07c59..48c9666c24726 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -455,9 +455,11 @@ def _reset_eval_dataloader( loader, SequentialSampler(loader.dataset), mode=mode ) else: - rank_zero_warn( - f"Your `{mode.dataloader_prefix}_dataloader` has `shuffle=True`," - "it is strongly recommended that you turn this off for val/test/predict dataloaders." 
+ apply_to_collection( + loader.loaders if isinstance(loader, CombinedLoader) else loader, + DataLoader, + self._check_eval_shuffling, + mode=mode, ) if any(dl is None for dl in dataloaders): @@ -620,3 +622,16 @@ def replace_sampler(dataloader: DataLoader) -> DataLoader: dataloader = apply_to_collection(dataloader, DataLoader, replace_sampler) return dataloader + + @staticmethod + def _check_eval_shuffling(dataloader, mode): + if ( + hasattr(dataloader, "sampler") + and not isinstance(dataloader.sampler, SequentialSampler) + and not isinstance(dataloader.dataset, IterableDataset) + ): + rank_zero_warn( + f"Your `{mode.dataloader_prefix}_dataloader` has `shuffle=True`," + " it is strongly recommended that you turn this off for val/test/predict dataloaders.", + category=UserWarning, + ) diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index 6e2e51e82bbf1..2c91fb0d245d1 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -304,10 +304,10 @@ def __len__(self) -> int: class CombinedLoader: - """Combines different dataloaders and allows sampling in parallel. Supported modes are 'min_size', which raises - StopIteration after the shortest loader (the one with the lowest number of batches) is done, and - 'max_size_cycle` which raises StopIteration after the longest loader (the one with most batches) is done, while - cycling through the shorter loaders. + """Combines different dataloaders and allows sampling in parallel. Supported modes are ``"min_size"``, which + raises StopIteration after the shortest loader (the one with the lowest number of batches) is done, and + ``"max_size_cycle"`` which raises StopIteration after the longest loader (the one with most batches) is done, + while cycling through the shorter loaders. 
Examples: >>> loaders = {'a': torch.utils.data.DataLoader(range(6), batch_size=4), diff --git a/tests/trainer/test_data_loading.py b/tests/trainer/test_data_loading.py index d7bf9c987127d..9b1e5ca45e655 100644 --- a/tests/trainer/test_data_loading.py +++ b/tests/trainer/test_data_loading.py @@ -21,6 +21,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.trainer.states import RunningStage +from pytorch_lightning.trainer.supporters import CombinedLoader from pytorch_lightning.utilities.enums import DistributedType from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers import BoringModel, RandomDataset @@ -364,3 +365,27 @@ def test_error_raised_with_float_limited_eval_batches(): match=fr"{limit_val_batches} \* {dl_size} < 1. Please increase the `limit_val_batches`", ): trainer._reset_eval_dataloader(RunningStage.VALIDATING, model) + + +@pytest.mark.parametrize( + "val_dl", + [ + DataLoader(dataset=RandomDataset(32, 64), shuffle=True), + CombinedLoader(DataLoader(dataset=RandomDataset(32, 64), shuffle=True)), + CombinedLoader( + [DataLoader(dataset=RandomDataset(32, 64)), DataLoader(dataset=RandomDataset(32, 64), shuffle=True)] + ), + CombinedLoader( + { + "dl1": DataLoader(dataset=RandomDataset(32, 64)), + "dl2": DataLoader(dataset=RandomDataset(32, 64), shuffle=True), + } + ), + ], +) +def test_non_sequential_sampler_warning_is_raised_for_eval_dataloader(val_dl): + trainer = Trainer() + model = BoringModel() + trainer._data_connector.attach_data(model, val_dataloaders=val_dl) + with pytest.warns(UserWarning, match="recommended .* turn this off for val/test/predict"): + trainer._reset_eval_dataloader(RunningStage.VALIDATING, model) From af11c11d087e935f531b0e8ae9fe8048cd90bc9e Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Wed, 15 Dec 2021 14:53:35 +0530 Subject: [PATCH 085/123] Fix sanity check for RichProgressBar (#10913) --- CHANGELOG.md | 16 ++------------ 
.../callbacks/progress/rich_progress.py | 4 +++- tests/callbacks/test_rich_progress_bar.py | 21 +++++++++++++++++++ 3 files changed, 26 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 47d23d2de0ef3..03ef3eb32f636 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,29 +4,17 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). -## [1.5.6] - 2021-12-14 +## [1.5.6] - 2021-12-15 ### Fixed - Fixed a bug where the DeepSpeedPlugin arguments `cpu_checkpointing` and `contiguous_memory_optimization` were not being forwarded to deepspeed correctly ([#10874](https://github.com/PyTorchLightning/pytorch-lightning/issues/10874)) - - - Fixed an issue with `NeptuneLogger` causing checkpoints to be uploaded with a duplicated file extension ([#11015](https://github.com/PyTorchLightning/pytorch-lightning/issues/11015)) -======= - - - Fixed support for logging within callbacks returned from `LightningModule` ([#10991](https://github.com/PyTorchLightning/pytorch-lightning/pull/10991)) - - +- Fixed running sanity check with `RichProgressBar` ([#10913](https://github.com/PyTorchLightning/pytorch-lightning/pull/10913)) - Fixed support for `CombinedLoader` while checking for warning raised with eval dataloaders ([#10994](https://github.com/PyTorchLightning/pytorch-lightning/pull/10994)) -- - - -- - - ## [1.5.5] - 2021-12-07 ### Fixed diff --git a/pytorch_lightning/callbacks/progress/rich_progress.py b/pytorch_lightning/callbacks/progress/rich_progress.py index 9cb2a91dde3af..b9e975e018cfa 100644 --- a/pytorch_lightning/callbacks/progress/rich_progress.py +++ b/pytorch_lightning/callbacks/progress/rich_progress.py @@ -328,7 +328,9 @@ def on_sanity_check_start(self, trainer, pl_module): def on_sanity_check_end(self, trainer, pl_module): super().on_sanity_check_end(trainer, pl_module) - self._update(self.val_sanity_progress_bar_id, visible=False) + if self.progress is 
not None: + self.progress.update(self.val_sanity_progress_bar_id, advance=0, visible=False) + self.progress.refresh() def on_train_epoch_start(self, trainer, pl_module): super().on_train_epoch_start(trainer, pl_module) diff --git a/tests/callbacks/test_rich_progress_bar.py b/tests/callbacks/test_rich_progress_bar.py index f4dc5421539e8..ea38287d1789f 100644 --- a/tests/callbacks/test_rich_progress_bar.py +++ b/tests/callbacks/test_rich_progress_bar.py @@ -180,3 +180,24 @@ def test_rich_progress_bar_leave(tmpdir, leave, reset_call_count): ) trainer.fit(model) assert mock_progress_reset.call_count == reset_call_count + + +@RunIf(rich=True) +@pytest.mark.parametrize("limit_val_batches", (1, 5)) +def test_rich_progress_bar_num_sanity_val_steps(tmpdir, limit_val_batches: int): + model = BoringModel() + + progress_bar = RichProgressBar() + num_sanity_val_steps = 3 + + trainer = Trainer( + default_root_dir=tmpdir, + num_sanity_val_steps=num_sanity_val_steps, + limit_train_batches=1, + limit_val_batches=limit_val_batches, + max_epochs=1, + callbacks=progress_bar, + ) + + trainer.fit(model) + assert progress_bar.progress.tasks[0].completed == min(num_sanity_val_steps, limit_val_batches) From 0bb9ce0b688921689f82e646e3da2985b341a495 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 15 Dec 2021 18:55:09 +0100 Subject: [PATCH 086/123] Update the TQDM progress bar `on_train_epoch_end` (#11069) Co-authored-by: Carlos Mocholi --- CHANGELOG.md | 2 + .../callbacks/progress/tqdm_progress.py | 47 ++++++++------ pytorch_lightning/core/lightning.py | 2 +- tests/callbacks/test_tqdm_progress_bar.py | 63 +++++++++++++++++++ 4 files changed, 94 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 03ef3eb32f636..5c18801ef3595 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed support for logging within callbacks returned from `LightningModule` ([#10991](https://github.com/PyTorchLightning/pytorch-lightning/pull/10991)) - Fixed running sanity check with `RichProgressBar` ([#10913](https://github.com/PyTorchLightning/pytorch-lightning/pull/10913)) - Fixed support for `CombinedLoader` while checking for warning raised with eval dataloaders ([#10994](https://github.com/PyTorchLightning/pytorch-lightning/pull/10994)) +- The TQDM progress bar now correctly shows the `on_epoch` logged values on train epoch end ([#11069](https://github.com/PyTorchLightning/pytorch-lightning/pull/11069)) +- Fixed bug where the TQDM updated the training progress bar during `trainer.validate` ([#11069](https://github.com/PyTorchLightning/pytorch-lightning/pull/11069)) ## [1.5.5] - 2021-12-07 diff --git a/pytorch_lightning/callbacks/progress/tqdm_progress.py b/pytorch_lightning/callbacks/progress/tqdm_progress.py index 672d9d893ad61..11103e4b0595d 100644 --- a/pytorch_lightning/callbacks/progress/tqdm_progress.py +++ b/pytorch_lightning/callbacks/progress/tqdm_progress.py @@ -25,6 +25,7 @@ else: from tqdm import tqdm as _tqdm +import pytorch_lightning as pl from pytorch_lightning.callbacks.progress.base import ProgressBarBase _PAD_SIZE = 5 @@ -206,12 +207,10 @@ def init_test_tqdm(self) -> Tqdm: return bar def on_sanity_check_start(self, trainer, pl_module): - super().on_sanity_check_start(trainer, pl_module) self.val_progress_bar = self.init_sanity_tqdm() self.main_progress_bar = Tqdm(disable=True) # dummy progress bar def on_sanity_check_end(self, trainer, pl_module): - super().on_sanity_check_end(trainer, pl_module) self.main_progress_bar.close() self.val_progress_bar.close() @@ -233,37 +232,44 @@ def on_train_epoch_start(self, trainer, pl_module): def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx): super().on_train_batch_end(trainer, pl_module, outputs, batch, batch_idx) - total_batches = self.total_train_batches + 
self.total_val_batches - total_batches = convert_inf(total_batches) - if self._should_update(self.train_batch_idx, total_batches): + if self._should_update(self.train_batch_idx): self._update_bar(self.main_progress_bar) self.main_progress_bar.set_postfix(self.get_metrics(trainer, pl_module)) + def on_train_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: + if self.is_enabled: + self._update_bar(self.main_progress_bar) + self.main_progress_bar.set_postfix(self.get_metrics(trainer, pl_module)) + + def on_train_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: + self.main_progress_bar.close() + def on_validation_start(self, trainer, pl_module): super().on_validation_start(trainer, pl_module) if trainer.sanity_checking: reset(self.val_progress_bar, total=sum(trainer.num_sanity_val_batches), current=self.val_batch_idx) else: - self._update_bar(self.main_progress_bar) # fill up remaining + if trainer.state.fn == pl.trainer.states.TrainerFn.FITTING: + self._update_bar(self.main_progress_bar) # fill up remaining self.val_progress_bar = self.init_validation_tqdm() reset(self.val_progress_bar, total=self.total_val_batches, current=self.val_batch_idx) def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx): super().on_validation_batch_end(trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) - if self._should_update(self.val_batch_idx, convert_inf(self.total_val_batches)): + if self._should_update(self.val_batch_idx): + self._update_bar(self.val_progress_bar) + if trainer.state.fn == pl.trainer.states.TrainerFn.FITTING: + self._update_bar(self.main_progress_bar) + + def on_validation_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: + if self.is_enabled: self._update_bar(self.val_progress_bar) - self._update_bar(self.main_progress_bar) def on_validation_end(self, trainer, pl_module): - super().on_validation_end(trainer, pl_module) - if 
self.main_progress_bar is not None: + if self.main_progress_bar is not None and trainer.state.fn == pl.trainer.states.TrainerFn.FITTING: self.main_progress_bar.set_postfix(self.get_metrics(trainer, pl_module)) self.val_progress_bar.close() - def on_train_end(self, trainer, pl_module): - super().on_train_end(trainer, pl_module) - self.main_progress_bar.close() - def on_test_start(self, trainer, pl_module): super().on_test_start(trainer, pl_module) self.test_progress_bar = self.init_test_tqdm() @@ -271,11 +277,14 @@ def on_test_start(self, trainer, pl_module): def on_test_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx): super().on_test_batch_end(trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) - if self._should_update(self.test_batch_idx, self.total_test_batches): + if self._should_update(self.test_batch_idx): + self._update_bar(self.test_progress_bar) + + def on_test_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: + if self.is_enabled: self._update_bar(self.test_progress_bar) def on_test_end(self, trainer, pl_module): - super().on_test_end(trainer, pl_module) self.test_progress_bar.close() def on_predict_epoch_start(self, trainer, pl_module): @@ -285,7 +294,7 @@ def on_predict_epoch_start(self, trainer, pl_module): def on_predict_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx): super().on_predict_batch_end(trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) - if self._should_update(self.predict_batch_idx, self.total_predict_batches): + if self._should_update(self.predict_batch_idx): self._update_bar(self.predict_progress_bar) def on_predict_end(self, trainer, pl_module): @@ -309,8 +318,8 @@ def print( s = sep.join(map(str, args)) active_progress_bar.write(s, end=end, file=file, nolock=nolock) - def _should_update(self, current, total) -> bool: - return self.is_enabled and (current % self.refresh_rate == 0 or current == total) + def 
_should_update(self, idx: int) -> bool: + return self.is_enabled and (idx % self.refresh_rate == 0) def _update_bar(self, bar: Optional[Tqdm]) -> None: """Updates the bar by the refresh rate without overshooting.""" diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 4270d910933c5..8e77baf6ec539 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1792,7 +1792,7 @@ def get_progress_bar_dict(self) -> Dict[str, Union[int, str]]: r""" .. deprecated:: v1.5 This method was deprecated in v1.5 in favor of - `pytorch_lightning.callbacks.progress.base.get_standard_metrics` and will be removed in v1.7. + `pytorch_lightning.callbacks.progress.base.get_metrics` and will be removed in v1.7. Implement this to override the default items displayed in the progress bar. By default it includes the average loss value, split index of BPTT (if used) diff --git a/tests/callbacks/test_tqdm_progress_bar.py b/tests/callbacks/test_tqdm_progress_bar.py index 9b80fabf800a7..8f4c1b4e540b7 100644 --- a/tests/callbacks/test_tqdm_progress_bar.py +++ b/tests/callbacks/test_tqdm_progress_bar.py @@ -14,6 +14,7 @@ import os import pickle import sys +from collections import defaultdict from typing import Optional, Union from unittest import mock from unittest.mock import ANY, call, Mock @@ -607,3 +608,65 @@ def test_tqdm_progress_bar_main_bar_resume(): # restarting mid validation epoch is not currently supported assert bar.val_progress_bar.n == 0 assert bar.val_progress_bar.total == 3 + + +def test_tqdm_progress_bar_correct_value_epoch_end(tmpdir): + class MockedProgressBar(TQDMProgressBar): + calls = defaultdict(list) + + def get_metrics(self, trainer, pl_module): + items = super().get_metrics(trainer, model) + del items["v_num"] + del items["loss"] + # this is equivalent to mocking `set_postfix` as this method gets called every time + self.calls[trainer.state.fn].append( + (trainer.state.stage, trainer.current_epoch, 
trainer.global_step, items) + ) + return items + + class MyModel(BoringModel): + def training_step(self, batch, batch_idx): + self.log("a", self.global_step, prog_bar=True, on_step=False, on_epoch=True, reduce_fx=max) + return super().training_step(batch, batch_idx) + + def validation_step(self, batch, batch_idx): + self.log("b", self.global_step, prog_bar=True, on_step=False, on_epoch=True, reduce_fx=max) + return super().validation_step(batch, batch_idx) + + def test_step(self, batch, batch_idx): + self.log("c", self.global_step, prog_bar=True, on_step=False, on_epoch=True, reduce_fx=max) + return super().test_step(batch, batch_idx) + + model = MyModel() + pbar = MockedProgressBar() + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=2, + limit_val_batches=2, + limit_test_batches=2, + max_epochs=2, + enable_model_summary=False, + enable_checkpointing=False, + log_every_n_steps=1, + callbacks=pbar, + ) + + trainer.fit(model) + assert pbar.calls["fit"] == [ + ("sanity_check", 0, 0, {"b": 0}), + ("train", 0, 0, {}), + ("train", 0, 1, {}), + ("validate", 0, 1, {"b": 1}), # validation end + # epoch end over, `on_epoch=True` metrics are computed + ("train", 0, 2, {"a": 1, "b": 1}), # training epoch end + ("train", 1, 2, {"a": 1, "b": 1}), + ("train", 1, 3, {"a": 1, "b": 1}), + ("validate", 1, 3, {"a": 1, "b": 3}), # validation end + ("train", 1, 4, {"a": 3, "b": 3}), # training epoch end + ] + + trainer.validate(model, verbose=False) + assert pbar.calls["validate"] == [] + + trainer.test(model, verbose=False) + assert pbar.calls["test"] == [] From 6268716c2555a1d58b0ca2b9012c2ec1bf94451f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 16 Dec 2021 01:55:08 +0100 Subject: [PATCH 087/123] update version to 1.5.7 --- CHANGELOG.md | 6 ++++++ pytorch_lightning/__about__.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5c18801ef3595..400c3a16940f5 100644 --- a/CHANGELOG.md 
+++ b/CHANGELOG.md @@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). +## [1.5.7] - 2021-12-21 + +### Fixed + + + ## [1.5.6] - 2021-12-15 ### Fixed diff --git a/pytorch_lightning/__about__.py b/pytorch_lightning/__about__.py index d107d5d77c424..327dd9e6b61fb 100644 --- a/pytorch_lightning/__about__.py +++ b/pytorch_lightning/__about__.py @@ -1,7 +1,7 @@ import time _this_year = time.strftime("%Y") -__version__ = "1.5.6" +__version__ = "1.5.7" __author__ = "William Falcon et al." __author_email__ = "waf2107@columbia.edu" __license__ = "Apache-2.0" From 2596f2433cce0aae57821aaf232c1b9a045aedc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 16 Dec 2021 04:23:31 +0100 Subject: [PATCH 088/123] Support torch 1.10.1 (#11095) --- requirements/adjust_versions.py | 3 ++- requirements/test.txt | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/requirements/adjust_versions.py b/requirements/adjust_versions.py index 8295a726e7873..bdff1af6a7771 100644 --- a/requirements/adjust_versions.py +++ b/requirements/adjust_versions.py @@ -6,7 +6,8 @@ # IMPORTANT: this list needs to be sorted in reverse VERSIONS = [ dict(torch="1.11.0", torchvision="0.11.*", torchtext=""), # nightly - dict(torch="1.10.0", torchvision="0.11.1", torchtext="0.11.0"), # stable + dict(torch="1.10.1", torchvision="0.11.2", torchtext="0.11.1"), # stable + dict(torch="1.10.0", torchvision="0.11.1", torchtext="0.11.0"), dict(torch="1.9.1", torchvision="0.10.1", torchtext="0.10.1"), dict(torch="1.9.0", torchvision="0.10.0", torchtext="0.10.0"), # dict(torch="1.8.2", torchvision="0.9.1", torchtext="0.9.1"), # LTS # Not on PyPI, commented so 1.8.1 is used diff --git a/requirements/test.txt b/requirements/test.txt index d86137b037f4d..2f2228126a6ba 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -3,7 +3,7 @@ codecov>=2.1 pytest>=6.0 
pytest-rerunfailures>=10.2 twine==3.2 -mypy>=0.900 +mypy==0.910 flake8>=3.9.2 pre-commit>=1.0 From ff3d0dce3b0d93774017b01dc8c388794c3ae7e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 17 Dec 2021 00:36:48 +0100 Subject: [PATCH 089/123] Update mypy (#11096) Co-authored-by: Carlos Mocholi --- .../trainer/connectors/signal_connector.py | 40 ++++++++++--------- pytorch_lightning/utilities/model_helpers.py | 2 + requirements/test.txt | 2 +- 3 files changed, 25 insertions(+), 19 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/signal_connector.py b/pytorch_lightning/trainer/connectors/signal_connector.py index 2020aab7bd48f..44af6a512ef4d 100644 --- a/pytorch_lightning/trainer/connectors/signal_connector.py +++ b/pytorch_lightning/trainer/connectors/signal_connector.py @@ -3,7 +3,6 @@ import signal import sys import threading -from signal import Signals from subprocess import call from types import FrameType from typing import Any, Callable, Dict, List, Set, Union @@ -11,33 +10,38 @@ import pytorch_lightning as pl from pytorch_lightning.utilities.imports import _fault_tolerant_training, _IS_WINDOWS -log = logging.getLogger(__name__) +# copied from signal.pyi +_SIGNUM = Union[int, signal.Signals] +_HANDLER = Union[Callable[[_SIGNUM, FrameType], Any], int, signal.Handlers, None] -_SIGNAL_HANDLER_DICT = Dict[Signals, Union[Callable[[Signals, FrameType], Any], int, None]] +log = logging.getLogger(__name__) class HandlersCompose: - def __init__(self, signal_handlers: Union[List[Callable], Callable]) -> None: + def __init__(self, signal_handlers: Union[List[_HANDLER], _HANDLER]) -> None: if not isinstance(signal_handlers, list): signal_handlers = [signal_handlers] self.signal_handlers = signal_handlers - def __call__(self, signum: Signals, frame: FrameType) -> None: + def __call__(self, signum: _SIGNUM, frame: FrameType) -> None: for signal_handler in self.signal_handlers: - signal_handler(signum, frame) + if 
isinstance(signal_handler, int): + signal_handler = signal.getsignal(signal_handler) + if callable(signal_handler): + signal_handler(signum, frame) class SignalConnector: def __init__(self, trainer: "pl.Trainer") -> None: self.trainer = trainer self.trainer._terminate_gracefully = False - self._original_handlers: _SIGNAL_HANDLER_DICT = {} + self._original_handlers: Dict[_SIGNUM, _HANDLER] = {} def register_signal_handlers(self) -> None: self._original_handlers = self._get_current_signal_handlers() - sigusr1_handlers: List[Callable] = [] - sigterm_handlers: List[Callable] = [] + sigusr1_handlers: List[_HANDLER] = [] + sigterm_handlers: List[_HANDLER] = [] if _fault_tolerant_training(): sigusr1_handlers.append(self.fault_tolerant_sigusr1_handler_fn) @@ -55,7 +59,7 @@ def register_signal_handlers(self) -> None: if sigterm_handlers and not self._has_already_handler(signal.SIGTERM): self._register_signal(signal.SIGTERM, HandlersCompose(sigterm_handlers)) - def slurm_sigusr1_handler_fn(self, signum: Signals, frame: FrameType) -> None: + def slurm_sigusr1_handler_fn(self, signum: _SIGNUM, frame: FrameType) -> None: if self.trainer.is_global_zero: # save weights log.info("handling SIGUSR1") @@ -86,21 +90,21 @@ def slurm_sigusr1_handler_fn(self, signum: Signals, frame: FrameType) -> None: if self.trainer.logger: self.trainer.logger.finalize("finished") - def fault_tolerant_sigusr1_handler_fn(self, signum: Signals, frame: FrameType) -> None: + def fault_tolerant_sigusr1_handler_fn(self, signum: _SIGNUM, frame: FrameType) -> None: self.trainer._terminate_gracefully = True - def sigterm_handler_fn(self, signum: Signals, frame: FrameType) -> None: + def sigterm_handler_fn(self, signum: _SIGNUM, frame: FrameType) -> None: log.info("bypassing sigterm") def teardown(self) -> None: """Restores the signals that were previsouly configured before :class:`SignalConnector` replaced them.""" for signum, handler in self._original_handlers.items(): if handler is not None: - 
signal.signal(signum, handler) + signal.signal(signum, handler) # type: ignore[arg-type] self._original_handlers = {} @staticmethod - def _get_current_signal_handlers() -> _SIGNAL_HANDLER_DICT: + def _get_current_signal_handlers() -> Dict[_SIGNUM, _HANDLER]: """Collects the currently assigned signal handlers.""" valid_signals = SignalConnector._valid_signals() if not _IS_WINDOWS: @@ -109,7 +113,7 @@ def _get_current_signal_handlers() -> _SIGNAL_HANDLER_DICT: return {signum: signal.getsignal(signum) for signum in valid_signals} @staticmethod - def _valid_signals() -> Set[Signals]: + def _valid_signals() -> Set[signal.Signals]: """Returns all valid signals supported on the current platform. Behaves identically to :func:`signals.valid_signals` in Python 3.8+ and implements the equivalent behavior for @@ -143,13 +147,13 @@ def _is_on_slurm(self) -> bool: return on_slurm @staticmethod - def _has_already_handler(signum: Signals) -> bool: + def _has_already_handler(signum: _SIGNUM) -> bool: return signal.getsignal(signum) not in (None, signal.SIG_DFL) @staticmethod - def _register_signal(signum: Signals, handlers: HandlersCompose) -> None: + def _register_signal(signum: _SIGNUM, handlers: _HANDLER) -> None: if threading.current_thread() is threading.main_thread(): - signal.signal(signum, handlers) + signal.signal(signum, handlers) # type: ignore[arg-type] def __getstate__(self) -> Dict: state = self.__dict__.copy() diff --git a/pytorch_lightning/utilities/model_helpers.py b/pytorch_lightning/utilities/model_helpers.py index 3146b33fe153d..90707279500e0 100644 --- a/pytorch_lightning/utilities/model_helpers.py +++ b/pytorch_lightning/utilities/model_helpers.py @@ -47,6 +47,8 @@ def is_overridden( raise ValueError("Expected a parent") instance_attr = getattr(instance, method_name, None) + if instance_attr is None: + return False # `functools.wraps()` support if hasattr(instance_attr, "__wrapped__"): instance_attr = instance_attr.__wrapped__ diff --git 
a/requirements/test.txt b/requirements/test.txt index 2f2228126a6ba..941b53dc8c102 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -3,7 +3,7 @@ codecov>=2.1 pytest>=6.0 pytest-rerunfailures>=10.2 twine==3.2 -mypy==0.910 +mypy>=0.920 flake8>=3.9.2 pre-commit>=1.0 From acb38468b7161a97bbff116ea8ec9f7312a03a76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 17 Dec 2021 05:13:34 +0100 Subject: [PATCH 090/123] Teardown sync-batchnorm after training (#11078) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: ananthsub Co-authored-by: Carlos Mocholí --- CHANGELOG.md | 1 + .../plugins/training_type/ddp.py | 5 ++- .../plugins/training_type/ddp_spawn.py | 5 ++- pytorch_lightning/utilities/distributed.py | 35 +++++++++++++++++++ tests/models/test_sync_batchnorm.py | 7 +++- 5 files changed, 50 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 400c3a16940f5..df0a60f6e6f28 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Fixed +- Fixed an issue when torch-scripting a `LightningModule` after training with `Trainer(sync_batchnorm=True)` ([#11078](https://github.com/PyTorchLightning/pytorch-lightning/pull/11078)) ## [1.5.6] - 2021-12-15 diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index fe10088b19944..9c4a7e36c66ba 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -50,7 +50,7 @@ rank_zero_deprecation, rank_zero_warn, ) -from pytorch_lightning.utilities.distributed import distributed_available +from pytorch_lightning.utilities.distributed import _revert_sync_batchnorm, distributed_available from pytorch_lightning.utilities.distributed import group as _group from pytorch_lightning.utilities.distributed import ( init_dist_connection, @@ -536,6 +536,9 @@ def teardown(self) -> None: if isinstance(self.model, DistributedDataParallel): self.model = self.lightning_module + if self.sync_batchnorm: + self.model = _revert_sync_batchnorm(self.model) + if self.on_gpu: # GPU teardown self.lightning_module.cpu() diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index 4e787e77e668a..d71df296e8544 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -41,7 +41,7 @@ from pytorch_lightning.utilities.apply_func import apply_to_collection, move_data_to_device from pytorch_lightning.utilities.cloud_io import atomic_save from pytorch_lightning.utilities.cloud_io import load as pl_load -from pytorch_lightning.utilities.distributed import distributed_available +from pytorch_lightning.utilities.distributed import _revert_sync_batchnorm, distributed_available from pytorch_lightning.utilities.distributed import group as _group from pytorch_lightning.utilities.distributed import ( init_dist_connection, @@ -436,6 +436,9 @@ def 
teardown(self) -> None: if isinstance(self.model, DistributedDataParallel): self.model = self.lightning_module + if self.sync_batchnorm: + self.model = _revert_sync_batchnorm(self.model) + if self.on_gpu: # GPU teardown self.lightning_module.cpu() diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index 47fa7b791eae0..d4d488d973ebf 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -19,6 +19,7 @@ from typing import Any, Callable, List, Optional, Tuple, Union import torch +from torch.nn import Module from torch.nn.parallel.distributed import DistributedDataParallel import pytorch_lightning as pl @@ -394,3 +395,37 @@ def init_dist_connection( f"All distributed processes registered. Starting with {world_size} processes\n" f"{'-' * 100}\n" ) + + +class _BatchNormXd(torch.nn.modules.batchnorm._BatchNorm): + def _check_input_dim(self, input: torch.Tensor) -> None: + # The only difference between BatchNorm1d, BatchNorm2d, BatchNorm3d, etc + # is this method that is overwritten by the subclass. + # Here, we are bypassing some tensor sanity checks and trusting that the user + # provides the right input dimensions at inference. + return + + +def _revert_sync_batchnorm(module: Module) -> Module: + # Code adapted from https://github.com/pytorch/pytorch/issues/41081#issuecomment-783961547 + # Original author: Kapil Yedidi (@kapily) + converted_module = module + if isinstance(module, torch.nn.modules.batchnorm.SyncBatchNorm): + # Unfortunately, SyncBatchNorm does not store the original class - if it did + # we could return the one that was originally created. 
+ converted_module = _BatchNormXd( + module.num_features, module.eps, module.momentum, module.affine, module.track_running_stats + ) + if module.affine: + with torch.no_grad(): + converted_module.weight = module.weight + converted_module.bias = module.bias + converted_module.running_mean = module.running_mean + converted_module.running_var = module.running_var + converted_module.num_batches_tracked = module.num_batches_tracked + if hasattr(module, "qconfig"): + converted_module.qconfig = module.qconfig + for name, child in module.named_children(): + converted_module.add_module(name, _revert_sync_batchnorm(child)) + del module + return converted_module diff --git a/tests/models/test_sync_batchnorm.py b/tests/models/test_sync_batchnorm.py index 86c4a5af68b91..5035e71f928fc 100644 --- a/tests/models/test_sync_batchnorm.py +++ b/tests/models/test_sync_batchnorm.py @@ -37,6 +37,9 @@ def __init__(self, gpu_count=1, **kwargs): self.linear = nn.Linear(28 * 28, 10) self.bn_layer = nn.BatchNorm1d(28 * 28) + def on_train_start(self) -> None: + assert isinstance(self.bn_layer, torch.nn.modules.batchnorm.SyncBatchNorm) + def forward(self, x, batch_idx): with torch.no_grad(): out_bn = self.bn_layer(x.view(x.size(0), -1)) @@ -123,4 +126,6 @@ def test_sync_batchnorm_ddp(tmpdir): ) trainer.fit(model, dm) - assert trainer.state.finished, "Sync batchnorm failing with DDP" + # the strategy is responsible for tearing down the batchnorm wrappers + assert not isinstance(model.bn_layer, torch.nn.modules.batchnorm.SyncBatchNorm) + assert isinstance(model.bn_layer, torch.nn.modules.batchnorm._BatchNorm) From 9cf6168b3b449a2ac96c1922e454d4c793f91b67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 17 Dec 2021 14:12:02 +0100 Subject: [PATCH 091/123] Fix signal teardown outside main thread (#11124) --- pytorch_lightning/trainer/connectors/signal_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/pytorch_lightning/trainer/connectors/signal_connector.py b/pytorch_lightning/trainer/connectors/signal_connector.py index 44af6a512ef4d..8145a692ceeb4 100644 --- a/pytorch_lightning/trainer/connectors/signal_connector.py +++ b/pytorch_lightning/trainer/connectors/signal_connector.py @@ -100,7 +100,7 @@ def teardown(self) -> None: """Restores the signals that were previsouly configured before :class:`SignalConnector` replaced them.""" for signum, handler in self._original_handlers.items(): if handler is not None: - signal.signal(signum, handler) # type: ignore[arg-type] + self._register_signal(signum, handler) self._original_handlers = {} @staticmethod From d4cfcc35524331b4f7e58e93c24ab182fc627921 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 17 Dec 2021 19:02:25 +0100 Subject: [PATCH 092/123] Fix AttributeError when using CombinedLoader in prediction (#11111) Co-authored-by: Carlos Mocholi --- CHANGELOG.md | 1 + .../loops/epoch/prediction_epoch_loop.py | 3 +- tests/callbacks/test_prediction_writer.py | 34 +++++++++++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index df0a60f6e6f28..3a485566372ae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Fixed - Fixed an issue when torch-scripting a `LightningModule` after training with `Trainer(sync_batchnorm=True)` ([#11078](https://github.com/PyTorchLightning/pytorch-lightning/pull/11078)) +- Fixed an `AttributeError` occuring when using a `CombinedLoader` (multiple dataloaders) for prediction ([#11111](https://github.com/PyTorchLightning/pytorch-lightning/pull/11111)) ## [1.5.6] - 2021-12-15 diff --git a/pytorch_lightning/loops/epoch/prediction_epoch_loop.py b/pytorch_lightning/loops/epoch/prediction_epoch_loop.py index e5fa46fe05836..6e3b326da534d 100644 --- a/pytorch_lightning/loops/epoch/prediction_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/prediction_epoch_loop.py @@ -165,7 +165,8 @@ def _build_kwargs(self, batch: Any, batch_idx: int, dataloader_idx: int) -> Dict def _get_batch_indices(self, dataloader_idx: int) -> List[List[int]]: """Returns a reference to the seen batch indices if the dataloader has a batch sampler wrapped by our :class:`~pytorch_lightning.overrides.distributed.IndexBatchSamplerWrapper`.""" - batch_sampler = self.trainer.predict_dataloaders[dataloader_idx].batch_sampler + # the batch_sampler is not be defined in case of CombinedDataLoaders + batch_sampler = getattr(self.trainer.predict_dataloaders[dataloader_idx], "batch_sampler", None) if isinstance(batch_sampler, IndexBatchSamplerWrapper) and self.should_store_predictions: return batch_sampler.seen_batch_indices diff --git a/tests/callbacks/test_prediction_writer.py b/tests/callbacks/test_prediction_writer.py index 2cd3738ca875f..f086316052995 100644 --- a/tests/callbacks/test_prediction_writer.py +++ b/tests/callbacks/test_prediction_writer.py @@ -16,8 +16,10 @@ import pytest from torch.utils.data import DataLoader +import pytorch_lightning as pl from pytorch_lightning import Trainer from pytorch_lightning.callbacks import BasePredictionWriter +from pytorch_lightning.trainer.supporters import CombinedLoader from pytorch_lightning.utilities.exceptions import 
MisconfigurationException from tests.helpers import BoringModel, RandomDataset from tests.helpers.runif import RunIf @@ -106,3 +108,35 @@ def test_prediction_writer_batch_indices(tmpdir, num_workers): call(trainer, model, ANY, [[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]]), ] ) + + +def test_prediction_writer_partial_support_for_combined_loader(tmpdir): + """Test partial support for CombinedLoader: prediction works but sample indices don't get tracked.""" + pl.loops.epoch.prediction_epoch_loop.warning_cache.clear() + + class PredictionModel(BoringModel): + def predict_dataloader(self): + return CombinedLoader( + { + "a": DataLoader(RandomDataset(32, 8), batch_size=2), + "b": DataLoader(RandomDataset(32, 8), batch_size=4), + } + ) + + def predict_step(self, batch, *args, **kwargs): + return self(batch["a"]) + + DummyPredictionWriter.write_on_batch_end = Mock() + DummyPredictionWriter.write_on_epoch_end = Mock() + + model = PredictionModel() + writer = DummyPredictionWriter("batch_and_epoch") + trainer = Trainer(callbacks=writer) + with pytest.warns(UserWarning, match="Lightning couldn't infer the indices fetched for your dataloader."): + trainer.predict(model) + + writer.write_on_batch_end.assert_has_calls( + [call(trainer, model, ANY, [], ANY, 0, 0), call(trainer, model, ANY, [], ANY, 1, 0)] + ) + + writer.write_on_epoch_end.assert_has_calls([call(trainer, model, ANY, [[]])]) From a542f8271ffbe827d0da0c8b3aef888e408cc0af Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Sat, 18 Dec 2021 00:10:56 +0530 Subject: [PATCH 093/123] Enable logging hparams only if there are any (#11105) --- CHANGELOG.md | 1 + pytorch_lightning/core/mixins/hparams_mixin.py | 2 +- tests/callbacks/test_gpu_stats_monitor.py | 2 +- tests/loggers/test_all.py | 2 -- tests/loggers/test_base.py | 5 ++--- tests/models/test_hparams.py | 5 ++++- tests/trainer/logging_/test_distributed_logging.py | 1 - tests/trainer/logging_/test_eval_loop_logging.py | 4 ++++ 8 files changed, 13 
insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3a485566372ae..b0ebb9a37aa4c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed +- Fixed a bug to disable logging hyperparameters in logger if there are no hparams ([#11105](https://github.com/PyTorchLightning/pytorch-lightning/issues/11105)) - Fixed an issue when torch-scripting a `LightningModule` after training with `Trainer(sync_batchnorm=True)` ([#11078](https://github.com/PyTorchLightning/pytorch-lightning/pull/11078)) - Fixed an `AttributeError` occuring when using a `CombinedLoader` (multiple dataloaders) for prediction ([#11111](https://github.com/PyTorchLightning/pytorch-lightning/pull/11111)) diff --git a/pytorch_lightning/core/mixins/hparams_mixin.py b/pytorch_lightning/core/mixins/hparams_mixin.py index 0e722f2bdb683..26a272dd3dd1d 100644 --- a/pytorch_lightning/core/mixins/hparams_mixin.py +++ b/pytorch_lightning/core/mixins/hparams_mixin.py @@ -28,7 +28,7 @@ class HyperparametersMixin: def __init__(self) -> None: super().__init__() - self._log_hyperparams = True + self._log_hyperparams = False def save_hyperparameters( self, diff --git a/tests/callbacks/test_gpu_stats_monitor.py b/tests/callbacks/test_gpu_stats_monitor.py index 5ed3f533b5588..ca9197c6a078c 100644 --- a/tests/callbacks/test_gpu_stats_monitor.py +++ b/tests/callbacks/test_gpu_stats_monitor.py @@ -83,7 +83,7 @@ def test_gpu_stats_monitor_no_queries(tmpdir): with mock.patch("pytorch_lightning.loggers.tensorboard.TensorBoardLogger.log_metrics") as log_metrics_mock: trainer.fit(model) - assert log_metrics_mock.mock_calls[2:] == [ + assert log_metrics_mock.mock_calls[1:] == [ mock.call({"batch_time/intra_step (ms)": mock.ANY}, step=0), mock.call({"batch_time/inter_step (ms)": mock.ANY}, step=1), mock.call({"batch_time/intra_step (ms)": mock.ANY}, step=1), diff --git a/tests/loggers/test_all.py 
b/tests/loggers/test_all.py index d66e77b4cea34..3bedd26621238 100644 --- a/tests/loggers/test_all.py +++ b/tests/loggers/test_all.py @@ -144,10 +144,8 @@ def log_metrics(self, metrics, step): log_metric_names = [(s, sorted(m.keys())) for s, m in logger.history] if logger_class == TensorBoardLogger: expected = [ - (0, ["hp_metric"]), (0, ["epoch", "train_some_val"]), (0, ["early_stop_on", "epoch", "val_loss"]), - (0, ["hp_metric"]), (1, ["epoch", "test_loss"]), ] assert log_metric_names == expected diff --git a/tests/loggers/test_base.py b/tests/loggers/test_base.py index f4a91f63b50c4..224271709f5f7 100644 --- a/tests/loggers/test_base.py +++ b/tests/loggers/test_base.py @@ -111,7 +111,6 @@ def training_step(self, batch, batch_idx): trainer = Trainer(max_steps=2, log_every_n_steps=1, logger=logger, default_root_dir=tmpdir) trainer.fit(model) assert trainer.state.finished, f"Training failed with {trainer.state}" - assert logger.hparams_logged == model.hparams assert logger.metrics_logged != {} assert logger.after_save_checkpoint_called assert logger.finalized_status == "success" @@ -133,11 +132,11 @@ def training_step(self, batch, batch_idx): trainer.fit(model) assert trainer.state.finished, f"Training failed with {trainer.state}" - assert logger1.hparams_logged == model.hparams + assert logger1.hparams_logged is None assert logger1.metrics_logged != {} assert logger1.finalized_status == "success" - assert logger2.hparams_logged == model.hparams + assert logger2.hparams_logged is None assert logger2.metrics_logged != {} assert logger2.finalized_status == "success" diff --git a/tests/models/test_hparams.py b/tests/models/test_hparams.py index dbd51d33bf0ed..d2ea07a12ea49 100644 --- a/tests/models/test_hparams.py +++ b/tests/models/test_hparams.py @@ -776,7 +776,10 @@ def test_adding_datamodule_hparams(tmpdir, model, data): # Merged hparams were logged merged_hparams = copy.deepcopy(org_model_hparams) merged_hparams.update(org_data_hparams) - 
mock_logger.log_hyperparams.assert_called_with(merged_hparams) + if merged_hparams: + mock_logger.log_hyperparams.assert_called_with(merged_hparams) + else: + mock_logger.log_hyperparams.assert_not_called() def test_no_datamodule_for_hparams(tmpdir): diff --git a/tests/trainer/logging_/test_distributed_logging.py b/tests/trainer/logging_/test_distributed_logging.py index d4ba4f242294a..36c266343b849 100644 --- a/tests/trainer/logging_/test_distributed_logging.py +++ b/tests/trainer/logging_/test_distributed_logging.py @@ -112,7 +112,6 @@ def on_fit_start(self, trainer, pl_module): def on_train_start(self, trainer, pl_module): assert trainer.logger.method_call - trainer.logger.log_hyperparams.assert_called_once() trainer.logger.log_graph.assert_called_once() logger = Mock() diff --git a/tests/trainer/logging_/test_eval_loop_logging.py b/tests/trainer/logging_/test_eval_loop_logging.py index c205ed8c6af48..d47cb1ef7d3bf 100644 --- a/tests/trainer/logging_/test_eval_loop_logging.py +++ b/tests/trainer/logging_/test_eval_loop_logging.py @@ -510,6 +510,10 @@ class ExtendedModel(BoringModel): val_losses = [] + def __init__(self, some_val=7): + super().__init__() + self.save_hyperparameters() + def training_step(self, batch, batch_idx): output = self.layer(batch) loss = self.loss(batch, output) From 8b2c39b5b7c26e065808895e74e5fe8b1d36d48c Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Sat, 18 Dec 2021 08:23:37 +0530 Subject: [PATCH 094/123] Fix tpu spawn plugin test (#11131) --- dockers/tpu-tests/tpu_test_cases.jsonnet | 1 + tests/plugins/test_tpu_spawn.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/dockers/tpu-tests/tpu_test_cases.jsonnet b/dockers/tpu-tests/tpu_test_cases.jsonnet index 55454e7cac0a2..530c40e49ed3e 100644 --- a/dockers/tpu-tests/tpu_test_cases.jsonnet +++ b/dockers/tpu-tests/tpu_test_cases.jsonnet @@ -33,6 +33,7 @@ local tputests = base.BaseTest { echo 
$KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}" coverage run --source=pytorch_lightning -m pytest -v --capture=no \ + tests/plugins/test_tpu_spawn.py \ tests/profiler/test_xla_profiler.py \ pytorch_lightning/utilities/xla_device.py \ tests/accelerators/test_tpu.py \ diff --git a/tests/plugins/test_tpu_spawn.py b/tests/plugins/test_tpu_spawn.py index 5f4abf560d6a6..ba5dc0e9d5f0d 100644 --- a/tests/plugins/test_tpu_spawn.py +++ b/tests/plugins/test_tpu_spawn.py @@ -86,7 +86,7 @@ def test_error_process_iterable_dataloader(_): class BoringModelTPU(BoringModel): def on_train_start(self) -> None: - assert self.device == torch.device("xla") + assert self.device == torch.device("xla", index=1) assert os.environ.get("PT_XLA_DEBUG") == "1" @@ -94,12 +94,12 @@ def on_train_start(self) -> None: @pl_multi_process_test def test_model_tpu_one_core(): """Tests if device/debug flag is set correctely when training and after teardown for TPUSpawnPlugin.""" - trainer = Trainer(tpu_cores=1, fast_dev_run=True, plugin=TPUSpawnPlugin(debug=True)) + trainer = Trainer(tpu_cores=1, fast_dev_run=True, strategy=TPUSpawnPlugin(debug=True)) # assert training type plugin attributes for device setting assert isinstance(trainer.training_type_plugin, TPUSpawnPlugin) assert not trainer.training_type_plugin.on_gpu assert trainer.training_type_plugin.on_tpu - assert trainer.training_type_plugin.root_device == torch.device("xla") + assert trainer.training_type_plugin.root_device == torch.device("xla", index=1) model = BoringModelTPU() trainer.fit(model) assert "PT_XLA_DEBUG" not in os.environ From a6599048786383576252ed12e9184886e6dfcba9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Jankowski?= Date: Sat, 18 Dec 2021 02:40:13 +0100 Subject: [PATCH 095/123] Fixed NeptuneLogger when using DDP (#11030) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí Co-authored-by: 
Rohit Gupta --- CHANGELOG.md | 1 + environment.yml | 2 +- pytorch_lightning/loggers/neptune.py | 126 ++++++++++++++++----------- tests/loggers/test_all.py | 6 +- tests/loggers/test_neptune.py | 6 +- 5 files changed, 86 insertions(+), 55 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b0ebb9a37aa4c..ee523731092b7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed +- Fixed `NeptuneLogger` when using DDP ([#11030](https://github.com/PyTorchLightning/pytorch-lightning/pull/11030)) - Fixed a bug to disable logging hyperparameters in logger if there are no hparams ([#11105](https://github.com/PyTorchLightning/pytorch-lightning/issues/11105)) - Fixed an issue when torch-scripting a `LightningModule` after training with `Trainer(sync_batchnorm=True)` ([#11078](https://github.com/PyTorchLightning/pytorch-lightning/pull/11078)) - Fixed an `AttributeError` occuring when using a `CombinedLoader` (multiple dataloaders) for prediction ([#11111](https://github.com/PyTorchLightning/pytorch-lightning/pull/11111)) diff --git a/environment.yml b/environment.yml index d7d34c387af15..7e3c84e913f9e 100644 --- a/environment.yml +++ b/environment.yml @@ -51,4 +51,4 @@ dependencies: - mlflow>=1.0.0 - comet_ml>=3.1.12 - wandb>=0.8.21 - - neptune-client>=0.4.109 + - neptune-client>=0.10.0 diff --git a/pytorch_lightning/loggers/neptune.py b/pytorch_lightning/loggers/neptune.py index 97876afe65368..2e7031ff46c4f 100644 --- a/pytorch_lightning/loggers/neptune.py +++ b/pytorch_lightning/loggers/neptune.py @@ -44,7 +44,7 @@ from neptune.new.types import File as NeptuneFile except ModuleNotFoundError: import neptune - from neptune.exceptions import NeptuneLegacyProjectException + from neptune.exceptions import NeptuneLegacyProjectException, NeptuneOfflineModeFetchException from neptune.run import Run from neptune.types import File as NeptuneFile else: @@ -266,51 +266,64 @@ def __init__( 
prefix: str = "training", **neptune_run_kwargs, ): - # verify if user passed proper init arguments self._verify_input_arguments(api_key, project, name, run, neptune_run_kwargs) + if neptune is None: + raise ModuleNotFoundError( + "You want to use the `Neptune` logger which is not installed yet, install it with" + " `pip install neptune-client`." + ) super().__init__() self._log_model_checkpoints = log_model_checkpoints self._prefix = prefix + self._run_name = name + self._project_name = project + self._api_key = api_key + self._run_instance = run + self._neptune_run_kwargs = neptune_run_kwargs + self._run_short_id = None - self._run_instance = self._init_run_instance(api_key, project, name, run, neptune_run_kwargs) + if self._run_instance is not None: + self._retrieve_run_data() - self._run_short_id = self.run._short_id # skipcq: PYL-W0212 + # make sure that we've log integration version for outside `Run` instances + self._run_instance[_INTEGRATION_VERSION_KEY] = __version__ + + def _retrieve_run_data(self): try: - self.run.wait() + self._run_instance.wait() + self._run_short_id = self.run._short_id # skipcq: PYL-W0212 self._run_name = self._run_instance["sys/name"].fetch() except NeptuneOfflineModeFetchException: self._run_name = "offline-name" - def _init_run_instance(self, api_key, project, name, run, neptune_run_kwargs) -> Run: - if run is not None: - run_instance = run - else: - try: - run_instance = neptune.init( - project=project, - api_token=api_key, - name=name, - **neptune_run_kwargs, - ) - except NeptuneLegacyProjectException as e: - raise TypeError( - f"""Project {project} has not been migrated to the new structure. 
- You can still integrate it with the Neptune logger using legacy Python API - available as part of neptune-contrib package: - - https://docs-legacy.neptune.ai/integrations/pytorch_lightning.html\n - """ - ) from e - - # make sure that we've log integration version for both newly created and outside `Run` instances - run_instance[_INTEGRATION_VERSION_KEY] = __version__ - - # keep api_key and project, they will be required when resuming Run for pickled logger - self._api_key = api_key - self._project_name = run_instance._project_name # skipcq: PYL-W0212 + @property + def _neptune_init_args(self): + args = {} + # Backward compatibility in case of previous version retrieval + try: + args = self._neptune_run_kwargs + except AttributeError: + pass + + if self._project_name is not None: + args["project"] = self._project_name + + if self._api_key is not None: + args["api_token"] = self._api_key - return run_instance + if self._run_short_id is not None: + args["run"] = self._run_short_id + + # Backward compatibility in case of previous version retrieval + try: + if self._run_name is not None: + args["name"] = self._run_name + except AttributeError: + pass + + return args def _construct_path_with_prefix(self, *keys) -> str: """Return sequence of keys joined by `LOGGER_JOIN_CHAR`, started with `_prefix` if defined.""" @@ -379,7 +392,7 @@ def __getstate__(self): def __setstate__(self, state): self.__dict__ = state - self._run_instance = neptune.init(project=self._project_name, api_token=self._api_key, run=self._run_short_id) + self._run_instance = neptune.init(**self._neptune_init_args) @property @rank_zero_experiment @@ -412,8 +425,23 @@ def training_step(self, batch, batch_idx): return self.run @property + @rank_zero_experiment def run(self) -> Run: - return self._run_instance + try: + if not self._run_instance: + self._run_instance = neptune.init(**self._neptune_init_args) + self._retrieve_run_data() + # make sure that we've log integration version for newly created + 
self._run_instance[_INTEGRATION_VERSION_KEY] = __version__ + + return self._run_instance + except NeptuneLegacyProjectException as e: + raise TypeError( + f"Project {self._project_name} has not been migrated to the new structure." + " You can still integrate it with the Neptune logger using legacy Python API" + " available as part of neptune-contrib package:" + " https://docs-legacy.neptune.ai/integrations/pytorch_lightning.html\n" + ) from e @rank_zero_only def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: # skipcq: PYL-W0221 @@ -473,13 +501,13 @@ def log_metrics(self, metrics: Dict[str, Union[torch.Tensor, float]], step: Opti for key, val in metrics.items(): # `step` is ignored because Neptune expects strictly increasing step values which - # Lighting does not always guarantee. - self.experiment[key].log(val) + # Lightning does not always guarantee. + self.run[key].log(val) @rank_zero_only def finalize(self, status: str) -> None: if status: - self.experiment[self._construct_path_with_prefix("status")] = status + self.run[self._construct_path_with_prefix("status")] = status super().finalize(status) @@ -493,12 +521,14 @@ def save_dir(self) -> Optional[str]: """ return os.path.join(os.getcwd(), ".neptune") + @rank_zero_only def log_model_summary(self, model, max_depth=-1): model_str = str(ModelSummary(model=model, max_depth=max_depth)) - self.experiment[self._construct_path_with_prefix("model/summary")] = neptune.types.File.from_content( + self.run[self._construct_path_with_prefix("model/summary")] = neptune.types.File.from_content( content=model_str, extension="txt" ) + @rank_zero_only def after_save_checkpoint(self, checkpoint_callback: "ReferenceType[ModelCheckpoint]") -> None: """Automatically log checkpointed model. Called after model checkpoint callback saves a new checkpoint. 
@@ -515,35 +545,33 @@ def after_save_checkpoint(self, checkpoint_callback: "ReferenceType[ModelCheckpo if checkpoint_callback.last_model_path: model_last_name = self._get_full_model_name(checkpoint_callback.last_model_path, checkpoint_callback) file_names.add(model_last_name) - self.experiment[f"{checkpoints_namespace}/{model_last_name}"].upload(checkpoint_callback.last_model_path) + self.run[f"{checkpoints_namespace}/{model_last_name}"].upload(checkpoint_callback.last_model_path) # save best k models for key in checkpoint_callback.best_k_models.keys(): model_name = self._get_full_model_name(key, checkpoint_callback) file_names.add(model_name) - self.experiment[f"{checkpoints_namespace}/{model_name}"].upload(key) + self.run[f"{checkpoints_namespace}/{model_name}"].upload(key) # log best model path and checkpoint if checkpoint_callback.best_model_path: - self.experiment[ - self._construct_path_with_prefix("model/best_model_path") - ] = checkpoint_callback.best_model_path + self.run[self._construct_path_with_prefix("model/best_model_path")] = checkpoint_callback.best_model_path model_name = self._get_full_model_name(checkpoint_callback.best_model_path, checkpoint_callback) file_names.add(model_name) - self.experiment[f"{checkpoints_namespace}/{model_name}"].upload(checkpoint_callback.best_model_path) + self.run[f"{checkpoints_namespace}/{model_name}"].upload(checkpoint_callback.best_model_path) # remove old models logged to experiment if they are not part of best k models at this point - if self.experiment.exists(checkpoints_namespace): - exp_structure = self.experiment.get_structure() + if self.run.exists(checkpoints_namespace): + exp_structure = self.run.get_structure() uploaded_model_names = self._get_full_model_names_from_exp_structure(exp_structure, checkpoints_namespace) for file_to_drop in list(uploaded_model_names - file_names): - del self.experiment[f"{checkpoints_namespace}/{file_to_drop}"] + del self.run[f"{checkpoints_namespace}/{file_to_drop}"] # log 
best model score if checkpoint_callback.best_model_score: - self.experiment[self._construct_path_with_prefix("model/best_model_score")] = ( + self.run[self._construct_path_with_prefix("model/best_model_score")] = ( checkpoint_callback.best_model_score.cpu().detach().numpy() ) @@ -637,13 +665,11 @@ def log_artifact(self, artifact: str, destination: Optional[str] = None) -> None self._signal_deprecated_api_usage("log_artifact", f"logger.run['{key}].log('path_to_file')") self.run[key].log(destination) - @rank_zero_only def set_property(self, *args, **kwargs): self._signal_deprecated_api_usage( "log_artifact", f"logger.run['{self._prefix}/{self.PARAMETERS_KEY}/key'].log(value)", raise_exception=True ) - @rank_zero_only def append_tags(self, *args, **kwargs): self._signal_deprecated_api_usage( "append_tags", "logger.run['sys/tags'].add(['foo', 'bar'])", raise_exception=True diff --git a/tests/loggers/test_all.py b/tests/loggers/test_all.py index 3bedd26621238..b12f7c8286c62 100644 --- a/tests/loggers/test_all.py +++ b/tests/loggers/test_all.py @@ -47,6 +47,8 @@ def _get_logger_args(logger_class, save_dir): logger_args.update(offline_mode=True) if "offline" in inspect.getfullargspec(logger_class).args: logger_args.update(offline=True) + if issubclass(logger_class, NeptuneLogger): + logger_args.update(mode="offline") return logger_args @@ -320,7 +322,9 @@ def on_train_batch_start(self, trainer, pl_module, batch, batch_idx): @RunIf(skip_windows=True, skip_49370=True, skip_hanging_spawn=True) -@pytest.mark.parametrize("logger_class", [CometLogger, CSVLogger, MLFlowLogger, TensorBoardLogger, TestTubeLogger]) +@pytest.mark.parametrize( + "logger_class", [CometLogger, CSVLogger, MLFlowLogger, NeptuneLogger, TensorBoardLogger, TestTubeLogger] +) def test_logger_created_on_rank_zero_only(tmpdir, monkeypatch, logger_class): """Test that loggers get replaced by dummy loggers on global rank > 0.""" _patch_comet_atexit(monkeypatch) diff --git a/tests/loggers/test_neptune.py 
b/tests/loggers/test_neptune.py index cb7ef9c5156cc..ddea7b2419608 100644 --- a/tests/loggers/test_neptune.py +++ b/tests/loggers/test_neptune.py @@ -77,7 +77,7 @@ def tmpdir_unittest_fixture(request, tmpdir): class TestNeptuneLogger(unittest.TestCase): def test_neptune_online(self, neptune): logger = NeptuneLogger(api_key="test", project="project") - created_run_mock = logger._run_instance + created_run_mock = logger.run self.assertEqual(logger._run_instance, created_run_mock) self.assertEqual(logger.name, "Run test name") @@ -109,7 +109,7 @@ def test_neptune_pickling(self, neptune): pickled_logger = pickle.dumps(logger) unpickled = pickle.loads(pickled_logger) - neptune.init.assert_called_once_with(project="test-project", api_token=None, run="TEST-42") + neptune.init.assert_called_once_with(name="Test name", run=unpickleable_run._short_id) self.assertIsNotNone(unpickled.experiment) @patch("pytorch_lightning.loggers.neptune.Run", Run) @@ -360,7 +360,7 @@ def test_legacy_functions(self, neptune, neptune_file_mock, warnings_mock): logger = NeptuneLogger(api_key="test", project="project") # test deprecated functions which will be shut down in pytorch-lightning 1.7.0 - attr_mock = logger._run_instance.__getitem__ + attr_mock = logger.run.__getitem__ attr_mock.reset_mock() fake_image = {} From 7971e78b48b8e030692978c5455910b97a016d7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 17 Dec 2021 10:11:11 +0100 Subject: [PATCH 096/123] Avoid the deprecated `onnx.export(example_outputs=...)` in torch 1.10 (#11116) --- CHANGELOG.md | 1 + pytorch_lightning/core/lightning.py | 2 +- tests/models/test_onnx.py | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ee523731092b7..6cfd858a126cc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed `NeptuneLogger` when using DDP ([#11030](https://github.com/PyTorchLightning/pytorch-lightning/pull/11030)) - Fixed a bug to disable logging hyperparameters in logger if there are no hparams ([#11105](https://github.com/PyTorchLightning/pytorch-lightning/issues/11105)) +- Avoid the deprecated `onnx.export(example_outputs=...)` in torch 1.10 ([#11116](https://github.com/PyTorchLightning/pytorch-lightning/pull/11116)) - Fixed an issue when torch-scripting a `LightningModule` after training with `Trainer(sync_batchnorm=True)` ([#11078](https://github.com/PyTorchLightning/pytorch-lightning/pull/11078)) - Fixed an `AttributeError` occuring when using a `CombinedLoader` (multiple dataloaders) for prediction ([#11111](https://github.com/PyTorchLightning/pytorch-lightning/pull/11111)) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 8e77baf6ec539..74485f7ddc89e 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1889,7 +1889,7 @@ def to_onnx(self, file_path: Union[str, Path], input_sample: Optional[Any] = Non input_sample = self._apply_batch_transfer_handler(input_sample) - if "example_outputs" not in kwargs: + if not _TORCH_GREATER_EQUAL_1_10 and "example_outputs" not in kwargs: self.eval() if isinstance(input_sample, Tuple): kwargs["example_outputs"] = self(*input_sample) diff --git a/tests/models/test_onnx.py b/tests/models/test_onnx.py index 7ab425dd12ea6..d111b266fb115 100644 --- a/tests/models/test_onnx.py +++ b/tests/models/test_onnx.py @@ -53,6 +53,7 @@ def test_model_saves_on_gpu(tmpdir): assert os.path.getsize(file_path) > 4e2 +@RunIf(max_torch="1.10") def test_model_saves_with_example_output(tmpdir): """Test that ONNX model saves when provided with example output.""" model = BoringModel() From 327cba5202d77c14fc8faf73e3dff3dabd36883d Mon Sep 17 00:00:00 2001 From: Sean Naren Date: Fri, 17 Dec 2021 12:36:53 +0000 Subject: [PATCH 097/123] Remove partitioning of model in 
ZeRO 3 (#10655) (cherry picked from commit c66cd12445481357cb4e29d69a85d021d5b876ea) --- .azure-pipelines/gpu-tests.yml | 2 +- CHANGELOG.md | 3 ++ dockers/base-cuda/Dockerfile | 2 +- .../plugins/training_type/deepspeed.py | 19 +------ tests/plugins/test_deepspeed_plugin.py | 52 +++++-------------- 5 files changed, 20 insertions(+), 58 deletions(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index 8752e8584439a..ca8c54a61479e 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -51,7 +51,7 @@ jobs: - bash: | python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)" pip install fairscale==0.4.0 - pip install deepspeed==0.5.4 + pip install deepspeed==0.5.7 pip install . --requirement requirements/devel.txt pip list displayName: 'Install dependencies' diff --git a/CHANGELOG.md b/CHANGELOG.md index 6cfd858a126cc..8bf2ee6664270 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed an issue when torch-scripting a `LightningModule` after training with `Trainer(sync_batchnorm=True)` ([#11078](https://github.com/PyTorchLightning/pytorch-lightning/pull/11078)) - Fixed an `AttributeError` occuring when using a `CombinedLoader` (multiple dataloaders) for prediction ([#11111](https://github.com/PyTorchLightning/pytorch-lightning/pull/11111)) +### Changed + +- DeepSpeed does not require lightning module zero 3 partitioning ([#10655](https://github.com/PyTorchLightning/pytorch-lightning/pull/10655)) ## [1.5.6] - 2021-12-15 diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 99e8d018f2884..d70761cbdd37a 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -112,7 +112,7 @@ RUN \ RUN \ # install DeepSpeed - pip install deepspeed==0.5.4 + pip install deepspeed==0.5.7 RUN \ # Show what we have diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 4b08c8dc8b039..3359e7776d6e5 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -129,7 +129,6 @@ def __init__( contiguous_memory_optimization: bool = False, synchronize_checkpoint_boundary: bool = False, load_full_weights: bool = False, - partition_module: bool = True, ) -> None: """Provides capabilities to run training using the DeepSpeed library, with training optimizations for large billion parameter models. `For more information: https://pytorch- @@ -259,12 +258,6 @@ def __init__( load_full_weights: True when loading a single checkpoint file containing the model state dict when using ZeRO Stage 3. This differs from the DeepSpeed checkpoint which contains shards per worker. - - partition_module: When True, partitions the ``LightningModule`` across devices when using ZeRO Stage 3. - This is the default behaviour to ensure that the entire module is appropriately initialized - for DeepSpeed. 
When False we do not explicitly convert the model, which is fine if NO layers - or ALL layers are defined in ``configure_sharded_model``. This is useful for layers such as - ``torch.nn.RNN`` which do internal logic when moving to device. """ if not _DEEPSPEED_AVAILABLE: raise MisconfigurationException( @@ -317,7 +310,6 @@ def __init__( self.remote_device = remote_device self.load_full_weights = load_full_weights - self.partition_module = partition_module # default FP16 parameters. self.loss_scale = loss_scale @@ -463,13 +455,6 @@ def init_deepspeed(self): precision = self.lightning_module.trainer.accelerator.precision model = LightningDeepSpeedModule(pl_module=self.model, precision=precision) - if self.zero_stage_3 and self.partition_module: - # Ensure the entire model has been moved to the appropriate device - dtype = torch.float16 if self.precision in (16, "mixed") else torch.float32 - deepspeed.zero.Init( - module=model, remote_device=self.remote_device, pin_memory=True, config=self.config, dtype=dtype - ) - if self.lightning_module.trainer and self.lightning_module.trainer.training: self._initialize_deepspeed_train(model) else: @@ -524,7 +509,7 @@ def model_sharded_context(self) -> Generator[None, None, None]: assert self._config_initialized dtype = torch.float16 if self.precision in (16, "mixed") else torch.float32 model_parallel_context = deepspeed.zero.Init( - remote_device=self.remote_device, pin_memory=True, config=self.config, dtype=dtype + remote_device=self.remote_device, pin_memory=True, config_dict_or_path=self.config, dtype=dtype ) else: model_parallel_context = super().model_sharded_context() @@ -554,7 +539,7 @@ def _initialize_deepspeed_inference(self, model): optimizer, lr_scheduler, _ = self._init_optimizers() scheduler = lr_scheduler["scheduler"] inference_config = { - # todo: this is required for DeepSpeed throughput timers, or throughput timers will be incorrect + # todo: this is required for DeepSpeed throughput timers 
"train_micro_batch_size_per_gpu": 1 } if "fp16" in self.config: diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index bb65c61e057fd..d2205e59773d4 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -595,7 +595,9 @@ def test_deepspeed_multigpu_stage_3_manual_optimization(tmpdir, deepspeed_config _assert_save_model_is_equal(model, tmpdir, trainer) -def run_checkpoint_test(tmpdir: str, automatic_optimization: bool = True, accumulate_grad_batches: int = 2): +@pytest.mark.parametrize(("accumulate_grad_batches", "automatic_optimization"), [(1, False), (2, True)]) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) +def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, automatic_optimization, accumulate_grad_batches): seed_everything(1) if automatic_optimization: model = ModelParallelClassificationModel() @@ -630,13 +632,6 @@ def run_checkpoint_test(tmpdir: str, automatic_optimization: bool = True, accumu assert results[0]["test_acc"] > 0.7 -@RunIf(min_gpus=2, deepspeed=True, standalone=True) -def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir): - """Test to ensure with Stage 3 and multiple GPUs that we can save/load a model resuming from a checkpoint, and - see convergence.""" - run_checkpoint_test(tmpdir) - - @RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_multigpu_stage_3_warns_resume_training(tmpdir): """Test to ensure with Stage 3 and multiple GPUs that we can resume from training, throwing a warning that the @@ -718,24 +713,9 @@ def on_train_batch_start( trainer.fit(model, datamodule=dm, ckpt_path=ck.best_model_path) +@pytest.mark.parametrize("offload_optimizer", [False, True]) @RunIf(min_gpus=2, deepspeed=True, standalone=True) -def test_deepspeed_multigpu_stage_3_checkpointing_full_weights_manual(tmpdir): - """Test to ensure with Stage 3 and multiple GPUs that we can save/load a model resuming from a checkpoint, - where we save the full 
weights to one file.""" - run_checkpoint_test(tmpdir, automatic_optimization=False, accumulate_grad_batches=1) - - -@RunIf(min_gpus=2, deepspeed=True, standalone=True) -def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir): - _deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer=False) - - -@RunIf(min_gpus=2, deepspeed=True, standalone=True) -def test_deepspeed_multigpu_stage_2_accumulated_grad_batches_offload_optimizer(tmpdir): - _deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer=True) - - -def _deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer): +def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer): """Test to ensure with Stage 2 and multiple GPUs, accumulated grad batches works.""" seed_everything(42) @@ -781,6 +761,8 @@ def test_deepspeed_multigpu_test(tmpdir): trainer.test(model) +# TODO(Sean): Once partial parameter partitioning is supported this test should be re-enabled +@pytest.mark.skip("Partial parameter partitioning for DeepSpeed is currently broken.") @RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_multigpu_partial_partition_parameters(tmpdir): """Test to ensure that a module that defines a layer inside the ``__init__`` and ``configure_sharded_model`` @@ -824,7 +806,7 @@ def on_train_epoch_start(self) -> None: model = TestModel() trainer = Trainer( default_root_dir=tmpdir, - strategy=DeepSpeedPlugin(stage=3, partition_module=False), + strategy=DeepSpeedPlugin(stage=3), gpus=1, fast_dev_run=True, precision=16, @@ -941,22 +923,14 @@ def test_dataloader(self): @mock.patch("torch.optim.lr_scheduler.StepLR.step", autospec=True) +@pytest.mark.parametrize("interval", ["step", "epoch"]) +@pytest.mark.parametrize("max_epoch", [2]) +@pytest.mark.parametrize("limit_train_batches", [2]) @RunIf(min_gpus=1, deepspeed=True, standalone=True) -def test_deepspeed_scheduler_step_count(mock_step): +def 
test_scheduler_step_count(mock_step, max_epoch, limit_train_batches, interval): """Test to ensure that the scheduler is called the correct amount of times during training when scheduler is - set to step.""" - _run_scheduler_test(mock_step, max_epoch=2, limit_train_batches=2, interval="step") - - -@mock.patch("torch.optim.lr_scheduler.StepLR.step", autospec=True) -@RunIf(min_gpus=1, deepspeed=True, standalone=True) -def test_deepspeed_scheduler_step_count_epoch(mock_step): - """Test to ensure that the scheduler is called the correct amount of times during training when scheduler is - set to epoch.""" - _run_scheduler_test(mock_step, max_epoch=2, limit_train_batches=2, interval="epoch") - + set to step or epoch.""" -def _run_scheduler_test(mock_step, max_epoch, limit_train_batches, interval): class TestModel(BoringModel): def configure_optimizers(self): optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) From 2264082e97a45b4d20ecfb0cfa1150e261879e8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 21 Dec 2021 13:10:11 +0100 Subject: [PATCH 098/123] Add typing for `trainer.logger` (#11114) --- CHANGELOG.md | 1 + pytorch_lightning/callbacks/device_stats_monitor.py | 2 ++ pytorch_lightning/callbacks/gpu_stats_monitor.py | 2 ++ pytorch_lightning/plugins/precision/precision_plugin.py | 3 ++- pytorch_lightning/trainer/trainer.py | 1 + 5 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8bf2ee6664270..71f9cb424884e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Avoid the deprecated `onnx.export(example_outputs=...)` in torch 1.10 ([#11116](https://github.com/PyTorchLightning/pytorch-lightning/pull/11116)) - Fixed an issue when torch-scripting a `LightningModule` after training with `Trainer(sync_batchnorm=True)` ([#11078](https://github.com/PyTorchLightning/pytorch-lightning/pull/11078)) - Fixed an `AttributeError` occuring when using a `CombinedLoader` (multiple dataloaders) for prediction ([#11111](https://github.com/PyTorchLightning/pytorch-lightning/pull/11111)) +- Fixed bug where `Trainer(track_grad_norm=..., logger=False)' would fail ([#11114](https://github.com/PyTorchLightning/pytorch-lightning/pull/11114)) ### Changed diff --git a/pytorch_lightning/callbacks/device_stats_monitor.py b/pytorch_lightning/callbacks/device_stats_monitor.py index b743ed3e1bbeb..016d2015a81e1 100644 --- a/pytorch_lightning/callbacks/device_stats_monitor.py +++ b/pytorch_lightning/callbacks/device_stats_monitor.py @@ -59,6 +59,7 @@ def on_train_batch_start( device_stats = trainer.accelerator.get_device_stats(pl_module.device) prefixed_device_stats = prefix_metrics_keys(device_stats, "on_train_batch_start") + assert trainer.logger is not None trainer.logger.log_metrics(prefixed_device_stats, step=trainer.global_step) def on_train_batch_end( @@ -75,6 +76,7 @@ def on_train_batch_end( device_stats = trainer.accelerator.get_device_stats(pl_module.device) prefixed_device_stats = prefix_metrics_keys(device_stats, "on_train_batch_end") + assert trainer.logger is not None trainer.logger.log_metrics(prefixed_device_stats, step=trainer.global_step) diff --git a/pytorch_lightning/callbacks/gpu_stats_monitor.py b/pytorch_lightning/callbacks/gpu_stats_monitor.py index 7ee6771056666..7bb0289050a1e 100644 --- a/pytorch_lightning/callbacks/gpu_stats_monitor.py +++ b/pytorch_lightning/callbacks/gpu_stats_monitor.py @@ -161,6 +161,7 @@ def on_train_batch_start( # First log at beginning of second step logs["batch_time/inter_step (ms)"] = (time.time() - 
self._snap_inter_step_time) * 1000 + assert trainer.logger is not None trainer.logger.log_metrics(logs, step=trainer.global_step) @rank_zero_only @@ -185,6 +186,7 @@ def on_train_batch_end( if self._log_stats.intra_step_time and self._snap_intra_step_time: logs["batch_time/intra_step (ms)"] = (time.time() - self._snap_intra_step_time) * 1000 + assert trainer.logger is not None trainer.logger.log_metrics(logs, step=trainer.global_step) @staticmethod diff --git a/pytorch_lightning/plugins/precision/precision_plugin.py b/pytorch_lightning/plugins/precision/precision_plugin.py index f1ebbf58d8326..012922ea2b60a 100644 --- a/pytorch_lightning/plugins/precision/precision_plugin.py +++ b/pytorch_lightning/plugins/precision/precision_plugin.py @@ -165,7 +165,8 @@ def optimizer_step( def _track_grad_norm(self, trainer: "pl.Trainer") -> None: if trainer.track_grad_norm == -1: return - grad_norm_dict = grad_norm(trainer.lightning_module, trainer.track_grad_norm, trainer.logger.group_separator) + kwargs = {"group_separator": trainer.logger.group_separator} if trainer.logger is not None else {} + grad_norm_dict = grad_norm(trainer.lightning_module, trainer.track_grad_norm, **kwargs) if grad_norm_dict: prev_fx = trainer.lightning_module._current_fx_name trainer.lightning_module._current_fx_name = "on_before_optimizer_step" diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 18be005715aca..a21763259ece2 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -569,6 +569,7 @@ def __init__( self.__init_profiler(profiler) # init logger flags + self.logger: Optional[LightningLoggerBase] self.logger_connector.on_trainer_init(logger, flush_logs_every_n_steps, log_every_n_steps, move_metrics_to_cpu) # init debugging flags From 502aa74b917e73071b264312ac33dd4f10fbee78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 21 Dec 2021 13:19:13 +0100 Subject: [PATCH 099/123] Refactor 
plugin tests whose assertions don't need to run in `on_fit_start` hook (#11149) --- .../test_accelerator_connector.py | 381 ++++++------------ tests/accelerators/test_gpu.py | 34 ++ 2 files changed, 151 insertions(+), 264 deletions(-) diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index aa0c184a72980..3cb9ed382f5be 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -24,7 +24,6 @@ from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.accelerators.gpu import GPUAccelerator -from pytorch_lightning.callbacks import Callback from pytorch_lightning.plugins import ( DataParallelPlugin, DDP2Plugin, @@ -45,7 +44,6 @@ ) from pytorch_lightning.utilities import DeviceType, DistributedType from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.helpers.boring_model import BoringModel from tests.helpers.runif import RunIf @@ -99,25 +97,16 @@ def test_accelerator_choice_ddp_spawn(cuda_available_mock, device_count_mock): }, ) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_accelerator_choice_ddp_slurm(setup_distributed_mock): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert trainer._accelerator_connector._is_slurm_managing_tasks - assert isinstance(trainer.accelerator, GPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDPPlugin) - assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) - assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 - assert trainer.training_type_plugin.local_rank == 1 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, accelerator="ddp", gpus=2, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) +def 
test_accelerator_choice_ddp_slurm(*_): + trainer = Trainer(fast_dev_run=True, accelerator="ddp", gpus=2) + assert trainer._accelerator_connector._is_slurm_managing_tasks + assert isinstance(trainer.accelerator, GPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) + assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 + assert trainer.training_type_plugin.local_rank == 1 -@RunIf(min_gpus=2) @mock.patch.dict( os.environ, { @@ -131,25 +120,16 @@ def on_fit_start(self, trainer, pl_module): ) @mock.patch("torch.cuda.device_count", return_value=2) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_accelerator_choice_ddp2_slurm(device_count_mock, setup_distributed_mock): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert trainer._accelerator_connector._is_slurm_managing_tasks - assert isinstance(trainer.accelerator, GPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDP2Plugin) - assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) - assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 - assert trainer.training_type_plugin.local_rank == 1 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, accelerator="ddp2", gpus=2, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) +def test_accelerator_choice_ddp2_slurm(*_): + trainer = Trainer(fast_dev_run=True, accelerator="ddp2", gpus=2) + assert trainer._accelerator_connector._is_slurm_managing_tasks + assert isinstance(trainer.accelerator, GPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDP2Plugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) + assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 + assert 
trainer.training_type_plugin.local_rank == 1 -@RunIf(min_gpus=1) @mock.patch.dict( os.environ, { @@ -163,24 +143,15 @@ def on_fit_start(self, trainer, pl_module): ) @mock.patch("torch.cuda.device_count", return_value=2) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_accelerator_choice_ddp_te(device_count_mock, setup_distributed_mock): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator, GPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDPPlugin) - assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) - assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 - assert trainer.training_type_plugin.local_rank == 1 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, accelerator="ddp", gpus=2, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) +def test_accelerator_choice_ddp_te(*_): + trainer = Trainer(fast_dev_run=True, accelerator="ddp", gpus=2) + assert isinstance(trainer.accelerator, GPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) + assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 + assert trainer.training_type_plugin.local_rank == 1 -@RunIf(min_gpus=1) @mock.patch.dict( os.environ, { @@ -194,21 +165,13 @@ def on_fit_start(self, trainer, pl_module): ) @mock.patch("torch.cuda.device_count", return_value=2) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_accelerator_choice_ddp2_te(device_count_mock, setup_distributed_mock): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator, GPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDP2Plugin) - assert 
isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) - assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 - assert trainer.training_type_plugin.local_rank == 1 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, accelerator="ddp2", gpus=2, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) +def test_accelerator_choice_ddp2_te(*_): + trainer = Trainer(fast_dev_run=True, accelerator="ddp2", gpus=2) + assert isinstance(trainer.accelerator, GPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDP2Plugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) + assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 + assert trainer.training_type_plugin.local_rank == 1 @mock.patch.dict( @@ -216,24 +179,15 @@ def on_fit_start(self, trainer, pl_module): ) @mock.patch("torch.cuda.device_count", return_value=0) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_accelerator_choice_ddp_cpu_te(device_count_mock, setup_distributed_mock): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator, CPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDPPlugin) - assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) - assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 - assert trainer.training_type_plugin.local_rank == 1 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, accelerator="ddp_cpu", num_processes=2, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) +def test_accelerator_choice_ddp_cpu_te(*_): + trainer = Trainer(fast_dev_run=True, accelerator="ddp_cpu", num_processes=2) + assert isinstance(trainer.accelerator, CPUAccelerator) + assert 
isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) + assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 + assert trainer.training_type_plugin.local_rank == 1 -@RunIf(min_gpus=1) @mock.patch.dict( os.environ, { @@ -247,21 +201,13 @@ def on_fit_start(self, trainer, pl_module): ) @mock.patch("torch.cuda.device_count", return_value=1) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_accelerator_choice_ddp_kubeflow(device_count_mock, setup_distributed_mock): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator, GPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDPPlugin) - assert isinstance(trainer.training_type_plugin.cluster_environment, KubeflowEnvironment) - assert trainer.training_type_plugin.cluster_environment.local_rank() == 0 - assert trainer.training_type_plugin.local_rank == 0 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, accelerator="ddp", gpus=1, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) +def test_accelerator_choice_ddp_kubeflow(*_): + trainer = Trainer(fast_dev_run=True, accelerator="ddp", gpus=1) + assert isinstance(trainer.accelerator, GPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, KubeflowEnvironment) + assert trainer.training_type_plugin.cluster_environment.local_rank() == 0 + assert trainer.training_type_plugin.local_rank == 0 @mock.patch.dict( @@ -276,21 +222,13 @@ def on_fit_start(self, trainer, pl_module): ) @mock.patch("torch.cuda.device_count", return_value=0) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_accelerator_choice_ddp_cpu_kubeflow(device_count_mock, setup_distributed_mock): - class 
CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator, CPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDPPlugin) - assert isinstance(trainer.training_type_plugin.cluster_environment, KubeflowEnvironment) - assert trainer.training_type_plugin.cluster_environment.local_rank() == 0 - assert trainer.training_type_plugin.local_rank == 0 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, accelerator="ddp_cpu", num_processes=1, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) +def test_accelerator_choice_ddp_cpu_kubeflow(*_): + trainer = Trainer(fast_dev_run=True, accelerator="ddp_cpu", num_processes=1) + assert isinstance(trainer.accelerator, CPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, KubeflowEnvironment) + assert trainer.training_type_plugin.cluster_environment.local_rank() == 0 + assert trainer.training_type_plugin.local_rank == 0 @mock.patch.dict( @@ -306,21 +244,13 @@ def on_fit_start(self, trainer, pl_module): ) @mock.patch("torch.cuda.device_count", return_value=0) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_accelerator_choice_ddp_cpu_slurm(device_count_mock, setup_distributed_mock): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert trainer._accelerator_connector._is_slurm_managing_tasks - assert isinstance(trainer.accelerator, CPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDPPlugin) - assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) - assert trainer.training_type_plugin.local_rank == 0 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, accelerator="ddp_cpu", num_processes=2, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) +def 
test_accelerator_choice_ddp_cpu_slurm(*_): + trainer = Trainer(fast_dev_run=True, accelerator="ddp_cpu", num_processes=2) + assert trainer._accelerator_connector._is_slurm_managing_tasks + assert isinstance(trainer.accelerator, CPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) + assert trainer.training_type_plugin.local_rank == 0 @RunIf(skip_windows=True, standalone=True) @@ -429,19 +359,11 @@ class DistributedPlugin(DDPPlugin): ) @mock.patch("torch.cuda.device_count", return_value=0) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_dist_backend_accelerator_mapping(device_count_mock, setup_distributed_mock): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator, CPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDPPlugin) - assert trainer.training_type_plugin.local_rank == 0 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", num_processes=2, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) +def test_dist_backend_accelerator_mapping(*_): + trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", num_processes=2) + assert isinstance(trainer.accelerator, CPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert trainer.training_type_plugin.local_rank == 0 @mock.patch("pytorch_lightning.utilities._IS_INTERACTIVE", return_value=True) @@ -772,24 +694,15 @@ def test_strategy_choice_ddp_spawn(cuda_available_mock, device_count_mock): @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) @pytest.mark.parametrize("strategy", ["ddp", DDPPlugin()]) def test_strategy_choice_ddp_slurm(setup_distributed_mock, strategy): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert 
trainer._accelerator_connector._is_slurm_managing_tasks - assert isinstance(trainer.accelerator, GPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDPPlugin) - assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) - assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 - assert trainer.training_type_plugin.local_rank == 1 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, strategy=strategy, gpus=2, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) + trainer = Trainer(fast_dev_run=True, strategy=strategy, gpus=2) + assert trainer._accelerator_connector._is_slurm_managing_tasks + assert isinstance(trainer.accelerator, GPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) + assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 + assert trainer.training_type_plugin.local_rank == 1 -@RunIf(min_gpus=2) @mock.patch.dict( os.environ, { @@ -805,24 +718,15 @@ def on_fit_start(self, trainer, pl_module): @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) @pytest.mark.parametrize("strategy", ["ddp2", DDP2Plugin()]) def test_strategy_choice_ddp2_slurm(device_count_mock, setup_distributed_mock, strategy): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert trainer._accelerator_connector._is_slurm_managing_tasks - assert isinstance(trainer.accelerator, GPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDP2Plugin) - assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) - assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 - assert trainer.training_type_plugin.local_rank == 1 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, strategy=strategy, gpus=2, 
callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) + trainer = Trainer(fast_dev_run=True, strategy=strategy, gpus=2) + assert trainer._accelerator_connector._is_slurm_managing_tasks + assert isinstance(trainer.accelerator, GPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDP2Plugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) + assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 + assert trainer.training_type_plugin.local_rank == 1 -@RunIf(min_gpus=1) @mock.patch.dict( os.environ, { @@ -836,24 +740,15 @@ def on_fit_start(self, trainer, pl_module): ) @mock.patch("torch.cuda.device_count", return_value=2) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_strategy_choice_ddp_te(device_count_mock, setup_distributed_mock): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator, GPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDPPlugin) - assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) - assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 - assert trainer.training_type_plugin.local_rank == 1 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, strategy="ddp", gpus=2, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) +def test_strategy_choice_ddp_te(*_): + trainer = Trainer(fast_dev_run=True, strategy="ddp", gpus=2) + assert isinstance(trainer.accelerator, GPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) + assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 + assert trainer.training_type_plugin.local_rank == 1 -@RunIf(min_gpus=1) @mock.patch.dict( os.environ, { @@ -867,21 +762,13 @@ def 
on_fit_start(self, trainer, pl_module): ) @mock.patch("torch.cuda.device_count", return_value=2) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_strategy_choice_ddp2_te(device_count_mock, setup_distributed_mock): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator, GPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDP2Plugin) - assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) - assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 - assert trainer.training_type_plugin.local_rank == 1 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, strategy="ddp2", gpus=2, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) +def test_strategy_choice_ddp2_te(*_): + trainer = Trainer(fast_dev_run=True, strategy="ddp2", gpus=2) + assert isinstance(trainer.accelerator, GPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDP2Plugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) + assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 + assert trainer.training_type_plugin.local_rank == 1 @mock.patch.dict( @@ -889,24 +776,15 @@ def on_fit_start(self, trainer, pl_module): ) @mock.patch("torch.cuda.device_count", return_value=0) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_strategy_choice_ddp_cpu_te(device_count_mock, setup_distributed_mock): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator, CPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDPPlugin) - assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) - assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 - assert 
trainer.training_type_plugin.local_rank == 1 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", num_processes=2, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) +def test_strategy_choice_ddp_cpu_te(*_): + trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", num_processes=2) + assert isinstance(trainer.accelerator, CPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) + assert trainer.training_type_plugin.cluster_environment.local_rank() == 1 + assert trainer.training_type_plugin.local_rank == 1 -@RunIf(min_gpus=1) @mock.patch.dict( os.environ, { @@ -920,21 +798,13 @@ def on_fit_start(self, trainer, pl_module): ) @mock.patch("torch.cuda.device_count", return_value=1) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_strategy_choice_ddp_kubeflow(device_count_mock, setup_distributed_mock): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator, GPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDPPlugin) - assert isinstance(trainer.training_type_plugin.cluster_environment, KubeflowEnvironment) - assert trainer.training_type_plugin.cluster_environment.local_rank() == 0 - assert trainer.training_type_plugin.local_rank == 0 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, strategy="ddp", gpus=1, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) +def test_strategy_choice_ddp_kubeflow(*_): + trainer = Trainer(fast_dev_run=True, strategy="ddp", gpus=1) + assert isinstance(trainer.accelerator, GPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, KubeflowEnvironment) + assert 
trainer.training_type_plugin.cluster_environment.local_rank() == 0 + assert trainer.training_type_plugin.local_rank == 0 @mock.patch.dict( @@ -949,21 +819,13 @@ def on_fit_start(self, trainer, pl_module): ) @mock.patch("torch.cuda.device_count", return_value=0) @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) -def test_strategy_choice_ddp_cpu_kubeflow(device_count_mock, setup_distributed_mock): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator, CPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDPPlugin) - assert isinstance(trainer.training_type_plugin.cluster_environment, KubeflowEnvironment) - assert trainer.training_type_plugin.cluster_environment.local_rank() == 0 - assert trainer.training_type_plugin.local_rank == 0 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", num_processes=2, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) +def test_strategy_choice_ddp_cpu_kubeflow(*_): + trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", num_processes=2) + assert isinstance(trainer.accelerator, CPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, KubeflowEnvironment) + assert trainer.training_type_plugin.cluster_environment.local_rank() == 0 + assert trainer.training_type_plugin.local_rank == 0 @mock.patch.dict( @@ -981,20 +843,11 @@ def on_fit_start(self, trainer, pl_module): @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) @pytest.mark.parametrize("strategy", ["ddp", DDPPlugin()]) def test_strategy_choice_ddp_cpu_slurm(device_count_mock, setup_distributed_mock, strategy): - class CB(Callback): - def on_fit_start(self, trainer, pl_module): - assert trainer._accelerator_connector._is_slurm_managing_tasks - assert isinstance(trainer.accelerator, 
CPUAccelerator) - assert isinstance(trainer.training_type_plugin, DDPPlugin) - assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) - assert trainer.training_type_plugin.local_rank == 0 - raise SystemExit() - - model = BoringModel() - trainer = Trainer(fast_dev_run=True, strategy=strategy, num_processes=2, callbacks=[CB()]) - - with pytest.raises(SystemExit): - trainer.fit(model) + trainer = Trainer(fast_dev_run=True, strategy=strategy, num_processes=2) + assert isinstance(trainer.accelerator, CPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) + assert trainer.training_type_plugin.local_rank == 0 def test_unsupported_tpu_choice(monkeypatch): diff --git a/tests/accelerators/test_gpu.py b/tests/accelerators/test_gpu.py index 85ce0cd9f0f18..764630f30b0b1 100644 --- a/tests/accelerators/test_gpu.py +++ b/tests/accelerators/test_gpu.py @@ -1,8 +1,25 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from unittest import mock + import torch +from pytorch_lightning import Trainer from pytorch_lightning.accelerators import GPUAccelerator from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin from pytorch_lightning.plugins.training_type.dp import DataParallelPlugin +from tests.helpers import BoringModel from tests.helpers.runif import RunIf @@ -34,3 +51,20 @@ def test_get_nvidia_gpu_stats(tmpdir): for f in fields: assert any(f in h for h in gpu_stats.keys()) + + +@RunIf(min_gpus=1) +@mock.patch("torch.cuda.set_device") +def test_set_cuda_device(set_device_mock, tmpdir): + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + fast_dev_run=True, + accelerator="gpu", + devices=1, + enable_checkpointing=False, + enable_model_summary=False, + enable_progress_bar=False, + ) + trainer.fit(model) + set_device_mock.assert_called_once() From 0a635d9a5c5b5fe09bc910f72b2d32367f67edb6 Mon Sep 17 00:00:00 2001 From: guyang3532 <62738430+guyang3532@users.noreply.github.com> Date: Mon, 20 Dec 2021 19:51:46 +0800 Subject: [PATCH 100/123] Safely disable profiler (#11167) --- pytorch_lightning/profiler/pytorch.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pytorch_lightning/profiler/pytorch.py b/pytorch_lightning/profiler/pytorch.py index 92bb9965dac4a..f5c5968c0f1d2 100644 --- a/pytorch_lightning/profiler/pytorch.py +++ b/pytorch_lightning/profiler/pytorch.py @@ -377,10 +377,8 @@ def start(self, action_name: str) -> None: # close profiler if it is already opened. 
might happen if 2 profilers # are created and the first one did not call `describe` - try: + if torch.autograd._profiler_enabled(): torch.autograd._disable_profiler() - except (AttributeError, RuntimeError): - pass if self._schedule is not None: self._schedule.setup(action_name) From ff9c3f99c2c70ef8c3b8b4c51b623ad8d817baec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 21 Dec 2021 13:26:22 +0100 Subject: [PATCH 101/123] Add required states for resumed ModelCheckpoint GC (#10995) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Carlos Mocholí Co-authored-by: Adrian Wälchli --- CHANGELOG.md | 2 ++ .../callbacks/model_checkpoint.py | 8 +++++ tests/checkpointing/test_model_checkpoint.py | 34 +++++++++++++++++++ tests/models/test_restore.py | 11 ++++-- 4 files changed, 53 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 71f9cb424884e..8266c7861f054 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Changed - DeepSpeed does not require lightning module zero 3 partitioning ([#10655](https://github.com/PyTorchLightning/pytorch-lightning/pull/10655)) +- The `ModelCheckpoint` callback now saves and restores attributes `best_k_models`, `kth_best_model_path`, `kth_value`, and `last_model_path` ([#10995](https://github.com/PyTorchLightning/pytorch-lightning/pull/10995)) + ## [1.5.6] - 2021-12-15 diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index f9fbf1c93d380..c4efffdc6d643 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -357,6 +357,10 @@ def on_save_checkpoint( "best_model_path": self.best_model_path, "current_score": self.current_score, "dirpath": self.dirpath, + "best_k_models": self.best_k_models, + "kth_best_model_path": self.kth_best_model_path, + "kth_value": self.kth_value, + "last_model_path": self.last_model_path, } def on_load_checkpoint( @@ -364,6 +368,10 @@ def on_load_checkpoint( ) -> None: self.best_model_score = callback_state["best_model_score"] self.best_model_path = callback_state["best_model_path"] + self.best_k_models = callback_state.get("best_k_models", self.best_k_models) + self.kth_best_model_path = callback_state.get("kth_best_model_path", self.kth_best_model_path) + self.kth_value = callback_state.get("kth_value", self.kth_value) + self.last_model_path = callback_state.get("last_model_path", self.last_model_path) def save_checkpoint(self, trainer: "pl.Trainer") -> None: """Performs the main logic around saving a checkpoint. 
diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index 04255d51ad069..733ea9348b2bd 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -1206,3 +1206,37 @@ def test_check_val_every_n_epochs_top_k_integration(tmpdir): ) trainer.fit(model) assert set(os.listdir(tmpdir)) == {"epoch=1.ckpt", "epoch=3.ckpt"} + + +def test_model_checkpoint_saveload_ckpt(tmpdir): + ckpt = { + "monitor": "random_value", + "best_model_path": "epoch=10-step=1436.ckpt", + "best_model_score": torch.tensor(2.246), + "current_score": torch.tensor(1.5), + "dirpath": tmpdir, + "best_k_models": {"epoch=10-step=1436.ckpt": torch.tensor(2.246)}, + "kth_best_model_path": "epoch=10-step=1436.ckpt", + "kth_value": torch.tensor(2.246), + "last_model_path": "last2245.ckpt", + } + + # test on_save_checkpoint + cb_write = ModelCheckpoint(dirpath=tmpdir, monitor="random_value", save_top_k=-1, save_last=True) + for key, val in ckpt.items(): + setattr(cb_write, key, val) + written_ckpt = cb_write.on_save_checkpoint("", "", "") + for state in ckpt: + assert ckpt[state] == written_ckpt[state] + + # test on_load_checkpoint + # Note: "current_score", "dirpath" and "monitor" are currently not restored by on_load_checkpoint. + # We therefore set "dirpath" and "monitor" to something different than for ckpt/cb_write so we can assert them. + # "current_score" is left as initialized, i.e. 
None, and can therefore also be asserted + cb_restore = ModelCheckpoint(dirpath=tmpdir + "restore", monitor=None, save_top_k=-1, save_last=True) + cb_restore.on_load_checkpoint("", "", written_ckpt) + for key, val in written_ckpt.items(): + if key not in ("current_score", "dirpath", "monitor"): + assert getattr(cb_restore, key) == val + else: + assert getattr(cb_restore, key) != val diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py index 6d241222526ab..adbfa769e1eac 100644 --- a/tests/models/test_restore.py +++ b/tests/models/test_restore.py @@ -266,8 +266,15 @@ def get_trainer_args(): for before, after in zip(callbacks_before_resume, callback_capture.callbacks): if isinstance(before, ModelCheckpoint): - assert before.best_model_path == after.best_model_path - assert before.best_model_score == after.best_model_score + for attribute in ( + "best_model_path", + "best_model_score", + "best_k_models", + "kth_best_model_path", + "kth_value", + "last_model_path", + ): + assert getattr(before, attribute) == getattr(after, attribute) def test_callbacks_references_fit_ckpt_path(tmpdir): From 1f3292384ffe3e573531efb6d3c89e1d5ed1ad37 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Tue, 21 Dec 2021 18:24:26 +0530 Subject: [PATCH 102/123] Avoid torch amp cuda warning with bf16 on cpu (#11161) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> Co-authored-by: Adrian Wälchli --- CHANGELOG.md | 3 ++- pytorch_lightning/utilities/model_summary.py | 17 ++++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8266c7861f054..249c7c1a1ad5b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Avoid the deprecated `onnx.export(example_outputs=...)` in torch 1.10 ([#11116](https://github.com/PyTorchLightning/pytorch-lightning/pull/11116)) - Fixed an issue when torch-scripting a `LightningModule` after training with `Trainer(sync_batchnorm=True)` ([#11078](https://github.com/PyTorchLightning/pytorch-lightning/pull/11078)) - Fixed an `AttributeError` occuring when using a `CombinedLoader` (multiple dataloaders) for prediction ([#11111](https://github.com/PyTorchLightning/pytorch-lightning/pull/11111)) -- Fixed bug where `Trainer(track_grad_norm=..., logger=False)' would fail ([#11114](https://github.com/PyTorchLightning/pytorch-lightning/pull/11114)) +- Fixed bug where `Trainer(track_grad_norm=..., logger=False)` would fail ([#11114](https://github.com/PyTorchLightning/pytorch-lightning/pull/11114)) +- Fixed an incorrect warning being produced by the model summary when using `bf16` precision on CPU ([#11161](https://github.com/PyTorchLightning/pytorch-lightning/pull/11161)) ### Changed diff --git a/pytorch_lightning/utilities/model_summary.py b/pytorch_lightning/utilities/model_summary.py index 9c2690202df90..83f9861b0f550 100644 --- a/pytorch_lightning/utilities/model_summary.py +++ b/pytorch_lightning/utilities/model_summary.py @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import contextlib import logging +import sys from collections import OrderedDict from typing import Any, Dict, List, Optional, Tuple, Union @@ -23,7 +25,7 @@ from torch.utils.hooks import RemovableHandle import pytorch_lightning as pl -from pytorch_lightning.utilities import AMPType, DeviceType, ModelSummaryMode, rank_zero_deprecation +from pytorch_lightning.utilities import ModelSummaryMode, rank_zero_deprecation from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8 from pytorch_lightning.utilities.warnings import WarningCache @@ -282,12 +284,17 @@ def _forward_example_input(self) -> None: input_ = model.example_input_array input_ = model._apply_batch_transfer_handler(input_) - if trainer is not None and trainer.amp_backend == AMPType.NATIVE and trainer._device_type != DeviceType.TPU: - model.forward = torch.cuda.amp.autocast()(model.forward) - mode = model.training model.eval() - with torch.no_grad(): + + if trainer is not None: + forward_context = trainer.precision_plugin.forward_context() + elif sys.version_info >= (3, 7): + forward_context = contextlib.nullcontext() + else: + forward_context = contextlib.suppress() + + with torch.no_grad(), forward_context: # let the model hooks collect the input- and output shapes if isinstance(input_, (list, tuple)): model(*input_) From 98fcf1eed46723a801fc09838b5dd40ce19fcbd0 Mon Sep 17 00:00:00 2001 From: Oliver Borchert Date: Tue, 21 Dec 2021 15:40:41 +0100 Subject: [PATCH 103/123] Suppress Warning in `PredictionEpochLoop` (#11189) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Rohit Gupta Co-authored-by: Carlos Mocholí --- pytorch_lightning/loops/epoch/prediction_epoch_loop.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/loops/epoch/prediction_epoch_loop.py b/pytorch_lightning/loops/epoch/prediction_epoch_loop.py index 
6e3b326da534d..7b31432cab8f6 100644 --- a/pytorch_lightning/loops/epoch/prediction_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/prediction_epoch_loop.py @@ -68,8 +68,9 @@ def on_run_start( void(dataloader_iter, dataloader_idx) self._dl_max_batches = dl_max_batches self._num_dataloaders = num_dataloaders - self._seen_batch_indices = self._get_batch_indices(dataloader_idx) self.return_predictions = return_predictions + # this call requires that `self.return_predictions` is set + self._seen_batch_indices = self._get_batch_indices(dataloader_idx) def advance( self, From a2e9720bd1d12be0df859bd12392227690e07bb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Jan 2022 19:02:14 +0100 Subject: [PATCH 104/123] update version for 1.5.8 release --- CHANGELOG.md | 8 ++++++++ pytorch_lightning/__about__.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 249c7c1a1ad5b..6a428c7147879 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,14 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). + +## [1.5.8] - 2022-01-05 + +### Fixed + +- + + ## [1.5.7] - 2021-12-21 ### Fixed diff --git a/pytorch_lightning/__about__.py b/pytorch_lightning/__about__.py index 327dd9e6b61fb..00574290706f4 100644 --- a/pytorch_lightning/__about__.py +++ b/pytorch_lightning/__about__.py @@ -1,7 +1,7 @@ import time _this_year = time.strftime("%Y") -__version__ = "1.5.7" +__version__ = "1.5.8" __author__ = "William Falcon et al." 
__author_email__ = "waf2107@columbia.edu" __license__ = "Apache-2.0" From d6cf2bbcedf43c5c562775296c7ed50072ed267a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 23 Dec 2021 17:45:06 +0100 Subject: [PATCH 105/123] Fix CLI race condition saving the config (#11199) --- CHANGELOG.md | 1 + pytorch_lightning/utilities/cli.py | 27 ++++++++++++++++++--------- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6a428c7147879..8d96f6793befe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - +- Fixed `LightningCLI` race condition while saving the config ([#11199](https://github.com/PyTorchLightning/pytorch-lightning/pull/11199)) ## [1.5.7] - 2021-12-21 diff --git a/pytorch_lightning/utilities/cli.py b/pytorch_lightning/utilities/cli.py index 7a2d4ba994d7f..f636d4a2abda3 100644 --- a/pytorch_lightning/utilities/cli.py +++ b/pytorch_lightning/utilities/cli.py @@ -395,21 +395,30 @@ def __init__( def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[str] = None) -> None: # save the config in `setup` because (1) we want it to save regardless of the trainer function run # and we want to save before processes are spawned - log_dir = trainer.log_dir + log_dir = trainer.log_dir # this broadcasts the directory assert log_dir is not None config_path = os.path.join(log_dir, self.config_filename) - if not self.overwrite and os.path.isfile(config_path): - raise RuntimeError( - f"{self.__class__.__name__} expected {config_path} to NOT exist. Aborting to avoid overwriting" - " results of a previous run. You can delete the previous config file," - " set `LightningCLI(save_config_callback=None)` to disable config saving," - " or set `LightningCLI(save_config_overwrite=True)` to overwrite the config file." 
- ) + fs = get_filesystem(log_dir) + + if not self.overwrite: + # check if the file exists on rank 0 + file_exists = fs.isfile(config_path) if trainer.is_global_zero else False + # broadcast whether to fail to all ranks + file_exists = trainer.strategy.broadcast(file_exists) + if file_exists: + raise RuntimeError( + f"{self.__class__.__name__} expected {config_path} to NOT exist. Aborting to avoid overwriting" + " results of a previous run. You can delete the previous config file," + " set `LightningCLI(save_config_callback=None)` to disable config saving," + " or set `LightningCLI(save_config_overwrite=True)` to overwrite the config file." + ) + + # save the file on rank 0 if trainer.is_global_zero: # save only on rank zero to avoid race conditions on DDP. # the `log_dir` needs to be created as we rely on the logger to do it usually # but it hasn't logged anything at this point - get_filesystem(log_dir).makedirs(log_dir, exist_ok=True) + fs.makedirs(log_dir, exist_ok=True) self.parser.save( self.config, config_path, skip_none=False, overwrite=self.overwrite, multifile=self.multifile ) From f562427de70f3e446d1e127b65817340a0740a97 Mon Sep 17 00:00:00 2001 From: Aki Nitta Date: Mon, 3 Jan 2022 20:45:41 +0900 Subject: [PATCH 106/123] Revert "[CI] Comment flaky tests (#10084)" (#10580) * Revert "[CI] Comment flaky tests (#10084)" This reverts commit ed9802643c5485a0f07d8376009410ae76076cc4. 
--- tests/profiler/test_profiler.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/profiler/test_profiler.py b/tests/profiler/test_profiler.py index 15cf40e924c68..a8cff9368546e 100644 --- a/tests/profiler/test_profiler.py +++ b/tests/profiler/test_profiler.py @@ -331,7 +331,6 @@ def test_pytorch_profiler_trainer_ddp(tmpdir, pytorch_profiler): assert any(f"{local_rank}-validation_step" in f for f in files) -@RunIf(standalone=True) @pytest.mark.parametrize("fast_dev_run", [1, 2, 3, 4, 5]) @pytest.mark.parametrize("boring_model_cls", [ManualOptimBoringModel, BoringModel]) def test_pytorch_profiler_trainer_fit(fast_dev_run, boring_model_cls, tmpdir): From 6a000662aa8f955bea20aa3906a608a55c5cf618 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 3 Jan 2022 13:49:17 +0100 Subject: [PATCH 107/123] Fix data fetcher selection (#11294) --- CHANGELOG.md | 1 + pytorch_lightning/trainer/connectors/data_connector.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8d96f6793befe..cbc92d1d88b0d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed `LightningCLI` race condition while saving the config ([#11199](https://github.com/PyTorchLightning/pytorch-lightning/pull/11199)) +- Fixed data fetcher selection ([#11294](https://github.com/PyTorchLightning/pytorch-lightning/pull/11294)) ## [1.5.7] - 2021-12-21 ### Fixed diff --git a/pytorch_lightning/trainer/connectors/data_connector.py b/pytorch_lightning/trainer/connectors/data_connector.py index 8f286964940d2..73d19a2e28a1e 100644 --- a/pytorch_lightning/trainer/connectors/data_connector.py +++ b/pytorch_lightning/trainer/connectors/data_connector.py @@ -120,7 +120,7 @@ def _select_data_fetcher(self) -> AbstractDataFetcher: def get_profiled_dataloader(self, dataloader: Iterable, dataloader_idx: int = 0) -> Iterable: stage: str = self.trainer.state.stage.value - data_fetcher = setattr(self, f"{stage}_data_fetcher", None) or self._select_data_fetcher() + data_fetcher = getattr(self, f"{stage}_data_fetcher", None) or self._select_data_fetcher() data_fetcher.setup( dataloader, stage=stage, From 15f524b2d8c6755cdaf1cb714a117bde755cc7a2 Mon Sep 17 00:00:00 2001 From: Ed Pizzi Date: Mon, 3 Jan 2022 14:17:50 -0800 Subject: [PATCH 108/123] Avoid non-blocking GPU->CPU copies. (#11288) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jirka Borovec Co-authored-by: Rohit Gupta Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> Co-authored-by: Adrian Wälchli --- CHANGELOG.md | 5 ++--- pytorch_lightning/utilities/apply_func.py | 8 +++++++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cbc92d1d88b0d..ca1f850709e3a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,11 +9,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Fixed -- - - Fixed `LightningCLI` race condition while saving the config ([#11199](https://github.com/PyTorchLightning/pytorch-lightning/pull/11199)) - - Fixed data fetcher selection ([#11294](https://github.com/PyTorchLightning/pytorch-lightning/pull/11294)) +- Fixed a race condition that could result in incorrect (zero) values being observed in prediction writer callbacks ([#11288](https://github.com/PyTorchLightning/pytorch-lightning/pull/11288)) + ## [1.5.7] - 2021-12-21 ### Fixed diff --git a/pytorch_lightning/utilities/apply_func.py b/pytorch_lightning/utilities/apply_func.py index 5a76f402bcc02..1ce23050e0113 100644 --- a/pytorch_lightning/utilities/apply_func.py +++ b/pytorch_lightning/utilities/apply_func.py @@ -34,6 +34,9 @@ Batch = type(None) +_CPU_DEVICES = ("cpu", torch.device("cpu")) + + def to_dtype_tensor( value: Union[int, float, List[Union[int, float]]], dtype: torch.dtype, device: Union[str, torch.device] ) -> torch.Tensor: @@ -268,7 +271,10 @@ def batch_to(data: Any) -> Any: setattr(device_data, field, device_field) return device_data - kwargs = dict(non_blocking=True) if isinstance(data, torch.Tensor) else {} + kwargs = {} + # Don't issue non-blocking transfers to CPU + if isinstance(data, torch.Tensor) and device not in _CPU_DEVICES: + kwargs["non_blocking"] = True data_output = data.to(device, **kwargs) if data_output is not None: return data_output From a85033a49d2c076878bf6ffedecdcd4fcd8d14b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Jan 2022 20:11:32 +0100 Subject: [PATCH 109/123] 1.5.8 release commit --- pytorch_lightning/utilities/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/utilities/cli.py b/pytorch_lightning/utilities/cli.py index f636d4a2abda3..ee9cc17bf4b49 100644 --- a/pytorch_lightning/utilities/cli.py +++ b/pytorch_lightning/utilities/cli.py @@ -404,7 +404,7 @@ def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[st # 
check if the file exists on rank 0 file_exists = fs.isfile(config_path) if trainer.is_global_zero else False # broadcast whether to fail to all ranks - file_exists = trainer.strategy.broadcast(file_exists) + file_exists = trainer.accelerator.broadcast(file_exists) if file_exists: raise RuntimeError( f"{self.__class__.__name__} expected {config_path} to NOT exist. Aborting to avoid overwriting" From 4398db270e69455a0c55150cb0b087f4cd0b77eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 5 Jan 2022 09:25:27 +0100 Subject: [PATCH 110/123] Fix `_should_reload_dl_epoch` causing inconsistent validation dataloader reloading (#11036) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adam Viola Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Jirka Borovec Co-authored-by: thomas chaton Co-authored-by: Adrian Wälchli --- CHANGELOG.md | 1 + .../loops/dataloader/evaluation_loop.py | 2 +- pytorch_lightning/loops/fit_loop.py | 2 +- pytorch_lightning/trainer/data_loading.py | 22 +++ pytorch_lightning/trainer/trainer.py | 10 +- tests/deprecated_api/test_remove_1-6.py | 5 +- tests/models/test_hooks.py | 1 - tests/trainer/test_dataloaders.py | 170 ++++++++++++------ 8 files changed, 143 insertions(+), 70 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ca1f850709e3a..99429882c1ada 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed `LightningCLI` race condition while saving the config ([#11199](https://github.com/PyTorchLightning/pytorch-lightning/pull/11199)) - Fixed data fetcher selection ([#11294](https://github.com/PyTorchLightning/pytorch-lightning/pull/11294)) - Fixed a race condition that could result in incorrect (zero) values being observed in prediction writer callbacks ([#11288](https://github.com/PyTorchLightning/pytorch-lightning/pull/11288)) +- Fixed dataloaders not getting reloaded the correct amount of times when setting `reload_dataloaders_every_n_epochs` and `check_val_every_n_epoch` ([#10948](https://github.com/PyTorchLightning/pytorch-lightning/pull/10948)) ## [1.5.7] - 2021-12-21 diff --git a/pytorch_lightning/loops/dataloader/evaluation_loop.py b/pytorch_lightning/loops/dataloader/evaluation_loop.py index b3291fcc21890..f95196d360ca8 100644 --- a/pytorch_lightning/loops/dataloader/evaluation_loop.py +++ b/pytorch_lightning/loops/dataloader/evaluation_loop.py @@ -166,7 +166,7 @@ def _reload_evaluation_dataloaders(self) -> None: """Reloads dataloaders if necessary.""" if self.trainer.testing: self.trainer.reset_test_dataloader() - elif self.trainer.val_dataloaders is None or self.trainer._should_reload_dl_epoch: + elif self.trainer.val_dataloaders is None or self.trainer._should_reload_val_dl: self.trainer.reset_val_dataloader() def _on_evaluation_start(self, *args: Any, **kwargs: Any) -> None: diff --git a/pytorch_lightning/loops/fit_loop.py b/pytorch_lightning/loops/fit_loop.py index df6634c963851..fcd75ef274914 100644 --- a/pytorch_lightning/loops/fit_loop.py +++ b/pytorch_lightning/loops/fit_loop.py @@ -205,7 +205,7 @@ def on_advance_start(self) -> None: model = self.trainer.lightning_module # reset train dataloader - if not self._is_fresh_start_epoch and self.trainer._should_reload_dl_epoch: + if not self._is_fresh_start_epoch and self.trainer._should_reload_train_dl: self.trainer.reset_train_dataloader(model) self._is_fresh_start_epoch = False diff --git 
a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 48c9666c24726..1662c1b3aaa92 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -50,6 +50,7 @@ class TrainerDataLoadingMixin(ABC): # this is just a summary on variables used in this abstract class, # the proper values/initialisation should be done in child class val_check_interval: float + reload_dataloaders_every_n_epochs: int tpu_local_core_rank: int train_dataloader: DataLoader limit_train_batches: Union[int, float] @@ -70,6 +71,21 @@ class TrainerDataLoadingMixin(ABC): accelerator: Accelerator accelerator_connector: AcceleratorConnector call_hook: Callable + current_epoch: int + _last_train_dl_reload_epoch: int + _last_val_dl_reload_epoch: int + + @property + def _should_reload_train_dl(self) -> bool: + """Check if train dataloader should be reloaded.""" + n_epochs = self.reload_dataloaders_every_n_epochs + return n_epochs and (self.current_epoch - self._last_train_dl_reload_epoch >= n_epochs) + + @property + def _should_reload_val_dl(self) -> bool: + """Check if validation dataloader should be reloaded.""" + n_epochs = self.reload_dataloaders_every_n_epochs + return n_epochs and (self.current_epoch - self._last_val_dl_reload_epoch >= n_epochs) def _worker_check(self, dataloader: DataLoader, name: str) -> None: if not isinstance(dataloader, DataLoader): @@ -415,6 +431,9 @@ def reset_train_dataloader(self, model: Optional["pl.LightningModule"] = None) - " you want to see logs for the training epoch." 
) + # store epoch of dataloader reset for reload_dataloaders_every_n_epochs + self._last_train_dl_reload_epoch = self.current_epoch + def _reset_eval_dataloader( self, mode: RunningStage, model: Optional["pl.LightningModule"] = None ) -> Tuple[List[Union[int, float]], List[DataLoader]]: @@ -529,6 +548,9 @@ def reset_val_dataloader(self, model: Optional["pl.LightningModule"] = None) -> RunningStage.VALIDATING, model=pl_module ) + # store epoch of dataloader reset for reload_dataloaders_every_n_epochs + self._last_val_dl_reload_epoch = self.current_epoch + def reset_test_dataloader(self, model: Optional["pl.LightningModule"] = None) -> None: """Resets the test dataloader and determines the number of batches. diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index a21763259ece2..22df6859fb256 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -663,6 +663,8 @@ def _setup_on_init(self, num_sanity_val_steps: int) -> None: self.num_val_batches = [] self.test_dataloaders = None self.val_dataloaders = None + self._last_train_dl_reload_epoch = float("-inf") + self._last_val_dl_reload_epoch = float("-inf") # when true, print evaluation results in .validate() and .test() self.verbose_evaluate = True @@ -752,6 +754,8 @@ def _fit_impl( self.state.fn = TrainerFn.FITTING self.state.status = TrainerStatus.RUNNING self.training = True + self._last_train_dl_reload_epoch = float("-inf") + self._last_val_dl_reload_epoch = float("-inf") # if a datamodule comes in as the second arg, then fix it for the user if isinstance(train_dataloaders, LightningDataModule): @@ -1826,12 +1830,6 @@ def progress_bar_dict(self) -> dict: return self.progress_bar_callback.get_metrics(self, ref_model) return self.progress_bar_metrics - @property - def _should_reload_dl_epoch(self) -> bool: - """Check if dataloader should be reloaded in the current epoch.""" - n_epochs = self.reload_dataloaders_every_n_epochs - return 
n_epochs and (not self.current_epoch % n_epochs) - @property def disable_validation(self) -> bool: """Check if validation is disabled during training.""" diff --git a/tests/deprecated_api/test_remove_1-6.py b/tests/deprecated_api/test_remove_1-6.py index 62791b482c186..144f84551105c 100644 --- a/tests/deprecated_api/test_remove_1-6.py +++ b/tests/deprecated_api/test_remove_1-6.py @@ -118,13 +118,12 @@ def test_v1_6_0_reload_dataloaders_every_epoch(tmpdir): limit_val_batches=0.3, reload_dataloaders_every_epoch=True, max_epochs=3, + num_sanity_val_steps=0, ) trainer.fit(model) trainer.test() - expected_sequence = ( - [call.val_dataloader()] + [call.train_dataloader(), call.val_dataloader()] * 3 + [call.test_dataloader()] - ) + expected_sequence = [call.train_dataloader(), call.val_dataloader()] * 3 + [call.test_dataloader()] assert tracker.mock_calls == expected_sequence diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 51c041fadd85e..452051e58ff65 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -878,7 +878,6 @@ def call(hook, fn, *args, **kwargs): *batch_transfer * batches, dict(name="train_dataloader"), *batch_transfer * batches, - dict(name="val_dataloader"), *batch_transfer * batches, dict( name="on_save_checkpoint", diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index 533eceb8018db..55956c954e497 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -1122,17 +1122,12 @@ def test_dataloaders_load_only_once(tmpdir): assert tracker.mock_calls == [call.val_dataloader(), call.train_dataloader()] -def test_dataloaders_load_only_once_val_interval(tmpdir): +def test_dataloaders_load_only_once_no_sanity_check(tmpdir): model = BoringModel() # logger file to get meta trainer = Trainer( - default_root_dir=tmpdir, - limit_train_batches=10, - limit_val_batches=10, - val_check_interval=0.3, - reload_dataloaders_every_n_epochs=True, - max_epochs=3, + 
default_root_dir=tmpdir, limit_train_batches=0.3, limit_val_batches=0.3, num_sanity_val_steps=0, max_epochs=3 ) tracker = Mock() @@ -1145,34 +1140,33 @@ def test_dataloaders_load_only_once_val_interval(tmpdir): tracker.attach_mock(model.test_dataloader, "test_dataloader") trainer.fit(model) - trainer.test(model) # verify the sequence - expected_sequence = [ - call.val_dataloader(), - call.train_dataloader(), - call.val_dataloader(), - call.val_dataloader(), - call.val_dataloader(), - call.train_dataloader(), - call.val_dataloader(), - call.val_dataloader(), - call.val_dataloader(), - call.train_dataloader(), - call.val_dataloader(), - call.val_dataloader(), - call.val_dataloader(), - call.test_dataloader(), - ] + expected_sequence = [call.train_dataloader(), call.val_dataloader()] assert tracker.mock_calls == expected_sequence -def test_dataloaders_load_only_once_no_sanity_check(tmpdir): - model = BoringModel() +@pytest.mark.parametrize("n", [1, 2]) +def test_dataloaders_load_every_n_epochs(tmpdir, n): + train_reload_epochs, val_reload_epochs = [], [] + + class TestModel(BoringModel): + def train_dataloader(self): + train_reload_epochs.append(self.current_epoch) + return super().train_dataloader() + + def val_dataloader(self): + val_reload_epochs.append(self.current_epoch) + return super().val_dataloader() + + model = TestModel() - # logger file to get meta trainer = Trainer( - default_root_dir=tmpdir, limit_train_batches=0.3, limit_val_batches=0.3, num_sanity_val_steps=0, max_epochs=3 + default_root_dir=tmpdir, + limit_train_batches=0.3, + limit_val_batches=0.3, + reload_dataloaders_every_n_epochs=n, + max_epochs=5, ) tracker = Mock() @@ -1185,44 +1179,113 @@ def test_dataloaders_load_only_once_no_sanity_check(tmpdir): tracker.attach_mock(model.test_dataloader, "test_dataloader") trainer.fit(model) + trainer.test(model) + + # Verify the sequence + expected_sequence = [call.val_dataloader(), call.train_dataloader()] # Sanity check first + if n == 1: + 
expected_sequence += [call.train_dataloader(), call.val_dataloader()] * 4 + elif n == 2: + expected_sequence += [call.train_dataloader(), call.val_dataloader()] * 2 + expected_sequence += [call.test_dataloader()] - # verify the sequence - expected_sequence = [call.train_dataloader(), call.val_dataloader()] assert tracker.mock_calls == expected_sequence + # Verify epoch of reloads + if n == 1: + assert train_reload_epochs == [0, 1, 2, 3, 4] + assert val_reload_epochs == [0, 1, 2, 3, 4] + elif n == 2: + assert train_reload_epochs == [0, 2, 4] + assert val_reload_epochs == [0, 2, 4] -@pytest.mark.parametrize("n", [1, 2]) -def test_dataloaders_load_every_n_epochs(tmpdir, n): - model = BoringModel() + +@pytest.mark.parametrize( + "n, train_reload_epochs_expect, val_reload_epochs_expect", + [ + # Sanity check at epoch 0 creates a validation dataloader, but validation is + # checked (and in this case reloaded) every n epochs starting from epoch n-1 + (3, [0, 2, 4, 6, 8], [0, 2, 5, 8]), + (5, [0, 2, 4, 6, 8], [0, 4, 9]), + ], +) +def test_dataloaders_load_every_n_epochs_infrequent_val( + tmpdir, n, train_reload_epochs_expect, val_reload_epochs_expect +): + """Test dataloader reload behavior when infrequently checking validation set (via check_val_every_n_epoch)""" + train_reload_epochs, val_reload_epochs = [], [] + + class TestModel(BoringModel): + def train_dataloader(self): + train_reload_epochs.append(self.current_epoch) + return super().train_dataloader() + + def val_dataloader(self): + val_reload_epochs.append(self.current_epoch) + return super().val_dataloader() + + model = TestModel() trainer = Trainer( default_root_dir=tmpdir, limit_train_batches=0.3, limit_val_batches=0.3, - reload_dataloaders_every_n_epochs=n, + check_val_every_n_epoch=n, + reload_dataloaders_every_n_epochs=2, + max_epochs=10, + ) + model.test_dataloader = Mock(wraps=model.test_dataloader) + + trainer.fit(model) + trainer.test(model) + + # Verify epoch of reloads + assert train_reload_epochs == 
train_reload_epochs_expect + assert val_reload_epochs == val_reload_epochs_expect + + model.test_dataloader.assert_called_once() + + +def test_dataloaders_load_every_n_epochs_frequent_val(tmpdir): + """Test dataloader reload behavior when frequently checking validation set (via val_check_interval)""" + train_reload_epochs, val_reload_epochs, val_check_epochs = [], [], [] + + class TestModel(BoringModel): + def train_dataloader(self): + train_reload_epochs.append(self.current_epoch) + return super().train_dataloader() + + def val_dataloader(self): + val_reload_epochs.append(self.current_epoch) + return super().val_dataloader() + + def validation_epoch_end(self, outputs): + val_check_epochs.append(self.current_epoch) + return super().validation_epoch_end(outputs) + + model = TestModel() + + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=0.3, + limit_val_batches=0.3, + val_check_interval=0.3, + reload_dataloaders_every_n_epochs=1, max_epochs=3, ) - tracker = Mock() - model.train_dataloader = Mock(wraps=model.train_dataloader) - model.val_dataloader = Mock(wraps=model.val_dataloader) model.test_dataloader = Mock(wraps=model.test_dataloader) - tracker.attach_mock(model.train_dataloader, "train_dataloader") - tracker.attach_mock(model.val_dataloader, "val_dataloader") - tracker.attach_mock(model.test_dataloader, "test_dataloader") - trainer.fit(model) trainer.test(model) - # verify the sequence - expected_sequence = [call.val_dataloader()] - if n == 1: - expected_sequence += [call.train_dataloader(), call.val_dataloader()] * 3 - elif n == 2: - expected_sequence += [call.train_dataloader(), call.val_dataloader()] * 2 - expected_sequence += [call.test_dataloader()] - assert tracker.mock_calls == expected_sequence + # Verify epoch of reloads + assert train_reload_epochs == [0, 1, 2] + assert val_reload_epochs == [0, 1, 2] + model.test_dataloader.assert_called_once() + + # Verify validation happens 3 times per epoch + 1 for sanity check + assert 
val_check_epochs == [0, 0, 0, 0, 1, 1, 1, 2, 2, 2] @pytest.mark.parametrize("n", ["test", -1]) @@ -1269,15 +1332,6 @@ def validation_step(self, batch, batch_idx): expected_calls = [ call.train_dataloader(), call.val_dataloader(), - # This has subsequent calls to val_dataloader - # because the training loop runs the evaluation loop, - # which reloads the val dataloader again. - # We cannot yet rely on trainer.current_epoch=0 to skip reloading - # the val dataloader on the first epoch because this only tracks the training epoch - # meaning multiple passes through the validation data within a single training epoch - # would not have the dataloader reloaded. - # This breaks the assumption behind reload_dataloaders_every_epoch=True - call.val_dataloader(), call.train_dataloader(), call.val_dataloader(), call.train_dataloader(), From b707c677eb06bd00e537a09e62ffa4bf55b4e30a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 5 Jan 2022 13:23:03 +0100 Subject: [PATCH 111/123] Fix min/max logging default value (#11310) Co-authored-by: Carlos Mocholi --- CHANGELOG.md | 1 + .../trainer/connectors/logger_connector/result.py | 8 +++++++- tests/core/test_metric_result_integration.py | 13 +++++++++++-- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 99429882c1ada..d38201f5103b9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Fixed - Fixed `LightningCLI` race condition while saving the config ([#11199](https://github.com/PyTorchLightning/pytorch-lightning/pull/11199)) +- Fixed the default value used with `log(reduce_fx=min|max)` ([#11310](https://github.com/PyTorchLightning/pytorch-lightning/pull/11310)) - Fixed data fetcher selection ([#11294](https://github.com/PyTorchLightning/pytorch-lightning/pull/11294)) - Fixed a race condition that could result in incorrect (zero) values being observed in prediction writer callbacks ([#11288](https://github.com/PyTorchLightning/pytorch-lightning/pull/11288)) - Fixed dataloaders not getting reloaded the correct amount of times when setting `reload_dataloaders_every_n_epochs` and `check_val_every_n_epoch` ([#10948](https://github.com/PyTorchLightning/pytorch-lightning/pull/10948)) diff --git a/pytorch_lightning/trainer/connectors/logger_connector/result.py b/pytorch_lightning/trainer/connectors/logger_connector/result.py index 8a3307a373998..ca7492655047a 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -208,8 +208,14 @@ def __init__(self, metadata: _Metadata, is_tensor: bool) -> None: self.meta = metadata self.has_reset = False if is_tensor: + if metadata.is_max_reduction: + default = float("-inf") + elif metadata.is_min_reduction: + default = float("inf") + else: + default = 0.0 # do not set a dtype in case the default dtype was changed - self.add_state("value", torch.tensor(0.0), dist_reduce_fx=torch.sum) + self.add_state("value", torch.tensor(default), dist_reduce_fx=torch.sum) if self.meta.is_mean_reduction: self.add_state("cumulated_batch_size", torch.tensor(0), dist_reduce_fx=torch.sum) diff --git a/tests/core/test_metric_result_integration.py b/tests/core/test_metric_result_integration.py index 0138e0d26359d..d660054d39d13 100644 --- a/tests/core/test_metric_result_integration.py +++ b/tests/core/test_metric_result_integration.py @@ 
-575,14 +575,14 @@ def test_metric_result_respects_dtype(floating_dtype): assert rm.cumulated_batch_size.dtype == fixed_dtype # two fixed point numbers - should be converted - value, batch_size = torch.tensor(2), torch.tensor(3) + value, batch_size = torch.tensor(2), 3 assert value.dtype == fixed_dtype with pytest.warns( UserWarning, match=rf"`self.log\('bar', ...\)` in your `foo` .* Converting it to {floating_dtype}" ): rm.update(value, batch_size) # floating and fixed - rm.update(torch.tensor(4.0), torch.tensor(5)) + rm.update(torch.tensor(4.0), 5) total = rm.compute() @@ -591,3 +591,12 @@ def test_metric_result_respects_dtype(floating_dtype): # restore to avoid impacting other tests torch.set_default_dtype(torch.float) + + +@pytest.mark.parametrize(["reduce_fx", "expected"], [(max, -2), (min, 2)]) +def test_result_metric_max_min(reduce_fx, expected): + metadata = _Metadata("foo", "bar", reduce_fx=reduce_fx) + metadata.sync = _Sync() + rm = ResultMetric(metadata, is_tensor=True) + rm.update(torch.tensor(expected), 1) + assert rm.compute() == expected From 15ee1f4c561c1aaea8aaceccca5aba98dbf47474 Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Tue, 11 Jan 2022 14:38:46 +0100 Subject: [PATCH 112/123] Update Version --- pytorch_lightning/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/__about__.py b/pytorch_lightning/__about__.py index 00574290706f4..3f0cb9ee1441b 100644 --- a/pytorch_lightning/__about__.py +++ b/pytorch_lightning/__about__.py @@ -1,7 +1,7 @@ import time _this_year = time.strftime("%Y") -__version__ = "1.5.8" +__version__ = "1.5.9" __author__ = "William Falcon et al." 
__author_email__ = "waf2107@columbia.edu" __license__ = "Apache-2.0" From dde425a3cac7991643f14a27af90a563359fc65a Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Tue, 11 Jan 2022 04:42:53 +0530 Subject: [PATCH 113/123] pin sphinx-autodoc-typehints version to v1.15 (#11400) --- requirements/docs.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/docs.txt b/requirements/docs.txt index 40b7b5919f90d..e9fea736f8b68 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -7,7 +7,7 @@ docutils>=0.16,<0.18 # Sphinx not yet compatible with docutils >= 0.18 sphinxcontrib-fulltoc>=1.0 sphinxcontrib-mockautodoc https://github.com/PyTorchLightning/lightning_sphinx_theme/archive/master.zip#egg=pt-lightning-sphinx-theme -sphinx-autodoc-typehints>=1.0 +sphinx-autodoc-typehints>=1.0,<1.15 # v1.15 failing on master (#11405) sphinx-paramlinks>=0.5.1 sphinx-togglebutton>=0.2 sphinx-copybutton>=0.3 From b5b11742c1ec6fb9c3a9652034004390ce8ed600 Mon Sep 17 00:00:00 2001 From: Aki Nitta Date: Fri, 7 Jan 2022 22:09:52 +0900 Subject: [PATCH 114/123] Skip testing with PyTorch 1.7 and Python 3.9 on Ubuntu (#11217) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- .github/workflows/ci_test-full.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml index ad600623dfba0..ad8e27e263c2d 100644 --- a/.github/workflows/ci_test-full.yml +++ b/.github/workflows/ci_test-full.yml @@ -28,6 +28,9 @@ jobs: - {os: macOS-10.15, python-version: "3.6", requires: "oldest", release: "stable"} # nightly: add when there's a release candidate #- {os: ubuntu-20.04, python-version: "3.10", requires: "latest", release: "pre"} + exclude: + # Skip if torch<1.8 and py3.9 on Linux: https://github.com/pytorch/pytorch/issues/50014 + - {os: ubuntu-18.04, python-version: "3.9", requires: "oldest", release: "stable"} timeout-minutes: 40 
From 18e95e055e65f9bdf885c0fdcc65f1808c2885dd Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Wed, 12 Jan 2022 02:19:51 +0530 Subject: [PATCH 115/123] release 1.5.9 --- CHANGELOG.md | 7 +++++++ pytorch_lightning/callbacks/lr_monitor.py | 2 ++ requirements.txt | 1 + 3 files changed, 10 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d38201f5103b9..9776d5fa85bf3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,13 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). +## [1.5.9] - 2022-01-11 + +### Fixed + +- Pin sphinx-autodoc-typehints with bool: self.last_momentum_values = {name + "-momentum": None for name in names_flatten} def on_train_batch_start(self, trainer: "pl.Trainer", *args: Any, **kwargs: Any) -> None: + assert trainer.logger is not None if not trainer.logger_connector.should_update_logs: return @@ -158,6 +159,7 @@ def on_train_batch_start(self, trainer: "pl.Trainer", *args: Any, **kwargs: Any) trainer.logger.log_metrics(latest_stat, step=trainer.global_step) def on_train_epoch_start(self, trainer: "pl.Trainer", *args: Any, **kwargs: Any) -> None: + assert trainer.logger is not None if self.logging_interval != "step": interval = "epoch" if self.logging_interval is None else "any" latest_stat = self._extract_stats(trainer, interval) diff --git a/requirements.txt b/requirements.txt index 34879d9290acb..dd34e9273c31e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,3 +11,4 @@ torchmetrics>=0.4.1 pyDeprecate==0.3.1 packaging>=17.0 typing-extensions +setuptools==59.5.0 # required for https://github.com/pytorch/pytorch/pull/69904 From 8cffc0f2a741b980708b3f0cb0e3c054eb952913 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 12 Jan 2022 09:09:36 +0100 Subject: [PATCH 116/123] Avoid in-place ops during logging result updates (#11401) Co-authored-by: rohitgr7 --- CHANGELOG.md | 3 ++- 
.../connectors/logger_connector/result.py | 8 +++++--- tests/core/test_metric_result_integration.py | 20 +++++++++++++++++++ 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9776d5fa85bf3..db46c604f00e4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,12 +4,13 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). -## [1.5.9] - 2022-01-11 +## [1.5.9] - 2022-01-18 ### Fixed - Pin sphinx-autodoc-typehints with None: # do not set a dtype in case the default dtype was changed self.add_state("value", torch.tensor(default), dist_reduce_fx=torch.sum) if self.meta.is_mean_reduction: + self.cumulated_batch_size: torch.Tensor self.add_state("cumulated_batch_size", torch.tensor(0), dist_reduce_fx=torch.sum) def update(self, value: _IN_METRIC, batch_size: int) -> None: @@ -240,12 +241,13 @@ def update(self, value: _IN_METRIC, batch_size: int) -> None: # perform accumulation with reduction if self.meta.is_mean_reduction: - self.value += value.mean() * batch_size - self.cumulated_batch_size += batch_size + # do not use `+=` as it doesn't do type promotion + self.value = self.value + value.mean() * batch_size + self.cumulated_batch_size = self.cumulated_batch_size + batch_size elif self.meta.is_max_reduction or self.meta.is_min_reduction: self.value = self.meta.reduce_fx(self.value, value.mean()) elif self.meta.is_sum_reduction: - self.value += value.mean() + self.value = self.value + value.mean() else: self.value = value self._forward_cache = value._forward_cache diff --git a/tests/core/test_metric_result_integration.py b/tests/core/test_metric_result_integration.py index d660054d39d13..85149a78211f6 100644 --- a/tests/core/test_metric_result_integration.py +++ b/tests/core/test_metric_result_integration.py @@ -593,6 +593,26 @@ def test_metric_result_respects_dtype(floating_dtype): torch.set_default_dtype(torch.float) 
+@pytest.mark.parametrize("reduce_fx", ("mean", sum)) +def test_metric_result_dtype_promotion(reduce_fx): + metadata = _Metadata("foo", "bar", reduce_fx=reduce_fx) + metadata.sync = _Sync() + rm = ResultMetric(metadata, is_tensor=True) + assert rm.value.dtype == torch.float + + # log a double + rm.update(torch.tensor(0, dtype=torch.double), 1) + # `rm.value.dtype` is promoted + assert rm.value.dtype == torch.double + # log a float + rm.update(torch.tensor(0, dtype=torch.float), 1) + # the previous dtype stays + assert rm.value.dtype == torch.double + + total = rm.compute() + assert total.dtype == torch.double + + @pytest.mark.parametrize(["reduce_fx", "expected"], [(max, -2), (min, 2)]) def test_result_metric_max_min(reduce_fx, expected): metadata = _Metadata("foo", "bar", reduce_fx=reduce_fx) From e95d8b1c984bbbe3f0c698e6e8126d561b7b5529 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 16 Jan 2022 03:36:57 +0100 Subject: [PATCH 117/123] Modify LSFEnvironment to use more reliable environment variable (#10825) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: thomas chaton Co-authored-by: Carlos Mocholí Co-authored-by: Adrian Wälchli Co-authored-by: Jirka Borovec Co-authored-by: Andrew Tritt --- CHANGELOG.md | 4 + .../plugins/environments/lsf_environment.py | 188 +++++++++++------- .../environments/test_lsf_environment.py | 121 +++++++---- 3 files changed, 197 insertions(+), 116 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index db46c604f00e4..6a57a508c6cec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Skip testing with PyTorch 1.7 and Python 3.9 on Ubuntu ([#11217](https://github.com/PyTorchLightning/pytorch-lightning/pull/11217)) - Fixed type promotion when tensors of higher category than float are logged ([#11401](https://github.com/PyTorchLightning/pytorch-lightning/pull/11401)) +### Changed + +- Changed `LSFEnvironment` to use `LSB_DJOB_RANKFILE` environment variable instead of `LSB_HOSTS` for determining node rank and main address ([#10825](https://github.com/PyTorchLightning/pytorch-lightning/pull/10825)) + ## [1.5.8] - 2022-01-05 diff --git a/pytorch_lightning/plugins/environments/lsf_environment.py b/pytorch_lightning/plugins/environments/lsf_environment.py index 06563c7f017bb..4d2ead915ed0c 100644 --- a/pytorch_lightning/plugins/environments/lsf_environment.py +++ b/pytorch_lightning/plugins/environments/lsf_environment.py @@ -14,9 +14,12 @@ import os import socket +from typing import Dict, List from pytorch_lightning import _logger as log from pytorch_lightning.plugins.environments import ClusterEnvironment +from pytorch_lightning.utilities import rank_zero_deprecation +from pytorch_lightning.utilities.cloud_io import get_filesystem class LSFEnvironment(ClusterEnvironment): @@ -25,128 +28,161 @@ class LSFEnvironment(ClusterEnvironment): It is expected that any execution using this ClusterEnvironment was executed using the Job Step Manager i.e. ``jsrun``. - This plugin expects the following environment variables. + This plugin expects the following environment variables: - LSB_JOBID: - The LSF assigned job ID + ``LSB_JOBID`` + The LSF assigned job ID - LSB_HOSTS: - The hosts used in the job. This string is expected to have the format "batch ...." + ``LSB_DJOB_RANKFILE`` + The OpenMPI compatibile rank file for the LSF job - JSM_NAMESPACE_LOCAL_RANK: - The node local rank for the task. This environment variable is set by jsrun + ``JSM_NAMESPACE_LOCAL_RANK`` + The node local rank for the task. 
This environment variable is set by ``jsrun`` - JSM_NAMESPACE_SIZE: - The world size for the task. This environment variable is set by jsrun - """ + ``JSM_NAMESPACE_SIZE`` + The world size for the task. This environment variable is set by ``jsrun`` - def __init__(self): - self._master_address = self._get_master_address() - self._master_port = self._get_master_port() - log.debug(f"MASTER_ADDR: {self._master_address}") - log.debug(f"MASTER_PORT: {self._master_port}") + ``JSM_NAMESPACE_RANK`` + The global rank for the task. This environment variable is set by ``jsrun`` + """ - @staticmethod - def is_using_lsf() -> bool: - """Returns ``True`` if the current process was launched using the jsrun command.""" - required_env_vars = ("LSB_JOBID", "LSB_HOSTS", "JSM_NAMESPACE_LOCAL_RANK", "JSM_NAMESPACE_SIZE") - return all(v in os.environ for v in required_env_vars) + def __init__(self) -> None: + super().__init__() + # TODO: remove in 1.7 + if hasattr(self, "is_using_lsf") and callable(self.is_using_lsf): + rank_zero_deprecation( + f"`{self.__class__.__name__}.is_using_lsf` has been deprecated in v1.6 and will be removed in v1.7." + " Implement the static method `detect()` instead (do not forget to add the `@staticmethod` decorator)." 
+ ) + self._main_address = self._get_main_address() + self._main_port = self._get_main_port() + self._node_rank = self._get_node_rank() + self._set_init_progress_group_env_vars() + + def _set_init_progress_group_env_vars(self) -> None: + # set environment variables needed for initializing torch distributed process group + os.environ["MASTER_ADDR"] = str(self._main_address) + log.debug(f"MASTER_ADDR: {os.environ['MASTER_ADDR']}") + os.environ["MASTER_PORT"] = str(self._main_port) + log.debug(f"MASTER_PORT: {os.environ['MASTER_PORT']}") @property def creates_processes_externally(self) -> bool: + """LSF creates subprocesses, i.e., PyTorch Lightning does not need to spawn them.""" return True - def master_address(self): - """The master address is read from a list of hosts contained in the environment variable `LSB_HOSTS`.""" - return self._master_address + def master_address(self) -> str: + """The main address is read from an OpenMPI host rank file in the environment variable + ``LSB_DJOB_RANKFILE``.""" + return self._main_address + + def master_port(self) -> int: + """The main port is calculated from the LSF job ID.""" + return self._main_port - def master_port(self): - """THe master port gets calculated from the LSF job ID.""" - return self._master_port + @staticmethod + def is_using_lsf() -> bool: + """Returns ``True`` if the current process was launched using the ``jsrun`` command.""" + required_env_vars = {"LSB_JOBID", "LSB_DJOB_RANKFILE", "JSM_NAMESPACE_LOCAL_RANK", "JSM_NAMESPACE_SIZE"} + return required_env_vars.issubset(os.environ.keys()) - def world_size(self): - """The world size is read from the environment variable `JSM_NAMESPACE_SIZE`.""" - var = "JSM_NAMESPACE_SIZE" - world_size = os.environ.get(var) + def world_size(self) -> int: + """The world size is read from the environment variable ``JSM_NAMESPACE_SIZE``.""" + world_size = os.environ.get("JSM_NAMESPACE_SIZE") if world_size is None: raise ValueError( - f"Cannot determine world size from environment 
variable {var}." - " Make sure you run your executable with `jsrun`" + "Cannot determine world size. Environment variable `JSM_NAMESPACE_SIZE` not found." + "Make sure you run your executable with `jsrun`." ) return int(world_size) def set_world_size(self, size: int) -> None: log.debug("LSFEnvironment.set_world_size was called, but setting world size is not allowed. Ignored.") - def global_rank(self): - """The world size is read from the environment variable `JSM_NAMESPACE_RANK`.""" - var = "JSM_NAMESPACE_RANK" - global_rank = os.environ.get(var) + def global_rank(self) -> int: + """The global rank is read from the environment variable ``JSM_NAMESPACE_RANK``.""" + global_rank = os.environ.get("JSM_NAMESPACE_RANK") if global_rank is None: raise ValueError( - f"Cannot determine global rank from environment variable {var}." - " Make sure you run your executable with `jsrun`" + "Cannot determine global rank. Environment variable `JSM_NAMESPACE_RANK` not found." + "Make sure you run your executable with `jsrun`." ) return int(global_rank) def set_global_rank(self, rank: int) -> None: log.debug("LSFEnvironment.set_global_rank was called, but setting global rank is not allowed. Ignored.") - def local_rank(self): + def local_rank(self) -> int: """The local rank is read from the environment variable `JSM_NAMESPACE_LOCAL_RANK`.""" - var = "JSM_NAMESPACE_LOCAL_RANK" - local_rank = os.environ.get(var) + local_rank = os.environ.get("JSM_NAMESPACE_LOCAL_RANK") if local_rank is None: raise ValueError( - f"Cannot determine local rank from environment variable {var}." - " Make sure you run your executable with `jsrun`" + "Cannot determine local rank. Environment variable `JSM_NAMESPACE_LOCAL_RANK` not found." + "Make sure you run your executable with `jsrun`."
) return int(local_rank) - def node_rank(self): - """The node rank is determined by the position of the current hostname in the list of hosts stored in the - environment variable `LSB_HOSTS`.""" + def node_rank(self) -> int: + """The node rank is determined by the position of the current hostname in the OpenMPI host rank file stored + in ``LSB_DJOB_RANKFILE``.""" + return self._node_rank + + def _get_node_rank(self) -> int: + """A helper method for getting the node rank. + + The node rank is determined by the position of the current node in the list of hosts used in the job. This is + calculated by reading all hosts from ``LSB_DJOB_RANKFILE`` and finding this node's hostname in the list. + """ hosts = self._read_hosts() - count = {} + count: Dict[str, int] = {} for host in hosts: - if "batch" in host or "login" in host: - continue if host not in count: count[host] = len(count) return count[socket.gethostname()] @staticmethod - def _read_hosts(): - hosts = os.environ.get("LSB_HOSTS") - if not hosts: - raise ValueError("Could not find hosts in environment variable LSB_HOSTS") - hosts = hosts.split() - if len(hosts) < 2: - raise ValueError( - 'Cannot parse hosts from LSB_HOSTS environment variable. Expected format: "batch ..."' - ) - return hosts + def _read_hosts() -> List[str]: + """Read compute hosts that are a part of the compute job. - def _get_master_address(self): + LSF uses the Job Step Manager (JSM) to manage job steps. Job steps are executed by the JSM from "launch" nodes. + Each job is assigned a launch node. This launch node will be the first node in the list contained in + ``LSB_DJOB_RANKFILE``. 
+ """ + var = "LSB_DJOB_RANKFILE" + rankfile = os.environ.get(var) + if rankfile is None: + raise ValueError("Did not find the environment variable `LSB_DJOB_RANKFILE`") + if not rankfile: + raise ValueError("The environment variable `LSB_DJOB_RANKFILE` is empty") + + fs = get_filesystem(rankfile) + with fs.open(rankfile, "r") as f: + ret = [line.strip() for line in f] + # remove the launch node (i.e. the first node in LSB_DJOB_RANKFILE) from the list + return ret[1:] + + def _get_main_address(self) -> str: + """A helper for getting the main address. + + The main address is assigned to the first node in the list of nodes used for the job. + """ hosts = self._read_hosts() - return hosts[1] + return hosts[0] @staticmethod - def _get_master_port(): - """A helper function for accessing the master port. + def _get_main_port() -> int: + """A helper function for accessing the main port. - Uses the LSF job ID so all ranks can compute the master port. + Uses the LSF job ID so all ranks can compute the main port. 
""" - # check for user-specified master port - port = os.environ.get("MASTER_PORT") - if not port: - jobid = os.environ.get("LSB_JOBID") - if not jobid: - raise ValueError("Could not find job id in environment variable LSB_JOBID") - port = int(jobid) + # check for user-specified main port + if "MASTER_PORT" in os.environ: + log.debug(f"Using externally specified main port: {os.environ['MASTER_PORT']}") + return int(os.environ["MASTER_PORT"]) + if "LSB_JOBID" in os.environ: + port = int(os.environ["LSB_JOBID"]) # all ports should be in the 10k+ range - port = int(port) % 1000 + 10000 - log.debug(f"calculated LSF master port: {port}") - else: - log.debug(f"using externally specified master port: {port}") - return int(port) + port = port % 1000 + 10000 + log.debug(f"calculated LSF main port: {port}") + return port + raise ValueError("Could not find job id in environment variable LSB_JOBID") diff --git a/tests/plugins/environments/test_lsf_environment.py b/tests/plugins/environments/test_lsf_environment.py index e3a5a67ba4be2..35cdcb4580e8d 100644 --- a/tests/plugins/environments/test_lsf_environment.py +++ b/tests/plugins/environments/test_lsf_environment.py @@ -19,57 +19,98 @@ from pytorch_lightning.plugins.environments import LSFEnvironment -@mock.patch.dict(os.environ, {"LSB_HOSTS": "batch 10.10.10.0 10.10.10.1", "LSB_JOBID": "1234"}) -def test_missing_lsb_hosts(): - """Test an error when the lsb hosts list cannot be found.""" - del os.environ["LSB_HOSTS"] - with pytest.raises(ValueError, match="Could not find hosts in environment variable LSB_HOSTS"): +def _make_rankfile(tmp_path): + hosts = "batch\n10.10.10.0\n10.10.10.1\n10.10.10.2\n10.10.10.3" + p = tmp_path / "lsb_djob_rankfile" + p.write_text(hosts) + return str(p) + + +@mock.patch.dict(os.environ, {"LSB_JOBID": "1234"}) +def test_missing_lsb_djob_rankfile(): + """Test an error when the LSB_DJOB_RANKFILE cannot be found.""" + with pytest.raises(ValueError, match="Did not find the environment variable 
`LSB_DJOB_RANKFILE`"): + LSFEnvironment() + + +@mock.patch.dict(os.environ, {"LSB_DJOB_RANKFILE": "", "LSB_JOBID": "1234"}) +def test_empty_lsb_djob_rankfile(): + """Test an error when the LSB_DJOB_RANKFILE is not populated.""" + with pytest.raises(ValueError, match="The environment variable `LSB_DJOB_RANKFILE` is empty"): LSFEnvironment() -@mock.patch.dict(os.environ, {"LSB_HOSTS": "batch 10.10.10.0 10.10.10.1", "LSB_JOBID": "1234"}) -def test_missing_lsb_job_id(): +def test_missing_lsb_job_id(tmp_path): """Test an error when the job id cannot be found.""" - del os.environ["LSB_JOBID"] - with pytest.raises(ValueError, match="Could not find job id in environment variable LSB_JOBID"): + with mock.patch.dict(os.environ, {"LSB_DJOB_RANKFILE": _make_rankfile(tmp_path)}), pytest.raises( + ValueError, match="Could not find job id in environment variable LSB_JOBID" + ): LSFEnvironment() -@mock.patch.dict(os.environ, {"MASTER_PORT": "4321", "LSB_JOBID": "1234", "LSB_HOSTS": "batch 10.10.10.0 10.10.10.1"}) -def test_manual_master_port_and_address(): +def test_manual_main_port_and_address(tmp_path): """Test a user can set the port manually through the MASTER_PORT env variable.""" - env = LSFEnvironment() - assert env.master_port() == 4321 + environ = { + "LSB_DJOB_RANKFILE": _make_rankfile(tmp_path), + "LSB_JOBID": "1234", + "JSM_NAMESPACE_SIZE": "4", + "JSM_NAMESPACE_RANK": "3", + "JSM_NAMESPACE_LOCAL_RANK": "1", + } + with mock.patch.dict(os.environ, environ), mock.patch("socket.gethostname", return_value="10.10.10.2"): + env = LSFEnvironment() + assert env.master_port() == 10234 + +def test_attributes_from_environment_variables(tmp_path): + """Test that the LSF environment takes the attributes from the environment variables.""" + environ = { + "LSB_DJOB_RANKFILE": _make_rankfile(tmp_path), + "LSB_JOBID": "1234", + "JSM_NAMESPACE_SIZE": "4", + "JSM_NAMESPACE_RANK": "3", + "JSM_NAMESPACE_LOCAL_RANK": "1", + } + with mock.patch.dict(os.environ, environ), 
mock.patch("socket.gethostname", return_value="10.10.10.2"): + env = LSFEnvironment() + assert env.creates_processes_externally + assert env.master_address() == "10.10.10.0" + assert env.master_port() == 10234 + assert env.world_size() == 4 + assert env.global_rank() == 3 + assert env.local_rank() == 1 + env.set_global_rank(100) + assert env.global_rank() == 3 + env.set_world_size(100) + assert env.world_size() == 4 + assert LSFEnvironment.is_using_lsf() -@mock.patch.dict( - os.environ, - { - "LSB_HOSTS": "batch 10.10.10.0 10.10.10.1 10.10.10.2 10.10.10.3", + +def test_node_rank(tmp_path): + environ = { + "LSB_DJOB_RANKFILE": _make_rankfile(tmp_path), "LSB_JOBID": "1234", "JSM_NAMESPACE_SIZE": "4", "JSM_NAMESPACE_RANK": "3", "JSM_NAMESPACE_LOCAL_RANK": "1", - }, -) -def test_attributes_from_environment_variables(): - """Test that the LSF environment takes the attributes from the environment variables.""" - env = LSFEnvironment() - assert env.creates_processes_externally - assert env.master_address() == "10.10.10.0" - assert env.master_port() == 10234 - assert env.world_size() == 4 - assert env.global_rank() == 3 - assert env.local_rank() == 1 - env.set_global_rank(100) - assert env.global_rank() == 3 - env.set_world_size(100) - assert env.world_size() == 4 - assert LSFEnvironment.is_using_lsf() - - -@mock.patch("socket.gethostname", return_value="host2") -@mock.patch.dict(os.environ, {"LSB_HOSTS": "batch host0 host1 host2 host3", "LSB_JOBID": "1234"}) -def test_node_rank(_): - env = LSFEnvironment() - assert env.node_rank() == 2 + } + with mock.patch.dict(os.environ, environ), mock.patch("socket.gethostname", return_value="10.10.10.2"): + env = LSFEnvironment() + assert env.node_rank() == 2 + + +def test_detect(): + """Test the detection of a LSF environment configuration.""" + with mock.patch.dict(os.environ, {}): + assert not LSFEnvironment.is_using_lsf() + + with mock.patch.dict( + os.environ, + { + "LSB_DJOB_RANKFILE": "", + "LSB_JOBID": "", + 
"JSM_NAMESPACE_SIZE": "", + "JSM_NAMESPACE_LOCAL_RANK": "", + }, + ): + assert LSFEnvironment.is_using_lsf() From f72bf31e6afee8acea005cb9bcb1e36889ea160f Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Tue, 18 Jan 2022 04:03:57 +0530 Subject: [PATCH 118/123] Disable attaching samplers when using `IterableDataset` (#11507) --- CHANGELOG.md | 3 +++ pytorch_lightning/trainer/data_loading.py | 10 +++++++--- tests/trainer/test_data_loading.py | 16 +++++++++++++++- 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6a57a508c6cec..3c12f0f44539a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Changed `LSFEnvironment` to use `LSB_DJOB_RANKFILE` environment variable instead of `LSB_HOSTS` for determining node rank and main address ([#10825](https://github.com/PyTorchLightning/pytorch-lightning/pull/10825)) +- Disbled sampler replacement when using `IterableDataset` ([#11507](https://github.com/PyTorchLightning/pytorch-lightning/pull/11507)) + + ## [1.5.8] - 2022-01-05 ### Fixed diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 1662c1b3aaa92..fdeddcbca1e50 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -272,9 +272,13 @@ def _get_dataloader_init_kwargs( # kwargs to re-construct the dataloader dl_kwargs = {k: v for k, v in attrs.items() if k in non_defaults} - dl_kwargs.update( - TrainerDataLoadingMixin._dataloader_init_kwargs_resolve_sampler(dataloader, sampler, mode=mode) - ) + if isinstance(dl_kwargs["dataset"], IterableDataset): + dl_kwargs["batch_sampler"] = None + dl_kwargs["sampler"] = None + else: + dl_kwargs.update( + TrainerDataLoadingMixin._dataloader_init_kwargs_resolve_sampler(dataloader, sampler, mode=mode) + ) required_args = { p.name diff --git a/tests/trainer/test_data_loading.py 
b/tests/trainer/test_data_loading.py index 9b1e5ca45e655..8f745db4b8400 100644 --- a/tests/trainer/test_data_loading.py +++ b/tests/trainer/test_data_loading.py @@ -20,11 +20,12 @@ from torch.utils.data.sampler import BatchSampler, Sampler, SequentialSampler from pytorch_lightning import Trainer +from pytorch_lightning.trainer.data_loading import TrainerDataLoadingMixin from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.trainer.supporters import CombinedLoader from pytorch_lightning.utilities.enums import DistributedType from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.helpers import BoringModel, RandomDataset +from tests.helpers.boring_model import BoringModel, RandomDataset, RandomIterableDataset from tests.helpers.runif import RunIf @@ -389,3 +390,16 @@ def test_non_sequential_sampler_warning_is_raised_for_eval_dataloader(val_dl): trainer._data_connector.attach_data(model, val_dataloaders=val_dl) with pytest.warns(UserWarning, match="recommended .* turn this off for val/test/predict"): trainer._reset_eval_dataloader(RunningStage.VALIDATING, model) + + +@pytest.mark.parametrize("mode", [RunningStage.TRAINING, RunningStage.PREDICTING, RunningStage.TESTING]) +def test_dataloader_kwargs_replacement_with_iterable_dataset(mode): + """Test that DataLoader kwargs are not replaced when using Iterable Dataset.""" + dataset = RandomIterableDataset(7, 100) + dataloader = DataLoader(dataset, batch_size=32) + dl_kwargs = TrainerDataLoadingMixin._get_dataloader_init_kwargs(dataloader, dataloader.sampler, mode=mode) + assert dl_kwargs["sampler"] is None + assert dl_kwargs["batch_sampler"] is None + assert dl_kwargs["batch_size"] is dataloader.batch_size + assert dl_kwargs["dataset"] is dataloader.dataset + assert dl_kwargs["collate_fn"] is dataloader.collate_fn From 9811284684359e97e33f7be88e65ac5572ee951a Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 18 Jan 2022 15:54:37 +0100 Subject: [PATCH 
119/123] Pin jsonargparse maximum version --- requirements/extra.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/extra.txt b/requirements/extra.txt index babaffca6280d..74743185e880c 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -5,6 +5,6 @@ horovod>=0.21.2 # no need to install with [pytorch] as pytorch is already insta torchtext>=0.8.* omegaconf>=2.0.5 hydra-core>=1.0.5 -jsonargparse[signatures]>=4.0.4 +jsonargparse[signatures]>=4.0.4,<5.0.0 gcsfs>=2021.5.0 rich>=10.2.2 From 6d5e22d034a464575d624f37263fbed08e64f1bf Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 19 Jan 2022 16:29:24 +0100 Subject: [PATCH 120/123] [CLI] Save only the configuration used (#11532) --- CHANGELOG.md | 3 +-- pytorch_lightning/utilities/cli.py | 8 +++----- tests/utilities/test_cli.py | 8 ++++---- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3c12f0f44539a..665a0887afbc0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,12 +11,11 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Pin sphinx-autodoc-typehints with Any: """Utility to get a config value which might be inside a subcommand.""" - if self.subcommand is not None: - return config[self.subcommand].get(key, default) - return config.get(key, default) + return config.get(str(self.subcommand), config).get(key, default) def _run_subcommand(self, subcommand: str) -> None: """Run the chosen subcommand.""" diff --git a/tests/utilities/test_cli.py b/tests/utilities/test_cli.py index 58903a799cc61..dd2132d40c63d 100644 --- a/tests/utilities/test_cli.py +++ b/tests/utilities/test_cli.py @@ -347,9 +347,7 @@ def test_lightning_cli_args(tmpdir): with open(config_path) as f: loaded_config = yaml.safe_load(f.read()) - loaded_config = loaded_config["fit"] cli_config = cli.config["fit"] - assert cli_config["seed_everything"] == 1234 assert "model" not in loaded_config and "model" not in cli_config # no arguments to include assert loaded_config["data"] == cli_config["data"] @@ -403,9 +401,7 @@ def test_lightning_cli_config_and_subclass_mode(tmpdir): with open(config_path) as f: loaded_config = yaml.safe_load(f.read()) - loaded_config = loaded_config["fit"] cli_config = cli.config["fit"] - assert loaded_config["model"] == cli_config["model"] assert loaded_config["data"] == cli_config["data"] assert loaded_config["trainer"] == cli_config["trainer"] @@ -1251,6 +1247,10 @@ def test_lightning_cli_config_before_subcommand(): test_mock.assert_called_once_with(cli.trainer, model=cli.model, verbose=True, ckpt_path="foobar") assert cli.trainer.limit_test_batches == 1 + save_config_callback = cli.trainer.callbacks[0] + assert save_config_callback.config["trainer"]["limit_test_batches"] == 1 + assert save_config_callback.parser.subcommand == "test" + with mock.patch("sys.argv", ["any.py", f"--config={config}", "validate"]), mock.patch( "pytorch_lightning.Trainer.validate", autospec=True ) as validate_mock: From a6394ea82850d02f8b5825297c4bd847d016e7db Mon Sep 17 00:00:00 2001 From: Aki Nitta Date: Wed, 12 Jan 
2022 13:31:56 +0900 Subject: [PATCH 121/123] Don't use old testing packages in CI (#11366) --- .github/set-min-requirements.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/set-min-requirements.py b/.github/set-min-requirements.py index b67ba224662ab..e5162293e2b4c 100644 --- a/.github/set-min-requirements.py +++ b/.github/set-min-requirements.py @@ -2,8 +2,8 @@ "requirements.txt", "requirements/extra.txt", "requirements/loggers.txt", - "requirements/test.txt", "requirements/examples.txt", + # "requirements/test.txt", # Don't use old testing packages ) From ab1c2ff23fd27ab6e5647e390313b569e32b61c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 19 Jan 2022 19:30:30 +0100 Subject: [PATCH 122/123] Update CHANGELOG.md Co-authored-by: ananthsub --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 665a0887afbc0..5caef5f6509e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,7 +16,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Changed - Changed `LSFEnvironment` to use `LSB_DJOB_RANKFILE` environment variable instead of `LSB_HOSTS` for determining node rank and main address ([#10825](https://github.com/PyTorchLightning/pytorch-lightning/pull/10825)) -- Disbled sampler replacement when using `IterableDataset` ([#11507](https://github.com/PyTorchLightning/pytorch-lightning/pull/11507)) +- Disabled sampler replacement when using `IterableDataset` ([#11507](https://github.com/PyTorchLightning/pytorch-lightning/pull/11507)) ## [1.5.8] - 2022-01-05 From 7acebbba32e193b7dd0ff104181267e61af83a8b Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Mon, 7 Feb 2022 19:26:29 +0530 Subject: [PATCH 123/123] update version --- CHANGELOG.md | 11 +++++++++++ pytorch_lightning/__about__.py | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5caef5f6509e3..74b19da46d05b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,16 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). +## [1.5.10] - 2022-02-08 + +### Fixed + +- + + +- + + ## [1.5.9] - 2022-01-18 ### Fixed @@ -29,6 +39,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed a race condition that could result in incorrect (zero) values being observed in prediction writer callbacks ([#11288](https://github.com/PyTorchLightning/pytorch-lightning/pull/11288)) - Fixed dataloaders not getting reloaded the correct amount of times when setting `reload_dataloaders_every_n_epochs` and `check_val_every_n_epoch` ([#10948](https://github.com/PyTorchLightning/pytorch-lightning/pull/10948)) + ## [1.5.7] - 2021-12-21 ### Fixed diff --git a/pytorch_lightning/__about__.py b/pytorch_lightning/__about__.py index 3f0cb9ee1441b..83335912ce29e 100644 --- a/pytorch_lightning/__about__.py +++ b/pytorch_lightning/__about__.py @@ -1,7 +1,7 @@ import time _this_year = time.strftime("%Y") -__version__ = "1.5.9" +__version__ = "1.5.10" __author__ = "William Falcon et al." __author_email__ = "waf2107@columbia.edu" __license__ = "Apache-2.0"