diff --git a/.actions/pull_legacy_checkpoints.sh b/.actions/pull_legacy_checkpoints.sh new file mode 100644 index 0000000000000..8b3f791297b66 --- /dev/null +++ b/.actions/pull_legacy_checkpoints.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# Run this script from the project root. +URL="https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip" +mkdir -p legacy +# wget is simpler but does not work on Windows +python -c "from urllib.request import urlretrieve; urlretrieve('$URL', 'legacy/checkpoints.zip')" +ls -l legacy/ +unzip -o legacy/checkpoints.zip -d legacy/ +ls -l legacy/checkpoints/ diff --git a/.azure-pipelines/gpu-benchmark.yml b/.azure/gpu-benchmark.yml similarity index 97% rename from .azure-pipelines/gpu-benchmark.yml rename to .azure/gpu-benchmark.yml index cfccbf7081f14..451f5b5646dca 100644 --- a/.azure-pipelines/gpu-benchmark.yml +++ b/.azure/gpu-benchmark.yml @@ -26,7 +26,7 @@ jobs: - job: benchmarks timeoutInMinutes: "90" cancelTimeoutInMinutes: "2" - pool: azure-gpus-spot + pool: azure-jirka-spot container: image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11" options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g" diff --git a/.azure-pipelines/gpu-tests.yml b/.azure/gpu-tests.yml similarity index 95% rename from .azure-pipelines/gpu-tests.yml rename to .azure/gpu-tests.yml index c0074adf4e81c..5ec9db1297b43 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -29,7 +29,7 @@ jobs: # how much time to give 'run always even if cancelled tasks' before stopping them cancelTimeoutInMinutes: "2" - pool: azure-gpus-spot + pool: azure-jirka-spot container: image: $(image) @@ -69,10 +69,7 @@ jobs: python requirements/check-avail-extras.py displayName: 'Env details' - - bash: | - wget https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip -P legacy/ - unzip -o legacy/checkpoints.zip -d legacy/ - ls -l legacy/checkpoints/ + - bash: bash .actions/pull_legacy_checkpoints.sh displayName: 'Get legacy checkpoints' - bash: | diff --git a/.azure-pipelines/hpu-tests.yml b/.azure/hpu-tests.yml similarity index 100% rename from .azure-pipelines/hpu-tests.yml rename to .azure/hpu-tests.yml diff --git a/.azure-pipelines/ipu-tests.yml b/.azure/ipu-tests.yml similarity index 100% rename from .azure-pipelines/ipu-tests.yml rename to .azure/ipu-tests.yml diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml index 94d5be6901c5a..c9ed112516ff4 100644 --- a/.github/workflows/ci_dockers.yml +++ b/.github/workflows/ci_dockers.yml @@ -97,7 +97,7 @@ jobs: UBUNTU_VERSION=${{ matrix.ubuntu_version }} file: dockers/base-cuda/Dockerfile push: false - timeout-minutes: 75 + timeout-minutes: 95 build-Conda: runs-on: ubuntu-20.04 @@ -123,7 +123,7 @@ jobs: CUDA_VERSION=${{ matrix.cuda_version }} file: dockers/base-conda/Dockerfile push: false - timeout-minutes: 75 + timeout-minutes: 95 build-ipu: runs-on: ubuntu-20.04 diff --git a/.github/workflows/ci_schema.yml b/.github/workflows/ci_schema.yml index 54efaff27a201..03b230124085d 100644 --- a/.github/workflows/ci_schema.yml +++ b/.github/workflows/ci_schema.yml @@ -21,4 +21,4 @@ jobs: - name: Azure Pipelines env: SCHEMA_FILE: https://raw.githubusercontent.com/microsoft/azure-pipelines-vscode/v1.204.0/service-schema.json - run: check-jsonschema .azure-pipelines/*.yml --schemafile "$SCHEMA_FILE" + run: check-jsonschema .azure/*.yml --schemafile "$SCHEMA_FILE" diff --git a/.github/workflows/ci_test-base.yml b/.github/workflows/ci_test-base.yml deleted file mode 100644 index 221f92d26ad5e..0000000000000 --- a/.github/workflows/ci_test-base.yml +++ /dev/null @@ -1,81 +0,0 @@ -# this jobs runs `pytest` over the source directory. It does not install any extra dependencies. -# this is useful to catch errors where an import has been added which is not part of the basic dependencies. -name: Test simple - -# see: https://help.github.com/en/actions/reference/events-that-trigger-workflows -on: # Trigger the workflow on push or pull request, but only for the master branch - push: - branches: [master, "release/*"] - pull_request: - branches: [master, "release/*"] - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} - cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} - -jobs: - source: - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-20.04] - # this will install stable torch - python-version: [3.9] - - # lower timeout as this should run very quickly - timeout-minutes: 20 - steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - name: Reset caching - run: python -c "import time; days = time.time() / 60 / 60 / 24; print(f'TIME_PERIOD=d{int(days / 2) * 2}')" >> $GITHUB_ENV - - # Note: This uses an internal pip API and may not always work - # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow - - name: Get pip cache - id: pip-cache - run: python -c "from pip._internal.locations import USER_CACHE_DIR; print('::set-output name=dir::' + USER_CACHE_DIR)" - - - name: Cache pip - uses: actions/cache@v2 - with: - path: ${{ steps.pip-cache.outputs.dir }} - key: ${{ runner.os }}-pip-td${{ env.TIME_PERIOD }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ hashFiles('requirements/base.txt') }} - restore-keys: ${{ runner.os }}-pip-td${{ env.TIME_PERIOD }}-py${{ matrix.python-version }}-${{ matrix.requires }}- - - - name: Install dependencies - run: | - python --version - python -m pip install --upgrade --user pip - pip --version - pip install -r requirements.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade - pip install -r requirements/test.txt - pip list - shell: bash - - - name: Test Package [only] - run: | - coverage run --source pytorch_lightning -m pytest pytorch_lightning -v - - - name: Statistics - if: success() - run: | - coverage report - coverage xml - - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1 - if: always() - # see: https://github.com/actions/toolkit/issues/399 - continue-on-error: true - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: coverage.xml - flags: cpu,pytest - name: Base-coverage - fail_ci_if_error: false diff --git a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml index c907e13b4ac23..1064c7a095d54 100644 --- a/.github/workflows/ci_test-conda.yml +++ b/.github/workflows/ci_test-conda.yml @@ -31,19 +31,27 @@ jobs: timeout-minutes: 30 steps: - name: Workaround for https://github.com/actions/checkout/issues/760 - run: git config --global --add safe.directory /__w/pytorch-lightning/pytorch-lightning + run: git config --global --add safe.directory /__w/lightning/lightning - uses: actions/checkout@v2 - - name: Update dependencies + - name: Update base dependencies + run: | + conda info + conda list + pip install -r requirements/test.txt + + - name: DocTests + run: | + coverage run --source pytorch_lightning -m pytest pytorch_lightning + + - name: Update all dependencies env: HOROVOD_BUILD_ARCH_FLAGS: "-mfma" HOROVOD_WITHOUT_MXNET: 1 HOROVOD_WITHOUT_TENSORFLOW: 1 run: | set -e - conda info - conda list # adjust versions according installed Torch version python ./requirements/adjust-versions.py requirements/extra.txt python ./requirements/adjust-versions.py requirements/examples.txt @@ -55,17 +63,12 @@ jobs: # sanity check python requirements/check-avail-extras.py - - name: Pull checkpoints from S3 - working-directory: ./legacy - run: | - # enter legacy and update checkpoints from S3 - curl https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip --output checkpoints.zip - unzip -o checkpoints.zip - ls -l checkpoints/ + - name: Pull legacy checkpoints + run: bash .actions/pull_legacy_checkpoints.sh - - name: Tests + - name: UnitTests run: | - coverage run --source pytorch_lightning -m pytest --timeout 150 pytorch_lightning tests -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-torch${{ matrix.pytorch-version }}.xml + coverage run --source pytorch_lightning -m pytest --timeout 150 tests -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-torch${{ matrix.pytorch-version }}.xml - name: Upload pytest results uses: actions/upload-artifact@v2 diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml index f260c67069912..bca5699d43029 100644 --- a/.github/workflows/ci_test-full.yml +++ b/.github/workflows/ci_test-full.yml @@ -76,27 +76,21 @@ jobs: restore-keys: | ${{ runner.os }}-pip-td${{ env.TIME_PERIOD }}-py${{ matrix.python-version }}-${{ matrix.release }}-${{ matrix.requires }}- - - name: Pull checkpoints from S3 - working-directory: ./legacy - run: | - # wget is simpler but does not work on Windows - python -c "from urllib.request import urlretrieve ; urlretrieve('https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip', 'checkpoints.zip')" - ls -l . - unzip -o checkpoints.zip - ls -l checkpoints/ + - name: Pull legacy checkpoints + run: bash .actions/pull_legacy_checkpoints.sh - name: Install dependencies run: | flag=$(python -c "print('--pre' if '${{matrix.release}}' == 'pre' else '')" 2>&1) url=$(python -c "print('test/cpu/torch_test.html' if '${{matrix.release}}' == 'pre' else 'cpu/torch_stable.html')" 2>&1) pip install -r requirements.txt --upgrade $flag --find-links "https://download.pytorch.org/whl/${url}" - # adjust versions according installed Torch version - python ./requirements/adjust-versions.py requirements/examples.txt - pip install -r requirements/examples.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade pip install -r requirements/test.txt --upgrade pip list shell: bash + - name: DocTests + run: coverage run --source pytorch_lightning -m pytest pytorch_lightning + - name: Install extra dependencies run: | # adjust versions according installed Torch version @@ -132,13 +126,16 @@ jobs: run: | python requirements/check-avail-extras.py - - name: Tests + - name: UnitTests run: | # NOTE: do not include coverage report here, see: https://github.com/nedbat/coveragepy/issues/1003 coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml - name: Examples run: | + # adjust versions according installed Torch version + python ./requirements/adjust-versions.py requirements/examples.txt + pip install -r requirements/examples.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade python -m pytest pl_examples -v --durations=10 - name: Upload pytest results diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml index 91a666dd2f6dd..4496e176d9720 100644 --- a/.github/workflows/events-nightly.yml +++ b/.github/workflows/events-nightly.yml @@ -146,7 +146,7 @@ jobs: file: dockers/base-cuda/Dockerfile push: ${{ env.PUSH_TO_HUB }} tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} - timeout-minutes: 85 + timeout-minutes: 95 # report failure to Slack - name: Slack notification @@ -197,7 +197,7 @@ jobs: file: dockers/base-conda/Dockerfile push: ${{ env.PUSH_TO_HUB }} tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} - timeout-minutes: 85 + timeout-minutes: 95 # report failure to Slack - name: Slack notification diff --git a/CHANGELOG.md b/CHANGELOG.md index bdcd6e07da750..455fa67add3f1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). +## [1.6.5] - 2022-07-12 + +### Fixed + +- Fixed `estimated_stepping_batches` requiring distributed comms in `configure_optimizers` for the `DeepSpeedStrategy` ([#13350](https://github.com/PyTorchLightning/pytorch-lightning/pull/13350)) +- Fixed bug with Python version check that prevented use with development versions of Python ([#13420](https://github.com/PyTorchLightning/pytorch-lightning/pull/13420)) +- The loops now call `.set_epoch()` also on batch samplers if the dataloader has one wrapped in a distributed sampler ([#13396](https://github.com/PyTorchLightning/pytorch-lightning/pull/13396)) +- Fixed the restoration of log step during restart ([#13467](https://github.com/PyTorchLightning/pytorch-lightning/pull/13467)) + + ## [1.6.4] - 2022-06-01 ### Added diff --git a/_notebooks b/_notebooks index 290fb466de1fc..8a36a41548f34 160000 --- a/_notebooks +++ b/_notebooks @@ -1 +1 @@ -Subproject commit 290fb466de1fcc2ac6025f74b56906592911e856 +Subproject commit 8a36a41548f34c44ac455d515a72994487e85813 diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile index 05227707b31fa..72603b04ffd64 100644 --- a/dockers/base-conda/Dockerfile +++ b/dockers/base-conda/Dockerfile @@ -72,16 +72,15 @@ COPY environment.yml environment.yml # conda init RUN conda update -n base -c defaults conda && \ - conda install mamba -n base -c conda-forge && \ - mamba create -y --name $CONDA_ENV python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision torchtext cudatoolkit=${CUDA_VERSION} -c nvidia -c pytorch -c pytorch-test -c pytorch-nightly && \ + conda create -y --name $CONDA_ENV python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision torchtext cudatoolkit=${CUDA_VERSION} -c nvidia -c pytorch -c pytorch-test -c pytorch-nightly && \ conda init bash && \ # NOTE: this requires that the channel is presented in the yaml before packages \ printf "import re;\nfname = 'environment.yml';\nreq = open(fname).read();\nfor n in ['python', 'pytorch', 'torchtext', 'torchvision']:\n req = re.sub(rf'- {n}[>=]+', f'# - {n}=', req);\nopen(fname, 'w').write(req)" > prune.py && \ python prune.py && \ rm prune.py && \ cat environment.yml && \ - mamba env update --name $CONDA_ENV --file environment.yml && \ - mamba clean -ya && \ + conda env update --name $CONDA_ENV --file environment.yml && \ + conda clean -ya && \ rm environment.yml ENV \ diff --git a/dockers/nvidia/Dockerfile b/dockers/nvidia/Dockerfile index 792835c574ada..6848f6cf34eae 100644 --- a/dockers/nvidia/Dockerfile +++ b/dockers/nvidia/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG PYTORCH_VERSION=21.11 +ARG PYTORCH_VERSION=22.04 # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes FROM nvcr.io/nvidia/pytorch:${PYTORCH_VERSION}-py3 @@ -37,20 +37,19 @@ RUN \ cd .. ; \ fi && \ # save the examples - mv pytorch-lightning/_notebooks notebooks && \ + mv pytorch-lightning/_notebooks/.notebooks/ notebooks && \ mv pytorch-lightning/pl_examples . && \ # Installations \ pip install -q fire && \ - python ./pytorch-lightning/.actions/assistant.py requirements_prune_pkgs horovod --req_files ./pytorch-lightning/requirements/extra.txt && \ - pip install "Pillow>=8.2, !=8.3.0" "cryptography>=3.4" "py>=1.10" --no-cache-dir --upgrade-strategy only-if-needed && \ - pip install -r ./pytorch-lightning/requirements/extra.txt --no-cache-dir --upgrade-strategy only-if-needed && \ - pip install -r ./pytorch-lightning/requirements/examples.txt --no-cache-dir --upgrade-strategy only-if-needed && \ - pip install ./pytorch-lightning --no-cache-dir && \ + pip install "Pillow>=8.2, !=8.3.0" "cryptography>=3.4" "py>=1.10" --no-cache-dir && \ + pip install ./pytorch-lightning["extra","loggers","strategies"] --no-cache-dir && \ + pip install -r ./pytorch-lightning/requirements/examples.txt --no-cache-dir && \ rm -rf pytorch-lightning && \ - pip install jupyterlab[all] -U && \ pip list +RUN pip install jupyterlab[all] -U + RUN pip install lightning-grid -U && \ pip install "py>=1.10" "protobuf>=3.15.6" --upgrade-strategy only-if-needed diff --git a/dockers/release/Dockerfile b/dockers/release/Dockerfile index f4083f2dd42fc..a0ba3a4a41c37 100644 --- a/dockers/release/Dockerfile +++ b/dockers/release/Dockerfile @@ -36,7 +36,7 @@ RUN \ mv pytorch-lightning-*/ pytorch-lightning ; \ rm *.zip ; \ fi && \ - pip install ./pytorch-lightning["extra"] --no-cache-dir && \ + pip install ./pytorch-lightning["extra","loggers","strategies"] --no-cache-dir && \ rm -rf pytorch-lightning RUN python --version && \ diff --git a/dockers/tpu-tests/Dockerfile b/dockers/tpu-tests/Dockerfile index 602ef1684b859..d4c58c665e7a5 100644 --- a/dockers/tpu-tests/Dockerfile +++ b/dockers/tpu-tests/Dockerfile @@ -22,10 +22,7 @@ LABEL maintainer="PyTorchLightning " COPY ./ ./pytorch-lightning/ # Pull the legacy checkpoints -RUN cd pytorch-lightning && \ - wget https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip -P legacy/ && \ - unzip -o legacy/checkpoints.zip -d legacy/ && \ - ls -l legacy/checkpoints/ +RUN cd pytorch-lightning && bash .actions/pull_legacy_checkpoints.sh RUN \ pip install -q fire && \ diff --git a/legacy/README.md b/legacy/README.md index efbd18f7eede6..68eb718a98b07 100644 --- a/legacy/README.md +++ b/legacy/README.md @@ -7,8 +7,7 @@ At this moment we focus on ability running old checkpoints, so the flow here is If you want to pull all saved version-checkpoints for local testing/development, call ```bash -wget https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip -unzip -o checkpoints.zip +bash .actions/pull_legacy_checkpoints.sh ``` To back populate collection with past version you can use following bash: diff --git a/pl_examples/basic_examples/mnist_datamodule.py b/pl_examples/basic_examples/mnist_datamodule.py index 31a2e284dd8ba..02d7ff8fd7a37 100644 --- a/pl_examples/basic_examples/mnist_datamodule.py +++ b/pl_examples/basic_examples/mnist_datamodule.py @@ -36,7 +36,7 @@ class _MNIST(Dataset): """Carbon copy of ``tests.helpers.datasets.MNIST``. We cannot import the tests as they are not distributed with the package. - See https://github.com/PyTorchLightning/pytorch-lightning/pull/7614#discussion_r671183652 for more context. + See https://github.com/Lightning-AI/lightning/pull/7614#discussion_r671183652 for more context. """ RESOURCES = ( diff --git a/pl_examples/domain_templates/reinforce_learn_ppo.py b/pl_examples/domain_templates/reinforce_learn_ppo.py index a400de062bf6f..baae200f9e2bc 100644 --- a/pl_examples/domain_templates/reinforce_learn_ppo.py +++ b/pl_examples/domain_templates/reinforce_learn_ppo.py @@ -125,7 +125,7 @@ def get_log_prob(self, pi: Normal, actions: torch.Tensor): class ExperienceSourceDataset(IterableDataset): - """Implementation from PyTorch Lightning Bolts: https://github.com/PyTorchLightning/lightning- + """Implementation from PyTorch Lightning Bolts: https://github.com/Lightning-AI/lightning- bolts/blob/master/pl_bolts/datamodules/experience_source.py. Basic experience source dataset. Takes a generate_batch function that returns an iterator. The logic for the diff --git a/pytorch_lightning/__about__.py b/pytorch_lightning/__about__.py index fdaa499392c7e..ec1d1701bebd2 100644 --- a/pytorch_lightning/__about__.py +++ b/pytorch_lightning/__about__.py @@ -1,7 +1,7 @@ import time _this_year = time.strftime("%Y") -__version__ = "1.6.4" +__version__ = "1.6.5" __author__ = "William Falcon et al." __author_email__ = "waf2107@columbia.edu" __license__ = "Apache-2.0" diff --git a/pytorch_lightning/callbacks/progress/rich_progress.py b/pytorch_lightning/callbacks/progress/rich_progress.py index fb5914a7a5d41..9dcbb1f4522fd 100644 --- a/pytorch_lightning/callbacks/progress/rich_progress.py +++ b/pytorch_lightning/callbacks/progress/rich_progress.py @@ -12,13 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. import math +import operator from dataclasses import dataclass from datetime import timedelta from typing import Any, Dict, Optional, Union import pytorch_lightning as pl from pytorch_lightning.callbacks.progress.base import ProgressBarBase -from pytorch_lightning.utilities.imports import _RICH_AVAILABLE +from pytorch_lightning.utilities.imports import _compare_version, _package_available + +_RICH_AVAILABLE: bool = _package_available("rich") and _compare_version("rich", operator.ge, "10.2.2") Task, Style = None, None if _RICH_AVAILABLE: diff --git a/pytorch_lightning/callbacks/rich_model_summary.py b/pytorch_lightning/callbacks/rich_model_summary.py index 148de6275950e..f2833bb33ff8c 100644 --- a/pytorch_lightning/callbacks/rich_model_summary.py +++ b/pytorch_lightning/callbacks/rich_model_summary.py @@ -14,7 +14,7 @@ from typing import List, Tuple from pytorch_lightning.callbacks import ModelSummary -from pytorch_lightning.utilities.imports import _RICH_AVAILABLE +from pytorch_lightning.callbacks.progress.rich_progress import _RICH_AVAILABLE from pytorch_lightning.utilities.model_summary import get_human_readable_count if _RICH_AVAILABLE: diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index bf86471fe92ae..849d6715ef0eb 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -46,7 +46,7 @@ from pytorch_lightning.utilities.cloud_io import get_filesystem from pytorch_lightning.utilities.distributed import distributed_available, sync_ddp from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12 +from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_11, _TORCH_GREATER_EQUAL_1_12 from pytorch_lightning.utilities.memory import get_model_size_mb from pytorch_lightning.utilities.model_summary import ModelSummary, summarize from pytorch_lightning.utilities.parsing import collect_init_args @@ -2064,7 +2064,10 @@ def _register_sharded_tensor_state_dict_hooks_if_available(self) -> None: rank_zero_debug("Could not register sharded tensor state dict hooks") return - from torch.distributed._sharded_tensor import pre_load_state_dict_hook, state_dict_hook + if _TORCH_GREATER_EQUAL_1_11: + from torch.distributed._shard.sharded_tensor import pre_load_state_dict_hook, state_dict_hook + else: + from torch.distributed._sharded_tensor import pre_load_state_dict_hook, state_dict_hook self._register_state_dict_hook(state_dict_hook) diff --git a/pytorch_lightning/loggers/neptune.py b/pytorch_lightning/loggers/neptune.py index 05952f6bc5747..e3cae87d19d18 100644 --- a/pytorch_lightning/loggers/neptune.py +++ b/pytorch_lightning/loggers/neptune.py @@ -54,7 +54,7 @@ log = logging.getLogger(__name__) -_INTEGRATION_VERSION_KEY = "source_code/integrations/pytorch-lightning" +_INTEGRATION_VERSION_KEY = "source_code/integrations/lightning" # kwargs used in previous NeptuneLogger version, now deprecated _LEGACY_NEPTUNE_INIT_KWARGS = [ @@ -113,7 +113,7 @@ class NeptuneLogger(LightningLoggerBase): neptune_logger = NeptuneLogger( api_key="ANONYMOUS", # replace with your own - project="common/pytorch-lightning-integration", # format "" + project="common/lightning-integration", # format "" tags=["training", "resnet"], # optional ) trainer = Trainer(max_epochs=10, logger=neptune_logger) @@ -157,7 +157,7 @@ def any_lightning_module_function_or_hook(self): .. code-block:: python - neptune_logger = NeptuneLogger(project="common/pytorch-lightning-integration") + neptune_logger = NeptuneLogger(project="common/lightning-integration") trainer = pl.Trainer(logger=neptune_logger) model = ... @@ -182,7 +182,7 @@ def any_lightning_module_function_or_hook(self): .. code-block:: python - neptune_logger = NeptuneLogger(project="common/pytorch-lightning-integration", log_model_checkpoints=False) + neptune_logger = NeptuneLogger(project="common/lightning-integration", log_model_checkpoints=False) **Pass additional parameters to the Neptune run** @@ -194,7 +194,7 @@ def any_lightning_module_function_or_hook(self): from pytorch_lightning.loggers import NeptuneLogger neptune_logger = NeptuneLogger( - project="common/pytorch-lightning-integration", + project="common/lightning-integration", name="lightning-run", description="mlp quick run with pytorch-lightning", tags=["mlp", "quick-run"], @@ -216,10 +216,10 @@ def any_lightning_module_function_or_hook(self): See Also: - Read about `what object you can log to Neptune `_. - - Check `example run `_ + - Check `example run `_ with multiple types of metadata logged. - For more detailed info check - `user guide `_. + `user guide `_. Args: api_key: Optional. @@ -350,7 +350,7 @@ def _verify_input_arguments( " - https://docs-legacy.neptune.ai/integrations/pytorch_lightning.html\n" "The NeptuneLogger was re-written to use the neptune.new Python API\n" " - https://neptune.ai/blog/neptune-new\n" - " - https://docs.neptune.ai/integrations-and-supported-tools/model-training/pytorch-lightning\n" + " - https://docs.neptune.ai/integrations-and-supported-tools/model-training/lightning\n" "You should use arguments accepted by either NeptuneLogger.init() or neptune.init()" ) @@ -377,7 +377,7 @@ def _verify_input_arguments( " - https://docs-legacy.neptune.ai/integrations/pytorch_lightning.html\n" "The NeptuneLogger was re-written to use the neptune.new Python API\n" " - https://neptune.ai/blog/neptune-new\n" - " - https://docs.neptune.ai/integrations-and-supported-tools/model-training/pytorch-lightning\n" + " - https://docs.neptune.ai/integrations-and-supported-tools/model-training/lightning\n" ) # check if user passed redundant neptune.init arguments when passed run @@ -477,7 +477,7 @@ def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: # neptune_logger = NeptuneLogger( api_key="ANONYMOUS", - project="common/pytorch-lightning-integration" + project="common/lightning-integration" ) neptune_logger.log_hyperparams(PARAMS) @@ -627,7 +627,7 @@ def _signal_deprecated_api_usage(f_name, sample_code, raise_exception=False): f" - https://docs-legacy.neptune.ai/integrations/pytorch_lightning.html\n" f"The NeptuneLogger was re-written to use the neptune.new Python API\n" f" - https://neptune.ai/blog/neptune-new\n" - f" - https://docs.neptune.ai/integrations-and-supported-tools/model-training/pytorch-lightning\n" + f" - https://docs.neptune.ai/integrations-and-supported-tools/model-training/lightning\n" f"Instead of `logger.{f_name}` you can use:\n" f"\t{sample_code}" ) diff --git a/pytorch_lightning/loops/dataloader/evaluation_loop.py b/pytorch_lightning/loops/dataloader/evaluation_loop.py index 2ec42fa0acbb5..46ea5ba631d10 100644 --- a/pytorch_lightning/loops/dataloader/evaluation_loop.py +++ b/pytorch_lightning/loops/dataloader/evaluation_loop.py @@ -24,8 +24,10 @@ import pytorch_lightning as pl from pytorch_lightning.accelerators import GPUAccelerator +from pytorch_lightning.callbacks.progress.rich_progress import _RICH_AVAILABLE from pytorch_lightning.loops.dataloader import DataLoaderLoop from pytorch_lightning.loops.epoch import EvaluationEpochLoop +from pytorch_lightning.loops.utilities import _set_sampler_epoch from pytorch_lightning.trainer.connectors.logger_connector.result import _OUT_DICT, _ResultCollection from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.apply_func import apply_to_collection @@ -36,7 +38,6 @@ DataLoaderIterDataFetcher, InterBatchParallelDataFetcher, ) -from pytorch_lightning.utilities.imports import _RICH_AVAILABLE from pytorch_lightning.utilities.rank_zero import rank_zero_warn from pytorch_lightning.utilities.signature_utils import is_param_in_hook_signature from pytorch_lightning.utilities.types import EPOCH_OUTPUT @@ -161,14 +162,8 @@ def advance(self, *args: Any, **kwargs: Any) -> None: self._has_run = True def on_advance_start(self, *args: Any, **kwargs: Any) -> None: - dataloader = self.current_dataloader - if ( - dataloader is not None - and getattr(dataloader, "sampler", None) - and callable(getattr(dataloader.sampler, "set_epoch", None)) - ): - # set seed for distributed sampler (enables shuffling for each epoch) - dataloader.sampler.set_epoch(self.trainer.fit_loop.epoch_progress.current.processed) + if self.current_dataloader is not None: + _set_sampler_epoch(self.current_dataloader, self.trainer.fit_loop.epoch_progress.current.processed) super().on_advance_start(*args, **kwargs) diff --git a/pytorch_lightning/loops/dataloader/prediction_loop.py b/pytorch_lightning/loops/dataloader/prediction_loop.py index a14a218ef67e9..36648b7f43e34 100644 --- a/pytorch_lightning/loops/dataloader/prediction_loop.py +++ b/pytorch_lightning/loops/dataloader/prediction_loop.py @@ -5,6 +5,7 @@ from pytorch_lightning.loops.dataloader.dataloader_loop import DataLoaderLoop from pytorch_lightning.loops.epoch.prediction_epoch_loop import PredictionEpochLoop +from pytorch_lightning.loops.utilities import _set_sampler_epoch from pytorch_lightning.strategies import DDPSpawnStrategy from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.types import _PREDICT_OUTPUT @@ -87,13 +88,8 @@ def advance(self, *args: Any, **kwargs: Any) -> None: """Predicts one entire dataloader.""" void(*args, **kwargs) dataloader = self.current_dataloader - if ( - dataloader is not None - and getattr(dataloader, "sampler", None) - and callable(getattr(dataloader.sampler, "set_epoch", None)) - ): - # set seed for distributed sampler (enables shuffling for each epoch) - dataloader.sampler.set_epoch(self.trainer.fit_loop.epoch_progress.current.processed) + if dataloader is not None: + _set_sampler_epoch(dataloader, self.trainer.fit_loop.epoch_progress.current.processed) dataloader = self.trainer.strategy.process_dataloader(dataloader) dataloader_iter = enumerate(dataloader) dl_max_batches = self.max_batches[self.current_dataloader_idx] diff --git a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py index 32fd893b759ee..9136cfc9d7f6a 100644 --- a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py @@ -195,7 +195,7 @@ def _reload_dataloader_state_dict(self, data_fetcher: AbstractDataFetcher) -> No if isinstance(dataloader, CombinedLoader): raise MisconfigurationException( "Reloading support hasn't been implemented for `CombinedLoader`. You can request it by opening an issue" - " in `https://github.com/PyTorchLightning/pytorch-lightning/issues`." + " in `https://github.com/Lightning-AI/lightning/issues`." ) assert isinstance(dataloader, DataLoader) _reload_dataloader_state_dict(dataloader, self._dataloader_state_dict) diff --git a/pytorch_lightning/loops/epoch/training_epoch_loop.py b/pytorch_lightning/loops/epoch/training_epoch_loop.py index b6887a4cf546c..395471861a289 100644 --- a/pytorch_lightning/loops/epoch/training_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -281,6 +281,7 @@ def teardown(self) -> None: def on_save_checkpoint(self) -> Dict: state_dict = super().on_save_checkpoint() + state_dict["_batches_that_stepped"] = self._batches_that_stepped if ( self.trainer is not None @@ -300,6 +301,7 @@ def on_save_checkpoint(self) -> Dict: def on_load_checkpoint(self, state_dict: Dict) -> None: # cache the dataloader state dict until the dataloader objects are available self._dataloader_state_dict = state_dict.get("dataloader_state_dict") + self._batches_that_stepped = state_dict.get("_batches_that_stepped", 0) def _run_validation(self) -> None: # reload dataloaders diff --git a/pytorch_lightning/loops/fit_loop.py b/pytorch_lightning/loops/fit_loop.py index ac33390a97cec..0771a4a71de9f 100644 --- a/pytorch_lightning/loops/fit_loop.py +++ b/pytorch_lightning/loops/fit_loop.py @@ -21,7 +21,7 @@ from pytorch_lightning.loops import Loop from pytorch_lightning.loops.epoch import TrainingEpochLoop from pytorch_lightning.loops.epoch.training_epoch_loop import _OUTPUTS_TYPE as _EPOCH_OUTPUTS_TYPE -from pytorch_lightning.loops.utilities import _is_max_limit_reached +from pytorch_lightning.loops.utilities import _is_max_limit_reached, _set_sampler_epoch from pytorch_lightning.trainer.connectors.logger_connector.result import _ResultCollection from pytorch_lightning.trainer.progress import Progress from pytorch_lightning.trainer.supporters import TensorRunningAccum @@ -232,11 +232,8 @@ def on_advance_start(self) -> None: # type: ignore[override] # reset outputs here instead of in `reset` as they are not accumulated between epochs self._outputs = [] - if self.trainer.train_dataloader is not None and callable( - getattr(self.trainer.train_dataloader.sampler, "set_epoch", None) - ): - # set seed for distributed sampler (enables shuffling for each epoch) - self.trainer.train_dataloader.sampler.set_epoch(self.epoch_progress.current.processed) + if self.trainer.train_dataloader is not None: + _set_sampler_epoch(self.trainer.train_dataloader, self.epoch_progress.current.processed) # changing gradient according accumulation_scheduler self.trainer.accumulation_scheduler.on_train_epoch_start(self.trainer, self.trainer.lightning_module) diff --git a/pytorch_lightning/loops/utilities.py b/pytorch_lightning/loops/utilities.py index d84c195d758f9..15142be626587 100644 --- a/pytorch_lightning/loops/utilities.py +++ b/pytorch_lightning/loops/utilities.py @@ -21,6 +21,7 @@ import numpy as np import torch from torch.optim import Optimizer +from torch.utils.data import DataLoader import pytorch_lightning as pl from pytorch_lightning.loops import Loop @@ -228,3 +229,16 @@ def _reset_progress(loop: Loop) -> None: def _v1_8_output_format(fx: Callable) -> bool: parameters = inspect.signature(fx).parameters return "new_format" in parameters and parameters["new_format"].default is True + + +def _set_sampler_epoch(dataloader: DataLoader, epoch: int) -> None: + """Calls the ``set_epoch`` method on either the sampler or the batch sampler of the given dataloader. + + Every PyTorch dataloader has either a sampler or a batch sampler, and if it is wrapped by a + :class:`~torch.utils.data.distributed.DistributedSampler`, ``set_epoch`` must be called at the beginning + of every epoch to ensure shuffling applies a new ordering. This has no effect if shuffling is off. + """ + for sampler_name in ("sampler", "batch_sampler"): + sampler = getattr(dataloader, sampler_name, None) + if sampler is not None and callable(getattr(sampler, "set_epoch", None)): + sampler.set_epoch(epoch) diff --git a/pytorch_lightning/overrides/fairscale.py b/pytorch_lightning/overrides/fairscale.py index c33bed60902a1..9ab860774cf31 100644 --- a/pytorch_lightning/overrides/fairscale.py +++ b/pytorch_lightning/overrides/fairscale.py @@ -11,17 +11,23 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import operator + import torch.nn as nn import pytorch_lightning as pl from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, unwrap_lightning_module -from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE +from pytorch_lightning.utilities import _module_available +from pytorch_lightning.utilities.imports import _compare_version, _IS_WINDOWS + +_FAIRSCALE_AVAILABLE = not _IS_WINDOWS and _module_available("fairscale.nn") +_FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE = _FAIRSCALE_AVAILABLE and _compare_version("fairscale", operator.ge, "0.3.3") +_FAIRSCALE_FULLY_SHARDED_AVAILABLE = _FAIRSCALE_AVAILABLE and _compare_version("fairscale", operator.ge, "0.3.4") -LightningShardedDataParallel = None if _FAIRSCALE_AVAILABLE: from fairscale.nn.data_parallel.sharded_ddp import ShardedDataParallel - class LightningShardedDataParallel(_LightningModuleWrapperBase): # type: ignore[no-redef] + class LightningShardedDataParallel(_LightningModuleWrapperBase): # Just do this for later docstrings pass @@ -31,3 +37,7 @@ def unwrap_lightning_module_sharded(wrapped_model: nn.Module) -> "pl.LightningMo model = model.module return unwrap_lightning_module(model) + +else: + LightningShardedDataParallel = ... # type: ignore[assignment,misc] + unwrap_lightning_module_sharded = ... # type: ignore[assignment] diff --git a/pytorch_lightning/overrides/torch_distributed.py b/pytorch_lightning/overrides/torch_distributed.py index ec09b7723132d..26d8ca425bb4d 100644 --- a/pytorch_lightning/overrides/torch_distributed.py +++ b/pytorch_lightning/overrides/torch_distributed.py @@ -10,10 +10,10 @@ _pickler = pickle.Pickler _unpickler = pickle.Unpickler - +_TORCH_DIST_AVAILABLE = torch.distributed.is_available() logger = logging.getLogger(__name__) -if torch.distributed.is_available(): +if _TORCH_DIST_AVAILABLE: from torch._C._distributed_c10d import ProcessGroup from torch.distributed import Backend, broadcast, get_backend, get_rank, GroupMember @@ -157,7 +157,7 @@ def _broadcast_object_list(object_list, src=0, group=None, device=None): object_list[i] = _tensor_to_object(obj_view, obj_size) -if not torch.distributed.is_available(): +if not _TORCH_DIST_AVAILABLE: # avoid failures on early PyTorch versions for Windows where # not all functions used in `broadcast_object_list` are available. def _broadcast_noop(obj, *_, **__): diff --git a/pytorch_lightning/plugins/io/torch_plugin.py b/pytorch_lightning/plugins/io/torch_plugin.py index be10bf967ab05..8791249e7d90c 100644 --- a/pytorch_lightning/plugins/io/torch_plugin.py +++ b/pytorch_lightning/plugins/io/torch_plugin.py @@ -54,7 +54,7 @@ def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_optio atomic_save(checkpoint, path) except AttributeError as err: # todo (sean): is this try catch necessary still? - # https://github.com/PyTorchLightning/pytorch-lightning/pull/431 + # https://github.com/Lightning-AI/lightning/pull/431 key = pl.LightningModule.CHECKPOINT_HYPER_PARAMS_KEY checkpoint.pop(key, None) rank_zero_warn(f"Warning, `{key}` dropped from checkpoint. An attribute is not picklable: {err}") diff --git a/pytorch_lightning/plugins/precision/ipu.py b/pytorch_lightning/plugins/precision/ipu.py index 9df0edb53913b..a299be9a730a5 100644 --- a/pytorch_lightning/plugins/precision/ipu.py +++ b/pytorch_lightning/plugins/precision/ipu.py @@ -72,7 +72,7 @@ def optimizer_step( # we lack coverage here and IPUs are (currently) limited - something to explore if there's demand raise MisconfigurationException( "Skipping backward by returning `None` from your `training_step` is not implemented for IPUs." - " Please, open an issue in `https://github.com/PyTorchLightning/pytorch-lightning/issues`" + " Please, open an issue in `https://github.com/Lightning-AI/lightning/issues`" " requesting this feature." ) return closure_result diff --git a/pytorch_lightning/plugins/precision/sharded_native_amp.py b/pytorch_lightning/plugins/precision/sharded_native_amp.py index e40aea8ecf4eb..a1484559e5ce4 100644 --- a/pytorch_lightning/plugins/precision/sharded_native_amp.py +++ b/pytorch_lightning/plugins/precision/sharded_native_amp.py @@ -15,8 +15,8 @@ import torch +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin -from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException if _FAIRSCALE_AVAILABLE: diff --git a/pytorch_lightning/plugins/precision/tpu.py b/pytorch_lightning/plugins/precision/tpu.py index 1afd34264c60c..a0ed9de0a4239 100644 --- a/pytorch_lightning/plugins/precision/tpu.py +++ b/pytorch_lightning/plugins/precision/tpu.py @@ -46,7 +46,7 @@ def optimizer_step( # we lack coverage here so disable this - something to explore if there's demand raise MisconfigurationException( "Skipping backward by returning `None` from your `training_step` is not implemented for TPUs." - " Please, open an issue in `https://github.com/PyTorchLightning/pytorch-lightning/issues`" + " Please, open an issue in `https://github.com/Lightning-AI/lightning/issues`" " requesting this feature." ) return closure_result diff --git a/pytorch_lightning/setup_tools.py b/pytorch_lightning/setup_tools.py index 2df3c7946b4d9..82a8b85553232 100644 --- a/pytorch_lightning/setup_tools.py +++ b/pytorch_lightning/setup_tools.py @@ -59,7 +59,7 @@ def _load_readme_description(path_dir: str, homepage: str, version: str) -> str: # drop images from readme text = text.replace("![PT to PL](docs/source/_static/images/general/pl_quick_start_full_compressed.gif)", "") - # https://github.com/PyTorchLightning/pytorch-lightning/raw/master/docs/source/_static/images/lightning_module/pt_to_pl.png + # https://github.com/Lightning-AI/lightning/raw/master/docs/source/_static/images/lightning_module/pt_to_pl.png github_source_url = os.path.join(homepage, "raw", version) # replace relative repository path to absolute link to the release # do not replace all "docs" as in the readme we reger some other sources with particular path to docs @@ -81,7 +81,7 @@ def _load_readme_description(path_dir: str, homepage: str, version: str) -> str: # todo: wrap content as commented description text = re.sub(rf"{skip_begin}.+?{skip_end}", "", text, flags=re.IGNORECASE + re.DOTALL) - # # https://github.com/Borda/pytorch-lightning/releases/download/1.1.0a6/codecov_badge.png + # # https://github.com/Borda/lightning/releases/download/1.1.0a6/codecov_badge.png # github_release_url = os.path.join(homepage, "releases", "download", version) # # download badge and replace url with local file # text = _parse_for_badge(text, github_release_url) diff --git a/pytorch_lightning/strategies/bagua.py b/pytorch_lightning/strategies/bagua.py index 6b6598c790b36..9bf619aac8594 100644 --- a/pytorch_lightning/strategies/bagua.py +++ b/pytorch_lightning/strategies/bagua.py @@ -19,10 +19,12 @@ from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.distributed import ReduceOp from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _BAGUA_AVAILABLE +from pytorch_lightning.utilities.imports import _package_available from pytorch_lightning.utilities.optimizer import optimizers_to_device from pytorch_lightning.utilities.seed import reset_seed +_BAGUA_AVAILABLE = _package_available("bagua") + if _BAGUA_AVAILABLE: import bagua.torch_api as bagua from bagua.torch_api.algorithms import Algorithm diff --git a/pytorch_lightning/strategies/ddp.py b/pytorch_lightning/strategies/ddp.py index 453c821b3f59a..16caaffc0819f 100644 --- a/pytorch_lightning/strategies/ddp.py +++ b/pytorch_lightning/strategies/ddp.py @@ -30,6 +30,7 @@ from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.overrides.distributed import prepare_for_backward +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin @@ -50,7 +51,6 @@ ) from pytorch_lightning.utilities.exceptions import DeadlockDetectedException from pytorch_lightning.utilities.imports import ( - _FAIRSCALE_AVAILABLE, _IS_WINDOWS, _TORCH_GREATER_EQUAL_1_9, _TORCH_GREATER_EQUAL_1_10, diff --git a/pytorch_lightning/strategies/deepspeed.py b/pytorch_lightning/strategies/deepspeed.py index e3b470cd90363..1ccda976d810c 100644 --- a/pytorch_lightning/strategies/deepspeed.py +++ b/pytorch_lightning/strategies/deepspeed.py @@ -356,6 +356,8 @@ def setup_distributed(self): def setup(self, trainer: "pl.Trainer") -> None: self.accelerator.setup(trainer) + # we set the device so that optimizers can be created with distributed comms. + self.lightning_module._device = self.root_device self.setup_optimizers(trainer) self.setup_precision_plugin() optimizers_to_device(self.optimizers, self.root_device) diff --git a/pytorch_lightning/strategies/fully_sharded.py b/pytorch_lightning/strategies/fully_sharded.py index 971441160d333..7dea5a3f0d9ad 100644 --- a/pytorch_lightning/strategies/fully_sharded.py +++ b/pytorch_lightning/strategies/fully_sharded.py @@ -18,12 +18,12 @@ import torch import pytorch_lightning as pl +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_FULLY_SHARDED_AVAILABLE from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities import _FAIRSCALE_FULLY_SHARDED_AVAILABLE from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.optimizer import optimizers_to_device diff --git a/pytorch_lightning/strategies/launchers/spawn.py b/pytorch_lightning/strategies/launchers/spawn.py index d67f9e620a45d..fe2c7763f82c9 100644 --- a/pytorch_lightning/strategies/launchers/spawn.py +++ b/pytorch_lightning/strategies/launchers/spawn.py @@ -53,7 +53,7 @@ def __init__(self, strategy: Strategy) -> None: def is_interactive_compatible(self) -> bool: # The start method 'spawn' is currently the only one that works with DDP and CUDA support # The start method 'fork' is the only one supported in Jupyter environments but not compatible with CUDA - # For more context, see https://github.com/PyTorchLightning/pytorch-lightning/issues/7550 + # For more context, see https://github.com/Lightning-AI/lightning/issues/7550 return self._start_method == "fork" and self._strategy.root_device.type != "cuda" def launch(self, function: Callable, *args: Any, trainer: Optional["pl.Trainer"] = None, **kwargs: Any) -> Any: diff --git a/pytorch_lightning/strategies/sharded.py b/pytorch_lightning/strategies/sharded.py index 8a76520755345..06fc8c0f34a42 100644 --- a/pytorch_lightning/strategies/sharded.py +++ b/pytorch_lightning/strategies/sharded.py @@ -20,11 +20,11 @@ import pytorch_lightning as pl from pytorch_lightning.core.optimizer import LightningOptimizer +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE, _FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _FAIRSCALE_AVAILABLE, _FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE from pytorch_lightning.utilities.optimizer import optimizers_to_device from pytorch_lightning.utilities.rank_zero import rank_zero_only diff --git a/pytorch_lightning/strategies/sharded_spawn.py b/pytorch_lightning/strategies/sharded_spawn.py index 58ad47f464bfc..40b5f359fe9c5 100644 --- a/pytorch_lightning/strategies/sharded_spawn.py +++ b/pytorch_lightning/strategies/sharded_spawn.py @@ -19,10 +19,10 @@ from torch.optim import Optimizer import pytorch_lightning as pl +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.strategies.ddp_spawn import DDPSpawnStrategy from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _FAIRSCALE_AVAILABLE from pytorch_lightning.utilities.optimizer import optimizers_to_device from pytorch_lightning.utilities.rank_zero import rank_zero_only diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index d242236e8317e..e795358ef3420 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -221,7 +221,7 @@ def _init_deterministic(self, deterministic: Optional[bool]) -> None: torch.use_deterministic_algorithms(self.deterministic) if self.deterministic: # fixing non-deterministic part of horovod - # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383 + # https://github.com/Lightning-AI/lightning/pull/1572/files#r420279383 os.environ["HOROVOD_FUSION_THRESHOLD"] = "0" # https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility @@ -605,7 +605,7 @@ def _check_strategy_and_fallback(self) -> None: if _TPU_AVAILABLE: raise MisconfigurationException( "`accelerator='ddp_cpu'` is not supported on TPU machines. " - "Learn more: https://github.com/PyTorchLightning/pytorch-lightning/issues/7810" + "Learn more: https://github.com/Lightning-AI/lightning/issues/7810" ) if self._devices_flag == 1 and self._num_nodes_flag > 1: strategy_flag = DDPStrategy.strategy_name @@ -725,7 +725,7 @@ def _validate_precision_choice(self) -> None: if self._precision_flag == 64: raise MisconfigurationException( "`Trainer(accelerator='tpu', precision=64)` is not implemented." - " Please, open an issue in `https://github.com/PyTorchLightning/pytorch-lightning/issues`" + " Please, open an issue in `https://github.com/Lightning-AI/lightning/issues`" " requesting this feature." ) if self._precision_plugin_flag and not isinstance( diff --git a/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py b/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py index dadecef006278..49ef52614e8f6 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py @@ -182,7 +182,7 @@ def check_logging(cls, fx_name: str) -> None: if fx_name not in cls.functions: raise RuntimeError( f"Logging inside `{fx_name}` is not implemented." - " Please, open an issue in `https://github.com/PyTorchLightning/pytorch-lightning/issues`." + " Please, open an issue in `https://github.com/Lightning-AI/lightning/issues`." ) if cls.functions[fx_name] is None: diff --git a/pytorch_lightning/trainer/connectors/logger_connector/result.py b/pytorch_lightning/trainer/connectors/logger_connector/result.py index f7f708fdd1fd6..744921c7c28f8 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -125,7 +125,7 @@ def __post_init__(self) -> None: def _parse_reduce_fx(self) -> None: error = ( "Only `self.log(..., reduce_fx={min,max,mean,sum})` are currently supported." - " Please, open an issue in `https://github.com/PyTorchLightning/pytorch-lightning/issues`." + " Please, open an issue in `https://github.com/Lightning-AI/lightning/issues`." f" Found: {self.reduce_fx}" ) if isinstance(self.reduce_fx, str): diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index b8f688892b318..6d3ec88b0be6a 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -438,9 +438,14 @@ class DataLoaderDict(dict): @property def sampler(self) -> Union[Iterable, Sequence, Mapping]: - """Return a collections of samplers extracting from loaders.""" + """Return a collections of samplers extracted from loaders.""" return apply_to_collection(self.loaders, (DataLoader, IterableDataset), getattr, "sampler", None) + @property + def batch_sampler(self) -> Union[Iterable, Sequence, Mapping]: + """Return a collections of batch samplers extracted from loaders.""" + return apply_to_collection(self.loaders, (DataLoader, IterableDataset), getattr, "batch_sampler", None) + def _wrap_loaders_max_size_cycle(self) -> Any: """Wraps all loaders to make sure they are cycled until the longest loader is exhausted. diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py index 87947ac9a10f3..b5b70e7220a8d 100644 --- a/pytorch_lightning/utilities/__init__.py +++ b/pytorch_lightning/utilities/__init__.py @@ -29,11 +29,7 @@ from pytorch_lightning.utilities.grads import grad_norm # noqa: F401 from pytorch_lightning.utilities.imports import ( # noqa: F401 _APEX_AVAILABLE, - _BAGUA_AVAILABLE, _DEEPSPEED_AVAILABLE, - _FAIRSCALE_AVAILABLE, - _FAIRSCALE_FULLY_SHARDED_AVAILABLE, - _FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE, _GROUP_AVAILABLE, _HOROVOD_AVAILABLE, _HPU_AVAILABLE, @@ -46,7 +42,6 @@ _module_available, _OMEGACONF_AVAILABLE, _POPTORCH_AVAILABLE, - _RICH_AVAILABLE, _TORCH_GREATER_EQUAL_1_9, _TORCH_GREATER_EQUAL_1_10, _TORCH_GREATER_EQUAL_1_11, diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index 3647dbedd11ee..b9884b29cb999 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -87,7 +87,7 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: _IS_WINDOWS = platform.system() == "Windows" _IS_INTERACTIVE = hasattr(sys, "ps1") # https://stackoverflow.com/a/64523765 -_PYTHON_GREATER_EQUAL_3_8_0 = Version(platform.python_version()) >= Version("3.8.0") +_PYTHON_GREATER_EQUAL_3_8_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 8) _TORCH_GREATER_EQUAL_1_8_1 = _compare_version("torch", operator.ge, "1.8.1") _TORCH_GREATER_EQUAL_1_9 = _compare_version("torch", operator.ge, "1.9.0") _TORCH_GREATER_EQUAL_1_9_1 = _compare_version("torch", operator.ge, "1.9.1") @@ -97,14 +97,10 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: _TORCH_GREATER_EQUAL_1_12 = _compare_version("torch", operator.ge, "1.12.0", use_base_version=True) _APEX_AVAILABLE = _module_available("apex.amp") -_BAGUA_AVAILABLE = _package_available("bagua") _DEEPSPEED_AVAILABLE = _package_available("deepspeed") _DEEPSPEED_GREATER_EQUAL_0_5_9 = _DEEPSPEED_AVAILABLE and _compare_version("deepspeed", operator.ge, "0.5.9") _DEEPSPEED_GREATER_EQUAL_0_6 = _DEEPSPEED_AVAILABLE and _compare_version("deepspeed", operator.ge, "0.6.0") _DOCSTRING_PARSER_AVAILABLE = _package_available("docstring_parser") -_FAIRSCALE_AVAILABLE = not _IS_WINDOWS and _module_available("fairscale.nn") -_FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE = _FAIRSCALE_AVAILABLE and _compare_version("fairscale", operator.ge, "0.3.3") -_FAIRSCALE_FULLY_SHARDED_AVAILABLE = _FAIRSCALE_AVAILABLE and _compare_version("fairscale", operator.ge, "0.3.4") _GROUP_AVAILABLE = not _IS_WINDOWS and _module_available("torch.distributed.group") _HOROVOD_AVAILABLE = _module_available("horovod.torch") _HYDRA_AVAILABLE = _package_available("hydra") @@ -116,7 +112,7 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: _OMEGACONF_AVAILABLE = _package_available("omegaconf") _POPTORCH_AVAILABLE = _package_available("poptorch") _HABANA_FRAMEWORK_AVAILABLE = _package_available("habana_frameworks") -_RICH_AVAILABLE = _package_available("rich") and _compare_version("rich", operator.ge, "10.2.2") + _TORCH_QUANTIZE_AVAILABLE = bool([eg for eg in torch.backends.quantized.supported_engines if eg != "none"]) _TORCHTEXT_AVAILABLE = _package_available("torchtext") _TORCHTEXT_LEGACY: bool = _TORCHTEXT_AVAILABLE and _compare_version("torchtext", operator.lt, "0.11.0") diff --git a/pytorch_lightning/utilities/meta.py b/pytorch_lightning/utilities/meta.py index a5edcfb300188..ef2327d81119e 100644 --- a/pytorch_lightning/utilities/meta.py +++ b/pytorch_lightning/utilities/meta.py @@ -13,6 +13,7 @@ # limitations under the License. import importlib import inspect +import operator import threading from contextlib import contextmanager from functools import partial @@ -27,9 +28,12 @@ import pytorch_lightning as pl from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_10 +from pytorch_lightning.utilities.imports import _compare_version from pytorch_lightning.utilities.rank_zero import rank_zero_warn +# this is needed for proper generating Meta package +_TORCH_GREATER_EQUAL_1_10 = _compare_version("torch", operator.ge, "1.10.0") + if _TORCH_GREATER_EQUAL_1_10: from torch._C import _DisableTorchDispatch # type: ignore[attr-defined] diff --git a/pytorch_lightning/utilities/migration.py b/pytorch_lightning/utilities/migration.py index 30cc823210423..ed71f25a571f7 100644 --- a/pytorch_lightning/utilities/migration.py +++ b/pytorch_lightning/utilities/migration.py @@ -28,7 +28,7 @@ class pl_legacy_patch: unpickling old checkpoints. The following patches apply. 1. ``pytorch_lightning.utilities.argparse._gpus_arg_default``: Applies to all checkpoints saved prior to - version 1.2.8. See: https://github.com/PyTorchLightning/pytorch-lightning/pull/6898 + version 1.2.8. See: https://github.com/Lightning-AI/lightning/pull/6898 2. ``pytorch_lightning.utilities.argparse_utils``: A module that was deprecated in 1.2 and removed in 1.4, but still needs to be available for import for legacy checkpoints. diff --git a/pytorch_lightning/utilities/model_summary.py b/pytorch_lightning/utilities/model_summary.py index f0419e4b97077..9c5ff088da368 100644 --- a/pytorch_lightning/utilities/model_summary.py +++ b/pytorch_lightning/utilities/model_summary.py @@ -15,7 +15,7 @@ import contextlib import logging from collections import OrderedDict -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, cast, Dict, List, Optional, Tuple, Union import numpy as np import torch @@ -120,7 +120,9 @@ def layer_type(self) -> str: @property def num_parameters(self) -> int: """Returns the number of parameters in this module.""" - return sum(np.prod(p.shape) if not _is_lazy_weight_tensor(p) else 0 for p in self._module.parameters()) + return sum( + cast(int, np.prod(p.shape)) if not _is_lazy_weight_tensor(p) else 0 for p in self._module.parameters() + ) class ModelSummary: diff --git a/requirements/base.txt b/requirements/base.txt index 555997c6576e6..768da61c48fd6 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -4,7 +4,7 @@ tqdm>=4.57.0, <=4.63.0 PyYAML>=5.4, <=6.0 fsspec[http]>=2021.05.0, !=2021.06.0, <=2022.2.0 tensorboard>=2.2.0, <2.10.0 -torchmetrics>=0.4.1, <=0.7.2 +torchmetrics>=0.4.1, <0.9.2 pyDeprecate>=0.3.1, <=0.3.2 packaging>=17.0, <=21.3 typing-extensions>=4.0.0, <4.2.1 diff --git a/requirements/extra.txt b/requirements/extra.txt index cef58c6c21221..8162eed3f8518 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -6,4 +6,4 @@ omegaconf>=2.0.5, <=2.1.* hydra-core>=1.0.5, <=1.1.* jsonargparse[signatures]>=4.7.1, <4.7.4 gcsfs>=2021.5.0, <=2022.2.0 -rich>=10.2.2,!=10.15.*, <=12.0.0 +rich>=10.2.2, !=10.15.0.a, <13.0.0 diff --git a/setup.py b/setup.py index 57f59045ed256..2627ebdc6eca8 100755 --- a/setup.py +++ b/setup.py @@ -69,7 +69,7 @@ def _load_py_module(fname, pkg="pytorch_lightning"): author=about.__author__, author_email=about.__author_email__, url=about.__homepage__, - download_url="https://github.com/PyTorchLightning/pytorch-lightning", + download_url="https://github.com/Lightning-AI/lightning", license=about.__license__, packages=find_packages(exclude=["tests*", "pl_examples*", "legacy*"]), include_package_data=True, @@ -82,9 +82,9 @@ def _load_py_module(fname, pkg="pytorch_lightning"): install_requires=setup_tools._load_requirements(_PATH_REQUIRE), extras_require=extras, project_urls={ - "Bug Tracker": "https://github.com/PyTorchLightning/pytorch-lightning/issues", - "Documentation": "https://pytorch-lightning.rtfd.io/en/latest/", - "Source Code": "https://github.com/PyTorchLightning/pytorch-lightning", + "Bug Tracker": "https://github.com/Lightning-AI/lightning/issues", + "Documentation": "https://lightning.rtfd.io/en/latest/", + "Source Code": "https://github.com/Lightning-AI/lightning", }, classifiers=[ "Environment :: Console", diff --git a/tests/README.md b/tests/README.md index 105aed20004ef..278dd9fe45ea0 100644 --- a/tests/README.md +++ b/tests/README.md @@ -22,8 +22,7 @@ pre-commit install Additionally, for testing backward compatibility with older versions of PyTorch Lightning, you also need to download all saved version-checkpoints from the public AWS storage. Run the following script to get all saved version-checkpoints: ```bash -wget https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip -P legacy/ -unzip -o legacy/checkpoints.zip -d legacy/ +bash .actions/pull_legacy_checkpoints.sh ``` Note: These checkpoints are generated to set baselines for maintaining backward compatibility with legacy versions of PyTorch Lightning. Details of checkpoints for back-compatibility can be found [here](https://github.com/PyTorchLightning/pytorch-lightning/blob/master/legacy/README.md). diff --git a/tests/core/test_lightning_module.py b/tests/core/test_lightning_module.py index 180e7c46fe7d4..4fe9d300c3596 100644 --- a/tests/core/test_lightning_module.py +++ b/tests/core/test_lightning_module.py @@ -299,7 +299,10 @@ def assert_device(device: torch.device) -> None: @RunIf(min_torch="1.10", skip_windows=True) def test_sharded_tensor_state_dict(single_process_pg): - from torch.distributed._sharded_tensor import empty as sharded_tensor_empty + if _TORCH_GREATER_EQUAL_1_11: + from torch.distributed._shard.sharded_tensor import empty as sharded_tensor_empty + else: + from torch.distributed._sharded_tensor import empty as sharded_tensor_empty from torch.distributed._sharding_spec import ChunkShardingSpec class BoringModelWithShardedTensor(BoringModel): diff --git a/tests/helpers/runif.py b/tests/helpers/runif.py index 5a2464f6fd6ba..ed1b8ee7bc044 100644 --- a/tests/helpers/runif.py +++ b/tests/helpers/runif.py @@ -20,17 +20,16 @@ from packaging.version import Version from pkg_resources import get_distribution +from pytorch_lightning.callbacks.progress.rich_progress import _RICH_AVAILABLE +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE, _FAIRSCALE_FULLY_SHARDED_AVAILABLE +from pytorch_lightning.strategies.bagua import _BAGUA_AVAILABLE from pytorch_lightning.utilities import ( _APEX_AVAILABLE, - _BAGUA_AVAILABLE, _DEEPSPEED_AVAILABLE, - _FAIRSCALE_AVAILABLE, - _FAIRSCALE_FULLY_SHARDED_AVAILABLE, _HOROVOD_AVAILABLE, _HPU_AVAILABLE, _IPU_AVAILABLE, _OMEGACONF_AVAILABLE, - _RICH_AVAILABLE, _TORCH_GREATER_EQUAL_1_10, _TORCH_QUANTIZE_AVAILABLE, _TPU_AVAILABLE, diff --git a/tests/loggers/test_neptune.py b/tests/loggers/test_neptune.py index 87e52159b61d6..de9329564292b 100644 --- a/tests/loggers/test_neptune.py +++ b/tests/loggers/test_neptune.py @@ -47,7 +47,7 @@ class Run: def __setitem__(self, key, value): # called once - assert key == "source_code/integrations/pytorch-lightning" + assert key == "source_code/integrations/lightning" assert value == __version__ def wait(self): @@ -89,7 +89,7 @@ def test_neptune_online(self, neptune): self.assertEqual(created_run_mock.__getitem__.call_count, 2) self.assertEqual(created_run_mock.__setitem__.call_count, 1) created_run_mock.__getitem__.assert_has_calls([call("sys/id"), call("sys/name")], any_order=True) - created_run_mock.__setitem__.assert_called_once_with("source_code/integrations/pytorch-lightning", __version__) + created_run_mock.__setitem__.assert_called_once_with("source_code/integrations/lightning", __version__) @patch("pytorch_lightning.loggers.neptune.Run", Run) def test_online_with_custom_run(self, neptune): diff --git a/tests/loops/test_evaluation_loop.py b/tests/loops/test_evaluation_loop.py index 137608c426ee0..bddf819aafdd6 100644 --- a/tests/loops/test_evaluation_loop.py +++ b/tests/loops/test_evaluation_loop.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. from unittest import mock -from unittest.mock import Mock +from unittest.mock import call, Mock import torch from torch.utils.data.dataloader import DataLoader -from torch.utils.data.sampler import RandomSampler +from torch.utils.data.sampler import BatchSampler, RandomSampler from pytorch_lightning import Trainer from pytorch_lightning.loops import EvaluationEpochLoop @@ -44,9 +44,8 @@ def test_on_evaluation_epoch_end(eval_epoch_end_mock, tmpdir): assert eval_epoch_end_mock.call_count == 4 -def test_set_epoch_called_eval_predict(tmpdir): - """Tests that set_epoch (if the sampler has one) is called on the DataLoader during evaluation and - prediction.""" +def test_evaluation_loop_sampler_set_epoch_called(tmpdir): + """Tests that set_epoch is called on the dataloader's sampler (if any) during training and validation.""" def _get_dataloader(): dataset = RandomDataset(32, 64) @@ -56,20 +55,60 @@ def _get_dataloader(): model = BoringModel() trainer = Trainer( - default_root_dir=tmpdir, limit_train_batches=2, limit_val_batches=2, max_epochs=2, enable_model_summary=False + default_root_dir=tmpdir, + limit_train_batches=1, + limit_val_batches=1, + max_epochs=2, + enable_model_summary=False, + enable_checkpointing=False, + logger=False, + ) + + train_dataloader = _get_dataloader() + val_dataloader = _get_dataloader() + trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader) + # One for each epoch + assert train_dataloader.sampler.set_epoch.call_args_list == [call(0), call(1)] + # One for each epoch + sanity check + assert val_dataloader.sampler.set_epoch.call_args_list == [call(0), call(0), call(1)] + + val_dataloader = _get_dataloader() + trainer.validate(model, val_dataloader) + assert val_dataloader.sampler.set_epoch.call_args_list == [call(2)] + + +def test_evaluation_loop_batch_sampler_set_epoch_called(tmpdir): + """Tests that set_epoch is called on the dataloader's batch sampler (if any) during training and validation.""" + + def _get_dataloader(): + dataset = RandomDataset(32, 64) + sampler = RandomSampler(dataset) + batch_sampler = BatchSampler(sampler, 2, True) + batch_sampler.set_epoch = Mock() + return DataLoader(dataset, batch_sampler=batch_sampler) + + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=1, + limit_val_batches=1, + max_epochs=2, + enable_model_summary=False, + enable_checkpointing=False, + logger=False, ) train_dataloader = _get_dataloader() val_dataloader = _get_dataloader() trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader) # One for each epoch - assert train_dataloader.sampler.set_epoch.call_count == 2 + assert train_dataloader.batch_sampler.set_epoch.call_args_list == [call(0), call(1)] # One for each epoch + sanity check - assert val_dataloader.sampler.set_epoch.call_count == 3 + assert val_dataloader.batch_sampler.set_epoch.call_args_list == [call(0), call(0), call(1)] val_dataloader = _get_dataloader() trainer.validate(model, val_dataloader) - assert val_dataloader.sampler.set_epoch.call_count == 1 + assert val_dataloader.batch_sampler.set_epoch.call_args_list == [call(2)] @mock.patch( diff --git a/tests/loops/test_loop_state_dict.py b/tests/loops/test_loop_state_dict.py index 1e67fcc0ed8db..f9630095502d1 100644 --- a/tests/loops/test_loop_state_dict.py +++ b/tests/loops/test_loop_state_dict.py @@ -47,7 +47,7 @@ def test_loops_state_dict_structure(): expected = { "fit_loop": { "state_dict": {}, - "epoch_loop.state_dict": {}, + "epoch_loop.state_dict": {"_batches_that_stepped": 0}, "epoch_loop.batch_progress": { "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, diff --git a/tests/loops/test_utilities.py b/tests/loops/test_utilities.py index c5d2e98d008b0..914c1de8e115b 100644 --- a/tests/loops/test_utilities.py +++ b/tests/loops/test_utilities.py @@ -11,10 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from unittest.mock import Mock + import pytest import torch -from pytorch_lightning.loops.utilities import _extract_hiddens, _v1_8_output_format +from pytorch_lightning.loops.utilities import _extract_hiddens, _set_sampler_epoch, _v1_8_output_format from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -61,3 +63,23 @@ def training_epoch_end(outputs, new_format=True): ... assert _v1_8_output_format(training_epoch_end) + + +def test_set_sampler_epoch(): + # No samplers + dataloader = Mock() + dataloader.sampler = None + dataloader.batch_sampler = None + _set_sampler_epoch(dataloader, 55) + + # set_epoch not callable + dataloader = Mock() + dataloader.sampler.set_epoch = None + dataloader.batch_sampler.set_epoch = None + _set_sampler_epoch(dataloader, 55) + + # set_epoch callable + dataloader = Mock() + _set_sampler_epoch(dataloader, 55) + dataloader.sampler.set_epoch.assert_called_once_with(55) + dataloader.batch_sampler.set_epoch.assert_called_once_with(55) diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py index 136e8ee516bbb..d0519de90e7a2 100644 --- a/tests/models/test_restore.py +++ b/tests/models/test_restore.py @@ -259,6 +259,7 @@ def on_train_start(self) -> None: trainer.fit(TestModel(), ckpt_path=ckpt_path) assert trainer.current_epoch == max_epochs assert trainer.global_step == max_epochs * train_batches + assert trainer.fit_loop.epoch_loop._batches_that_stepped == max_epochs * train_batches def test_fit_twice(tmpdir): diff --git a/tests/plugins/precision/test_sharded_precision.py b/tests/plugins/precision/test_sharded_precision.py index 754095912fb53..8fde1946459b2 100644 --- a/tests/plugins/precision/test_sharded_precision.py +++ b/tests/plugins/precision/test_sharded_precision.py @@ -15,8 +15,8 @@ import pytest import torch +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.plugins import ShardedNativeMixedPrecisionPlugin -from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE from tests.helpers.runif import RunIf ShardedGradScaler = None diff --git a/tests/strategies/test_ddp_fully_sharded_with_full_state_dict.py b/tests/strategies/test_ddp_fully_sharded_with_full_state_dict.py index 2912d59598220..0a26236acecdd 100644 --- a/tests/strategies/test_ddp_fully_sharded_with_full_state_dict.py +++ b/tests/strategies/test_ddp_fully_sharded_with_full_state_dict.py @@ -7,9 +7,9 @@ from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_FULLY_SHARDED_AVAILABLE from pytorch_lightning.plugins import FullyShardedNativeMixedPrecisionPlugin from pytorch_lightning.strategies import DDPFullyShardedStrategy -from pytorch_lightning.utilities import _FAIRSCALE_FULLY_SHARDED_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers.boring_model import BoringModel from tests.helpers.runif import RunIf diff --git a/tests/strategies/test_deepspeed_strategy.py b/tests/strategies/test_deepspeed_strategy.py index f3c99203d70eb..c829b203f3846 100644 --- a/tests/strategies/test_deepspeed_strategy.py +++ b/tests/strategies/test_deepspeed_strategy.py @@ -1199,3 +1199,26 @@ def training_step(self, *args, **kwargs): ckpt_path = os.path.join(trainer.checkpoint_callback.dirpath, filepath) expected = {"latest", "zero_to_fp32.py", "checkpoint"} assert expected == set(os.listdir(ckpt_path)) + + +@RunIf(min_cuda_gpus=2, deepspeed=True, standalone=True) +def test_deepspeed_configure_optimizer_device_set(tmpdir): + """Test to ensure that the LM has access to the device within the ``configure_optimizer`` function, and + estimated_stepping_batches works correctly as a result.""" + + class TestModel(BoringModel): + def configure_optimizers(self): + assert self.trainer.estimated_stepping_batches == 1 + assert self.device.type == "cuda" + raise SystemExit + + model = TestModel() + trainer = Trainer( + default_root_dir=tmpdir, + fast_dev_run=True, + accelerator="gpu", + devices=2, + strategy=DeepSpeedStrategy(), + ) + with pytest.raises(SystemExit): + trainer.fit(model) diff --git a/tests/strategies/test_sharded_strategy.py b/tests/strategies/test_sharded_strategy.py index 8a1313e5a6a45..dff7ca0a0d75d 100644 --- a/tests/strategies/test_sharded_strategy.py +++ b/tests/strategies/test_sharded_strategy.py @@ -6,9 +6,9 @@ import torch from pytorch_lightning import LightningModule, Trainer +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.strategies import DDPShardedStrategy, DDPSpawnShardedStrategy from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE from tests.helpers.boring_model import BoringModel from tests.helpers.runif import RunIf diff --git a/tests/trainer/logging_/test_eval_loop_logging.py b/tests/trainer/logging_/test_eval_loop_logging.py index 4457aba18e796..d41044240fa92 100644 --- a/tests/trainer/logging_/test_eval_loop_logging.py +++ b/tests/trainer/logging_/test_eval_loop_logging.py @@ -25,11 +25,12 @@ import torch from pytorch_lightning import callbacks, Trainer +from pytorch_lightning.callbacks.progress.rich_progress import _RICH_AVAILABLE from pytorch_lightning.loggers import TensorBoardLogger from pytorch_lightning.loops.dataloader import EvaluationLoop from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _PYTHON_GREATER_EQUAL_3_8_0, _RICH_AVAILABLE +from pytorch_lightning.utilities.imports import _PYTHON_GREATER_EQUAL_3_8_0 from tests.helpers import BoringModel, RandomDataset from tests.helpers.runif import RunIf diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index 108ea323ecd89..bbd2d61f3d03d 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -19,7 +19,7 @@ import torch from torch.utils.data import RandomSampler from torch.utils.data.dataloader import DataLoader -from torch.utils.data.dataset import Dataset, IterableDataset, Subset +from torch.utils.data.dataset import Dataset, IterableDataset from torch.utils.data.distributed import DistributedSampler from torch.utils.data.sampler import SequentialSampler @@ -855,53 +855,6 @@ def test_dataloader_distributed_sampler_already_attached(tmpdir): assert trainer.state.finished, "DDP Training failed" -@RunIf(min_gpus=3) -def test_batch_size_smaller_than_num_gpus(tmpdir): - # we need at least 3 gpus for this test - num_gpus = 3 - batch_size = 3 - - class CurrentTestModel(BoringModel): - def __init__(self, batch_size) -> None: - super().__init__() - self.save_hyperparameters() - # batch norm doesn't work with batch size 1, we replace it - self.c_d1_bn = torch.nn.ReLU() - - def training_step(self, *args, **kwargs): - output = super().training_step(*args, **kwargs) - loss = output["loss"] - # we make sure to add some metrics to the output dict, - # this is essential for this test - output["progress_bar"] = {"train_loss": loss} - return output - - def train_dataloader(self): - dataset = RandomDataset(32, 64) - # construct a dataset with a size that is not divisible by num_gpus - # therefore the last batch will have a size < num_gpus - size = num_gpus * self.hparams.batch_size + (num_gpus - 1) - dataset = Subset(dataset, range(size)) - dataloader = DataLoader(dataset, batch_size=self.hparams.batch_size, drop_last=False) - return dataloader - - model = CurrentTestModel(batch_size=batch_size) - - trainer = Trainer( - default_root_dir=tmpdir, - max_epochs=1, - limit_train_batches=0.1, - limit_val_batches=0, - accelerator="gpu", - devices=num_gpus, - ) - - # we expect the reduction for the metrics also to happen on the last batch - # where we will get fewer metrics than gpus - trainer.fit(model) - assert trainer.state.finished, f"Training failed with {trainer.state}" - - @pytest.mark.parametrize( ["multiple_trainloader_mode", "num_training_batches"], [("min_size", 16), ("max_size_cycle", 64)], diff --git a/tests/utilities/test_imports.py b/tests/utilities/test_imports.py index aa40f71da4982..629517d9c51f9 100644 --- a/tests/utilities/test_imports.py +++ b/tests/utilities/test_imports.py @@ -13,11 +13,11 @@ # limitations under the License. import operator +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE +from pytorch_lightning.strategies.bagua import _BAGUA_AVAILABLE from pytorch_lightning.utilities import ( _APEX_AVAILABLE, - _BAGUA_AVAILABLE, _DEEPSPEED_AVAILABLE, - _FAIRSCALE_AVAILABLE, _HOROVOD_AVAILABLE, _module_available, _OMEGACONF_AVAILABLE,