Merge branch 'PyTorchLightning:master' into bug/12768_datamodule_hpar…

…am_update
Lightning-AI · May 5, 2022 · 0cbc78c · 0cbc78c
2 parents fd65af8 + 1a502c0
commit 0cbc78c
Show file tree

Hide file tree

Showing 303 changed files with 15,899 additions and 9,564 deletions.
diff --git a/.actions/assistant.py b/.actions/assistant.py
@@ -10,7 +10,7 @@
     "requirements.txt",
     "requirements/extra.txt",
     "requirements/loggers.txt",
-    # "requirements/test.txt",
+    "requirements/strategies.txt",
     "requirements/examples.txt",
 )
 

diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml
@@ -19,7 +19,7 @@ pr:
 jobs:
   - job: pytest
     # how long to run the job before automatically cancelling
-    timeoutInMinutes: "45"
+    timeoutInMinutes: "55"
     # how much time to give 'run always even if cancelled tasks' before stopping them
     cancelTimeoutInMinutes: "2"
 
@@ -51,9 +51,7 @@ jobs:
       displayName: 'Image info & NVIDIA'
 
     - bash: |
-        python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
-        pip install fairscale>=0.4.5
-        pip install deepspeed>=0.6.0
+        python -c "fname = 'requirements/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
         CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
         pip install "bagua-cuda$CUDA_VERSION_MM>=0.9.0"
         pip install . --requirement requirements/devel.txt

diff --git a/.azure-pipelines/hpu-tests.yml b/.azure-pipelines/hpu-tests.yml
@@ -14,10 +14,10 @@ pr:
   - "release/*"
 
 jobs:
-  - job: hpu
+  - job: tests
 
     # how long to run the job before automatically cancelling
-    timeoutInMinutes: "5"
+    timeoutInMinutes: "10"
     # how much time to give 'run always even if cancelled tasks' before stopping them
     cancelTimeoutInMinutes: "2"
 
@@ -33,6 +33,7 @@ jobs:
       displayName: 'Instance HW info'
 
     - bash: |
+        pip install . --requirement requirements/extra.txt
         pip install . --requirement requirements/test.txt
       displayName: 'Install dependencies'
 

diff --git a/.azure-pipelines/ipu-tests.yml b/.azure-pipelines/ipu-tests.yml
@@ -16,8 +16,10 @@ variables:
   value: "poplar_sdk-ubuntu_20_04-2.3.1+793-89796d462d"
 
 jobs:
-  - job: ipu
+  - job: tests
 
+    # how long to run the job before automatically cancelling
+    timeoutInMinutes: "15"
     pool: graphcore-ipus
 
     workspace:
@@ -51,11 +53,9 @@ jobs:
 
     - bash: |
         export GIT_TERMINAL_PROMPT=1
-        python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'fairscale' not in line] ; open(fname, 'w').writelines(lines)"
-        python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
         python ./requirements/adjust-versions.py requirements/extra.txt
         python ./requirements/adjust-versions.py requirements/examples.txt
-        pip install . --requirement requirements/devel.txt
+        pip install . --requirement ./requirements/devel-base.txt
         pip list
       displayName: 'Install dependencies'
 

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
@@ -13,11 +13,13 @@
 *.yml           @borda @tchaton @carmocca
 
 # Docs
-/docs/                      @edenlightning @tchaton @borda @awaelchli
-/.github/*.md               @edenlightning @williamfalcon @borda
-/.github/ISSUE_TEMPLATE/    @edenlightning @borda @tchaton
-/docs/source/conf.py        @borda @awaelchli @carmocca
-/docs/source/index.rst      @williamfalcon
+/docs/                                  @edenlightning @tchaton @borda @awaelchli
+/.github/*.md                           @edenlightning @williamfalcon @borda
+/.github/ISSUE_TEMPLATE/                @edenlightning @borda @tchaton
+/docs/source/conf.py                    @borda @awaelchli @carmocca
+/docs/source/index.rst                  @williamfalcon
+/docs/source/levels                     @williamfalcon
+/docs/source/expertise_levels           @williamfalcon
 
 # Packages
 /pytorch_lightning/accelerators         @williamfalcon @tchaton @SeanNaren @awaelchli @justusschock @kaushikb11

diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml
@@ -73,11 +73,15 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python_version: ["3.7", "3.9"]
-        pytorch_version: ["1.8", "1.11"]
         include:
           # the config used in '.azure-pipelines/gpu-tests.yml'
-          - {python_version: "3.9", pytorch_version: "1.10"}
+          - {python_version: "3.7", pytorch_version: "1.10", cuda_version: "11.1"}
+          - {python_version: "3.7", pytorch_version: "1.11", cuda_version: "11.3.1"}
+          # latest (used in Tutorials)
+          - {python_version: "3.8", pytorch_version: "1.8", cuda_version: "11.1"}
+          - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1"}
+          - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.1"}
+          - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
     steps:
       - name: Checkout
         uses: actions/checkout@v2
@@ -88,6 +92,7 @@ jobs:
           build-args: |
             PYTHON_VERSION=${{ matrix.python_version }}
             PYTORCH_VERSION=${{ matrix.pytorch_version }}
+            CUDA_VERSION=${{ matrix.cuda_version }}
           file: dockers/base-cuda/Dockerfile
           push: false
         timeout-minutes: 75

diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml
@@ -98,10 +98,6 @@ jobs:
       shell: bash
 
     - name: Install extra dependencies
-      env:
-        HOROVOD_BUILD_ARCH_FLAGS: "-mfma"
-        HOROVOD_WITHOUT_MXNET: 1
-        HOROVOD_WITHOUT_TENSORFLOW: 1
       run: |
         # adjust versions according to the installed Torch version
         python ./requirements/adjust-versions.py requirements/extra.txt
@@ -119,7 +115,7 @@ jobs:
         HOROVOD_BUILT=$(python -c "import horovod.torch; horovod.torch.nccl_built(); print('SUCCESS')" || true)
         if [[ $HOROVOD_BUILT != "SUCCESS" ]]; then
           pip uninstall -y horovod
-          echo $(grep "horovod" requirements/extra.txt) > requirements/horovod.txt
+          grep "horovod" requirements/strategies.txt > requirements/horovod.txt
           pip install --no-cache-dir -r requirements/horovod.txt
         fi
         horovodrun --check-build

diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml
@@ -13,17 +13,10 @@ concurrency:
 jobs:
   mypy:
     runs-on: ubuntu-20.04
-    #strategy:
-    #  fail-fast: false
-    #  matrix:
-    #    include:
-    #      - {python-version: "3.8", pytorch-version: "1.8"}
-    #      - {python-version: "3.9", pytorch-version: "1.10"}
     steps:
     - uses: actions/checkout@master
     - uses: actions/setup-python@v2
       with:
-        # python-version: ${{ matrix.python-version }}
         python-version: 3.9
 
     # Note: This uses an internal pip API and may not always work
@@ -37,15 +30,10 @@ jobs:
           ${{ runner.os }}-pip-
 
     - name: Install dependencies
-      env:
-        # TORCH_VERSION: ${{ matrix.pytorch-version }}
-        TORCH_VERSION: "1.10"
       run: |
-        pip install "torch==$TORCH_VERSION" --find-links https://download.pytorch.org/whl/cpu/torch_stable.html
-        # adjust versions according installed Torch version
+        pip install torch==1.11 --find-links https://download.pytorch.org/whl/cpu/torch_stable.html
         python ./requirements/adjust-versions.py requirements/extra.txt
-        python ./requirements/adjust-versions.py requirements/examples.txt
-        pip install '.[dev]' --upgrade-strategy only-if-needed --find-links https://download.pytorch.org/whl/cpu/torch_stable.html
+        pip install '.[dev]'
         pip list
 
     - name: Type check

diff --git a/.github/workflows/docs-checks.yml b/.github/workflows/docs-checks.yml
@@ -41,12 +41,9 @@ jobs:
           sudo apt-get install -y cmake pandoc
           pip --version
           pip install -q fire
-          # remove Horovod from requirements
-          python .actions/assistant.py requirements_prune_pkgs horovod
           # python -m pip install --upgrade --user pip
           pip install --requirement requirements.txt --upgrade-strategy only-if-needed --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --quiet
-          pip install --requirement requirements/extra.txt
-          pip install --requirement requirements/loggers.txt
+          pip install --requirement requirements/devel-base.txt
           pip install --requirement requirements/docs.txt
           pip list
         shell: bash

diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml
@@ -115,12 +115,13 @@ jobs:
       matrix:
         include:
           # the config used in '.azure-pipelines/gpu-tests.yml'
-          - {python_version: "3.7", pytorch_version: "1.8"}
-          - {python_version: "3.7", pytorch_version: "1.10"}
+          - {python_version: "3.7", pytorch_version: "1.10", cuda_version: "11.1"}
+          - {python_version: "3.7", pytorch_version: "1.11", cuda_version: "11.3.1"}
           # latest (used in Tutorials)
-          - {python_version: "3.8", pytorch_version: "1.8"}
-          - {python_version: "3.9", pytorch_version: "1.10"}
-          - {python_version: "3.9", pytorch_version: "1.11"}
+          - {python_version: "3.8", pytorch_version: "1.8", cuda_version: "11.1"}
+          - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1"}
+          - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.1"}
+          - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
 
     steps:
       - name: Checkout
@@ -140,6 +141,7 @@ jobs:
           build-args: |
             PYTHON_VERSION=${{ matrix.python_version }}
             PYTORCH_VERSION=${{ matrix.pytorch_version }}
+            CUDA_VERSION=${{ matrix.cuda_version }}
           file: dockers/base-cuda/Dockerfile
           push: ${{ env.PUSH_TO_HUB }}
           tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}

diff --git a/.gitignore b/.gitignore
@@ -7,9 +7,6 @@ pip-wheel-metadata/
 lightning_logs/
 .vscode/
 
-# Test-tube
-test_tube_*/
-
 # Documentations
 docs/source/api
 docs/source/*.md

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -36,6 +36,7 @@ repos:
         args: ['--maxkb=350', '--enforce-all']
         exclude: |
             (?x)^(
+                CHANGELOG.md|
                 docs/source/_static/images/general/fast_2.gif|
                 docs/source/_static/images/mnist_imgs/pt_to_pl.jpg|
                 docs/source/_static/images/lightning_module/pt_to_pl.png|

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,19 +9,47 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Added
 
--
+- Added support for reloading the last checkpoint saved by passing `ckpt_path="last"` ([#12816](https://github.com/PyTorchLightning/pytorch-lightning/pull/12816))
+
+
+- Added `LightningDataModule.load_from_checkpoint` to support loading datamodules directly from checkpoint ([#12550](https://github.com/PyTorchLightning/pytorch-lightning/pull/12550))
+
+
+- Added a friendly error message when attempting to call `Trainer.save_checkpoint()` without a model attached ([#12772](https://github.com/PyTorchLightning/pytorch-lightning/pull/12772))
+
+
+- Added a friendly error message when attempting to use `DeepSpeedStrategy` on unsupported accelerators ([#12699](https://github.com/PyTorchLightning/pytorch-lightning/pull/12699))
+
+
+- Enabled `torch.inference_mode` for evaluation and prediction ([#12715](https://github.com/PyTorchLightning/pytorch-lightning/pull/12715))
+
+
+- Added support for setting `val_check_interval` to a value higher than the number of training batches when `check_val_every_n_epoch=None` ([#11993](https://github.com/PyTorchLightning/pytorch-lightning/pull/11993))
 
 
 - Include the `pytorch_lightning` version as a header in the CLI config files ([#12532](https://github.com/PyTorchLightning/pytorch-lightning/pull/12532))
 
 
--
+- Added support for `Callback` registration through entry points ([#12739](https://github.com/PyTorchLightning/pytorch-lightning/pull/12739))
 
 
--
+- Added support for `Trainer(deterministic="warn")` to warn instead of fail when a non-deterministic operation is encountered ([#12588](https://github.com/PyTorchLightning/pytorch-lightning/pull/12588))
 
 
--
+- Added `CollaborativeStrategy` ([#12842](https://github.com/PyTorchLightning/pytorch-lightning/pull/12842))
+
+
+- Include a version suffix for new "last" checkpoints of later runs in the same directory ([#12902](https://github.com/PyTorchLightning/pytorch-lightning/pull/12902))
+
+
+- Added missing `predict_dataset` argument in `LightningDataModule.from_datasets` to create predict dataloaders ([#12942](https://github.com/PyTorchLightning/pytorch-lightning/pull/12942))
+
+
+- Added class name prefix to metrics logged by `DeviceStatsMonitor` ([#12228](https://github.com/PyTorchLightning/pytorch-lightning/pull/12228))
+
+
+- Added Native FSDP Strategy ([#12447](https://github.com/PyTorchLightning/pytorch-lightning/pull/12447))
+
 
 ### Changed
 
@@ -40,10 +68,16 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Marked `swa_lrs` argument in `StochasticWeightAveraging` callback as required ([#12556](https://github.com/PyTorchLightning/pytorch-lightning/pull/12556))
 
 
--
+- `LightningCLI`'s shorthand notation changed to use jsonargparse native feature ([#12614](https://github.com/PyTorchLightning/pytorch-lightning/pull/12614))
 
 
--
+- Changed `seed_everything_default` argument in the `LightningCLI` to type `Union[bool, int]`. If set to `True` a seed is automatically generated for the parser argument `--seed_everything`. ([#12822](https://github.com/PyTorchLightning/pytorch-lightning/pull/12822))
+
+
+- Make positional arguments required for classes passed into the `add_argparse_args` function. ([#12504](https://github.com/PyTorchLightning/pytorch-lightning/pull/12504))
+
+
+- Raise an error if there are insufficient training batches when using a float value of `limit_train_batches` ([#12885](https://github.com/PyTorchLightning/pytorch-lightning/pull/12885))
 
 
 -
@@ -57,7 +91,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Deprecated `num_processes`, `gpus`, `tpu_cores`, and `ipus` from the `Trainer` constructor in favor of using the `accelerator` and `devices` arguments ([#11040](https://github.com/PyTorchLightning/pytorch-lightning/pull/11040))
 
 
--
+- Deprecated setting `LightningCLI(seed_everything_default=None)` in favor of `False` ([#12804](https://github.com/PyTorchLightning/pytorch-lightning/issues/12804)).
 
 
 -
@@ -67,6 +101,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Removed
 
+- Removed the deprecated `TestTubeLogger` ([#12859](https://github.com/PyTorchLightning/pytorch-lightning/pull/12859))
+
+
 - Removed the deprecated `pytorch_lightning.core.memory.LayerSummary` and `pytorch_lightning.core.memory.ModelSummary` ([#12593](https://github.com/PyTorchLightning/pytorch-lightning/pull/12593))
 
 
@@ -118,16 +155,64 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Removed support for passing strategy names or strategy instances to the plugins Trainer argument ([#12700](https://github.com/PyTorchLightning/pytorch-lightning/pull/12700))
 
 
+- Removed the deprecated `val_transforms` argument from the `LightningDataModule` constructor ([#12763](https://github.com/PyTorchLightning/pytorch-lightning/pull/12763))
+
+
+- Removed the deprecated `test_transforms` argument from the `LightningDataModule` constructor ([#12773](https://github.com/PyTorchLightning/pytorch-lightning/pull/12773))
+
+
 - Removed deprecated `dataloader_idx` argument from `on_train_batch_start/end` hooks `Callback` and `LightningModule` ([#12769](https://github.com/PyTorchLightning/pytorch-lightning/pull/12769))
 
+
+- Removed deprecated `get_progress_bar_dict` property from `LightningModule` ([#12839](https://github.com/PyTorchLightning/pytorch-lightning/pull/12839))
+
 ### Fixed
 
+- Fixed an issue causing zero-division error for empty dataloaders ([#12885](https://github.com/PyTorchLightning/pytorch-lightning/pull/12885))
+
+
+-
+
+
+-
+
+
 -
 
 
 -
 
 
+## [1.6.3] - 2022-05-03
+
+### Fixed
+
+- Use only a single instance of `rich.console.Console` throughout the codebase ([#12886](https://github.com/PyTorchLightning/pytorch-lightning/pull/12886))
+- Fixed an issue to ensure all the checkpoint states are saved in a common filepath with `DeepSpeedStrategy` ([#12887](https://github.com/PyTorchLightning/pytorch-lightning/pull/12887))
+- Fixed `trainer.logger` deprecation message ([#12671](https://github.com/PyTorchLightning/pytorch-lightning/pull/12671))
+- Fixed an issue where sharded grad scaler is passed in when using BF16 with the `ShardedStrategy` ([#12915](https://github.com/PyTorchLightning/pytorch-lightning/pull/12915))
+- Fixed an issue with recursive invocation of the DDP configuration in the HPU parallel plugin ([#12912](https://github.com/PyTorchLightning/pytorch-lightning/pull/12912))
+- Fixed printing of ragged dictionaries in `Trainer.validate` and `Trainer.test` ([#12857](https://github.com/PyTorchLightning/pytorch-lightning/pull/12857))
+- Fixed threading support for legacy loading of checkpoints ([#12814](https://github.com/PyTorchLightning/pytorch-lightning/pull/12814))
+- Fixed pickling of `KFoldLoop` ([#12441](https://github.com/PyTorchLightning/pytorch-lightning/pull/12441))
+- Stopped `optimizer_zero_grad` from being called after IPU execution ([#12913](https://github.com/PyTorchLightning/pytorch-lightning/pull/12913))
+- Fixed `fuse_modules` to be qat-aware for `torch>=1.11` ([#12891](https://github.com/PyTorchLightning/pytorch-lightning/pull/12891))
+- Enforced eval shuffle warning only for default samplers in DataLoader ([#12653](https://github.com/PyTorchLightning/pytorch-lightning/pull/12653))
+- Enabled mixed precision in `DDPFullyShardedStrategy` when `precision=16` ([#12965](https://github.com/PyTorchLightning/pytorch-lightning/pull/12965))
+- Fixed `TQDMProgressBar` reset and update to show correct time estimation ([#12889](https://github.com/PyTorchLightning/pytorch-lightning/pull/12889))
+- Fixed fit loop restart logic to enable resume using the checkpoint ([#12821](https://github.com/PyTorchLightning/pytorch-lightning/pull/12821))
+
+
+## [1.6.2] - 2022-04-27
+
+### Fixed
+
+- Fixed `ImportError` when `torch.distributed` is not available. ([#12794](https://github.com/PyTorchLightning/pytorch-lightning/pull/12794))
+- When using custom DataLoaders in LightningDataModule, multiple inheritance is resolved properly ([#12716](https://github.com/PyTorchLightning/pytorch-lightning/pull/12716))
+- Fixed encoding issues on terminals that do not support unicode characters ([#12828](https://github.com/PyTorchLightning/pytorch-lightning/pull/12828))
+- Fixed support for `ModelCheckpoint` monitors with dots ([#12783](https://github.com/PyTorchLightning/pytorch-lightning/pull/12783))
+
+
 ## [1.6.1] - 2022-04-13
 
 ### Changed