Merge branch 'PyTorchLightning:master' into bug/12768_datamodule_hparam_update
Lightning-AI · May 6, 2022 · b8c09d8 · b8c09d8
2 parents 0cbc78c + 3d74c90
commit b8c09d8
Show file tree

Hide file tree

Showing 23 changed files with 313 additions and 106 deletions.
diff --git a/.azure-pipelines/gpu-benchmark.yml b/.azure-pipelines/gpu-benchmark.yml
@@ -28,13 +28,18 @@ jobs:
     cancelTimeoutInMinutes: "2"
     pool: azure-gpus-spot
     container:
-      # TODO: Unpin sha256
-      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8@sha256:b75de74d4c7c820f442f246be8500c93f8b5797b84aa8531847e5fb317ed3dda"
+      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
       options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g"
     workspace:
       clean: all
 
     steps:
+      - bash: |
+          # TODO: Prepare a docker image with 1.8.2 (LTS) installed and remove manual installation.
+          pip install torch==1.8.2+cu102 torchvision==0.9.2+cu102 torchtext==0.9.2 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
+          pip list
+        displayName: 'Install PyTorch LTS'
+
       - bash: |
           python -m pytest tests/benchmarks -v --durations=0
         displayName: 'Testing: benchmarks'

diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml
@@ -29,8 +29,7 @@ jobs:
     container:
       # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04
       # run on torch 1.8 as it's the LTS version
-      # TODO: Unpin sha256
-      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8@sha256:b75de74d4c7c820f442f246be8500c93f8b5797b84aa8531847e5fb317ed3dda"
+      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
       # default shm size is 64m. Increase it to avoid:
       # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
       options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m"
@@ -55,6 +54,8 @@ jobs:
         CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
         pip install "bagua-cuda$CUDA_VERSION_MM>=0.9.0"
         pip install . --requirement requirements/devel.txt
+        # TODO: Prepare a docker image with 1.8.2 (LTS) installed and remove manual installation.
+        pip install torch==1.8.2+cu102 torchvision==0.9.2+cu102 torchtext==0.9.2 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
         pip list
       displayName: 'Install dependencies'
 

diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml
@@ -75,13 +75,14 @@ jobs:
       matrix:
         include:
           # the config used in '.azure-pipelines/gpu-tests.yml'
-          - {python_version: "3.7", pytorch_version: "1.10", cuda_version: "11.1"}
-          - {python_version: "3.7", pytorch_version: "1.11", cuda_version: "11.3.1"}
+          - {python_version: "3.7", pytorch_version: "1.8", cuda_version: "10.2", ubuntu_version: "18.04"}
+          - {python_version: "3.7", pytorch_version: "1.10", cuda_version: "11.1", ubuntu_version: "20.04"}
+          - {python_version: "3.7", pytorch_version: "1.11", cuda_version: "11.3.1", ubuntu_version: "20.04"}
           # latest (used in Tutorials)
-          - {python_version: "3.8", pytorch_version: "1.8", cuda_version: "11.1"}
-          - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1"}
-          - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.1"}
-          - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
+          - {python_version: "3.8", pytorch_version: "1.8", cuda_version: "11.1", ubuntu_version: "20.04"}
+          - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1", ubuntu_version: "20.04"}
+          - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.1", ubuntu_version: "20.04"}
+          - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1", ubuntu_version: "20.04"}
     steps:
       - name: Checkout
         uses: actions/checkout@v2
@@ -93,6 +94,7 @@ jobs:
             PYTHON_VERSION=${{ matrix.python_version }}
             PYTORCH_VERSION=${{ matrix.pytorch_version }}
             CUDA_VERSION=${{ matrix.cuda_version }}
+            UBUNTU_VERSION=${{ matrix.ubuntu_version }}
           file: dockers/base-cuda/Dockerfile
           push: false
         timeout-minutes: 75

diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml
@@ -115,14 +115,14 @@ jobs:
       matrix:
         include:
           # the config used in '.azure-pipelines/gpu-tests.yml'
-          - {python_version: "3.7", pytorch_version: "1.10", cuda_version: "11.1"}
-          - {python_version: "3.7", pytorch_version: "1.11", cuda_version: "11.3.1"}
+          - {python_version: "3.7", pytorch_version: "1.8", cuda_version: "10.2", ubuntu_version: "18.04"}
+          - {python_version: "3.7", pytorch_version: "1.10", cuda_version: "11.1", ubuntu_version: "20.04"}
+          - {python_version: "3.7", pytorch_version: "1.11", cuda_version: "11.3.1", ubuntu_version: "20.04"}
           # latest (used in Tutorials)
-          - {python_version: "3.8", pytorch_version: "1.8", cuda_version: "11.1"}
-          - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1"}
-          - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.1"}
-          - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
-
+          - {python_version: "3.8", pytorch_version: "1.8", cuda_version: "11.1", ubuntu_version: "20.04"}
+          - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1", ubuntu_version: "20.04"}
+          - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.1", ubuntu_version: "20.04"}
+          - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1", ubuntu_version: "20.04"}
     steps:
       - name: Checkout
         uses: actions/checkout@v2
@@ -142,6 +142,7 @@ jobs:
             PYTHON_VERSION=${{ matrix.python_version }}
             PYTORCH_VERSION=${{ matrix.pytorch_version }}
             CUDA_VERSION=${{ matrix.cuda_version }}
+            UBUNTU_VERSION=${{ matrix.ubuntu_version }}
           file: dockers/base-cuda/Dockerfile
           push: ${{ env.PUSH_TO_HUB }}
           tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -36,6 +36,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added support for `Trainer(deterministic="warn")` to warn instead of fail when a non-deterministic operation is encountered ([#12588](https://github.com/PyTorchLightning/pytorch-lightning/pull/12588))
 
 
+- Added profiling to the loops' dataloader `__next__` calls ([#12124](https://github.com/PyTorchLightning/pytorch-lightning/pull/12124))
+
+
 - Added `CollaborativeStrategy` ([#12842](https://github.com/PyTorchLightning/pytorch-lightning/pull/12842))
 
 
@@ -48,9 +51,17 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added class name prefix to metrics logged by `DeviceStatsMonitor` ([#12228](https://github.com/PyTorchLightning/pytorch-lightning/pull/12228))
 
 
+- Added profiling of `LightningDataModule` hooks ([#12971](https://github.com/PyTorchLightning/pytorch-lightning/pull/12971))
+
+
 - Added Native FSDP Strategy ([#12447](https://github.com/PyTorchLightning/pytorch-lightning/pull/12447))
 
 
+- Added breaking of lazy graph across training, validation, test and predict steps when training with habana accelerators to ensure better performance ([#12938](https://github.com/PyTorchLightning/pytorch-lightning/pull/12938))
+
+
+-
+
 ### Changed
 
 - Enable validation during overfitting ([#12527](https://github.com/PyTorchLightning/pytorch-lightning/pull/12527))
@@ -161,7 +172,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Removed the deprecated `test_transforms` argument from the `LightningDataModule` constructor ([#12773](https://github.com/PyTorchLightning/pytorch-lightning/pull/12773))
 
 
-- Removed deprecated `dataloader_idx` argument from `on_train_batch_start/end` hooks `Callback` and `LightningModule` ([#12769](https://github.com/PyTorchLightning/pytorch-lightning/pull/12769))
+- Removed deprecated `dataloader_idx` argument from `on_train_batch_start/end` hooks `Callback` and `LightningModule` ([#12769](https://github.com/PyTorchLightning/pytorch-lightning/pull/12769), [#12977](https://github.com/PyTorchLightning/pytorch-lightning/pull/12977))
 
 
 - Removed deprecated `get_progress_bar_dict` property from `LightningModule` ([#12839](https://github.com/PyTorchLightning/pytorch-lightning/pull/12839))
@@ -171,7 +182,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed an issue causing zero-division error for empty dataloaders ([#12885](https://github.com/PyTorchLightning/pytorch-lightning/pull/12885))
 
 
--
+- Fixed mismatching default values for the types of some arguments in the DeepSpeed and Fully-Sharded strategies which made the CLI unable to use them ([#12989](https://github.com/PyTorchLightning/pytorch-lightning/pull/12989))
 
 
 -

diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
@@ -13,8 +13,10 @@
 # limitations under the License.
 
 ARG CUDA_VERSION=11.3.1
+ARG UBUNTU_VERSION=20.04
 
-FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
+# TODO: Remove OS arg to always use ubuntu20.04 when dropping CUDA 10.2
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
 
 ARG PYTHON_VERSION=3.9
 ARG PYTORCH_VERSION=1.8
@@ -47,6 +49,8 @@ RUN \
         ca-certificates \
         software-properties-common \
         libopenmpi-dev \
+        openmpi-bin \
+        ssh \
     && \
 
 # Install python
@@ -110,10 +114,14 @@ ENV \
     HOROVOD_WITH_MPI=1
 
 RUN \
+    # CUDA 10.2 doesn't support ampere architecture (8.0).
+    if [[ "$CUDA_VERSION" < "11.0" ]]; then export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST//";8.0"/}; echo $TORCH_CUDA_ARCH_LIST; fi && \
     HOROVOD_BUILD_CUDA_CC_LIST=${TORCH_CUDA_ARCH_LIST//";"/","} && \
     export HOROVOD_BUILD_CUDA_CC_LIST=${HOROVOD_BUILD_CUDA_CC_LIST//"."/""} && \
+    echo $HOROVOD_BUILD_CUDA_CC_LIST && \
     cmake --version && \
     pip install --no-cache-dir -r ./requirements/strategies.txt && \
+    horovodrun --check-build && \
     rm -rf requirements/
 
 RUN \
@@ -127,6 +135,8 @@ RUN \
     fi
 
 RUN \
+    # CUDA 10.2 doesn't support ampere architecture (8.0).
+    if [[ "$CUDA_VERSION" < "11.0" ]]; then export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST//";8.0"/}; echo $TORCH_CUDA_ARCH_LIST; fi && \
     # install NVIDIA apex
     pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" https://github.com/NVIDIA/apex/archive/refs/heads/master.zip && \
     python -c "from apex import amp"

diff --git a/pytorch_lightning/callbacks/base.py b/pytorch_lightning/callbacks/base.py
@@ -107,23 +107,12 @@ def on_sanity_check_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningMod
         """Called when the validation sanity check ends."""
 
     def on_train_batch_start(
-        self,
-        trainer: "pl.Trainer",
-        pl_module: "pl.LightningModule",
-        batch: Any,
-        batch_idx: int,
-        unused: int = 0,
+        self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", batch: Any, batch_idx: int
     ) -> None:
         """Called when the train batch begins."""
 
     def on_train_batch_end(
-        self,
-        trainer: "pl.Trainer",
-        pl_module: "pl.LightningModule",
-        outputs: STEP_OUTPUT,
-        batch: Any,
-        batch_idx: int,
-        unused: int = 0,
+        self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", outputs: STEP_OUTPUT, batch: Any, batch_idx: int
     ) -> None:
         """Called when the train batch ends."""
 

diff --git a/pytorch_lightning/callbacks/device_stats_monitor.py b/pytorch_lightning/callbacks/device_stats_monitor.py
@@ -48,12 +48,7 @@ def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: O
             raise MisconfigurationException("Cannot use DeviceStatsMonitor callback with Trainer that has no logger.")
 
     def on_train_batch_start(
-        self,
-        trainer: "pl.Trainer",
-        pl_module: "pl.LightningModule",
-        batch: Any,
-        batch_idx: int,
-        unused: int = 0,
+        self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", batch: Any, batch_idx: int
     ) -> None:
         if not trainer.loggers:
             raise MisconfigurationException("Cannot use `DeviceStatsMonitor` callback with `Trainer(logger=False)`.")
@@ -71,13 +66,7 @@ def on_train_batch_start(
             logger.log_metrics(prefixed_device_stats, step=trainer.fit_loop.epoch_loop._batches_that_stepped)
 
     def on_train_batch_end(
-        self,
-        trainer: "pl.Trainer",
-        pl_module: "pl.LightningModule",
-        outputs: STEP_OUTPUT,
-        batch: Any,
-        batch_idx: int,
-        unused: int = 0,
+        self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", outputs: STEP_OUTPUT, batch: Any, batch_idx: int
     ) -> None:
         if not trainer.loggers:
             raise MisconfigurationException("Cannot use `DeviceStatsMonitor` callback with `Trainer(logger=False)`.")

diff --git a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py
@@ -88,6 +88,21 @@ def on_run_start(  # type: ignore[override]
         # add the previous `fetched` value to properly track `is_last_batch` with no prefetching
         data_fetcher.fetched += self.batch_progress.current.ready
 
+        stage = self.trainer.state.stage
+        assert stage is not None
+        stage = stage.dataloader_prefix
+        self._profiler_fetch_action = (
+            f"[{self.__class__.__name__}].{stage}_dataloader_idx_{kwargs.get('dataloader_idx', 0)}_next"
+        )
+        data_fetcher._start_profiler = self._on_before_fetch
+        data_fetcher._stop_profiler = self._on_after_fetch
+
+    def _on_before_fetch(self) -> None:
+        self.trainer.profiler.start(self._profiler_fetch_action)
+
+    def _on_after_fetch(self) -> None:
+        self.trainer.profiler.stop(self._profiler_fetch_action)
+
     def advance(  # type: ignore[override]
         self,
         data_fetcher: AbstractDataFetcher,

diff --git a/pytorch_lightning/loops/epoch/prediction_epoch_loop.py b/pytorch_lightning/loops/epoch/prediction_epoch_loop.py
@@ -89,7 +89,9 @@ def advance(  # type: ignore[override]
             num_dataloaders: the total number of dataloaders
             return_predictions: whether to return the obtained predictions
         """
-        batch_idx, batch = next(dataloader_iter)
+        action_name = f"[{self.__class__.__name__}].predict_dataloader_idx_{dataloader_idx}_next"
+        with self.trainer.profiler.profile(action_name):
+            batch_idx, batch = next(dataloader_iter)
         self._seen_batch_indices = self._get_batch_indices(dataloader_idx)
         # we need to truncate the list of batch indices due to prefetching in the dataloader and Lightning
         self._seen_batch_indices = self._seen_batch_indices[: (self.batch_progress.current.completed + 1)]

diff --git a/pytorch_lightning/loops/epoch/training_epoch_loop.py b/pytorch_lightning/loops/epoch/training_epoch_loop.py
@@ -154,6 +154,15 @@ def on_run_start(self, data_fetcher: AbstractDataFetcher) -> None:  # type: igno
         # add the previous `fetched` value to properly track `is_last_batch` with no prefetching
         data_fetcher.fetched += self.batch_progress.current.ready
 
+        data_fetcher._start_profiler = self._on_before_fetch
+        data_fetcher._stop_profiler = self._on_after_fetch
+
+    def _on_before_fetch(self) -> None:
+        self.trainer.profiler.start(f"[{self.__class__.__name__}].train_dataloader_next")
+
+    def _on_after_fetch(self) -> None:
+        self.trainer.profiler.stop(f"[{self.__class__.__name__}].train_dataloader_next")
+
     def advance(self, data_fetcher: AbstractDataFetcher) -> None:  # type: ignore[override]
         """Runs a single training batch.
 
@@ -186,20 +195,10 @@ def advance(self, data_fetcher: AbstractDataFetcher) -> None:  # type: ignore[ov
             # hook
             self.trainer._call_callback_hooks("on_batch_start")
 
-            # TODO: Update this in v1.7 (deprecation: #9816)
-            model_fx = self.trainer.lightning_module.on_train_batch_start
-            extra_kwargs = (
-                {"dataloader_idx": 0}
-                if callable(model_fx) and is_param_in_hook_signature(model_fx, "dataloader_idx", explicit=True)
-                else {}
-            )
-
             # hook
-            self.trainer._call_callback_hooks("on_train_batch_start", batch, batch_idx, **extra_kwargs)
-            response = self.trainer._call_lightning_module_hook(
-                "on_train_batch_start", batch, batch_idx, **extra_kwargs
-            )
-            self.trainer._call_strategy_hook("on_train_batch_start", batch, batch_idx, **extra_kwargs)
+            self.trainer._call_callback_hooks("on_train_batch_start", batch, batch_idx)
+            response = self.trainer._call_lightning_module_hook("on_train_batch_start", batch, batch_idx)
+            self.trainer._call_strategy_hook("on_train_batch_start", batch, batch_idx)
             if response == -1:
                 self.batch_progress.increment_processed()
                 raise StopIteration
@@ -223,17 +222,8 @@ def advance(self, data_fetcher: AbstractDataFetcher) -> None:  # type: ignore[ov
             num_optimizers=len(self.trainer.optimizers),
         )
 
-        # TODO: Update this in v1.7 (deprecation: #9816)
-        model_fx = self.trainer.lightning_module.on_train_batch_end
-        extra_kwargs = (
-            {"dataloader_idx": 0}
-            if callable(model_fx) and is_param_in_hook_signature(model_fx, "dataloader_idx", explicit=True)
-            else {}
-        )
-        self.trainer._call_callback_hooks("on_train_batch_end", batch_end_outputs, batch, batch_idx, **extra_kwargs)
-        self.trainer._call_lightning_module_hook(
-            "on_train_batch_end", batch_end_outputs, batch, batch_idx, **extra_kwargs
-        )
+        self.trainer._call_callback_hooks("on_train_batch_end", batch_end_outputs, batch, batch_idx)
+        self.trainer._call_lightning_module_hook("on_train_batch_end", batch_end_outputs, batch, batch_idx)
         self.trainer._call_callback_hooks("on_batch_end")
         self.trainer._logger_connector.on_batch_end()
 

diff --git a/pytorch_lightning/strategies/deepspeed.py b/pytorch_lightning/strategies/deepspeed.py
@@ -102,8 +102,8 @@ def __init__(
         offload_params_device: str = "cpu",
         nvme_path: str = "/local_nvme",
         params_buffer_count: int = 5,
-        params_buffer_size: int = 1e8,
-        max_in_cpu: int = 1e9,
+        params_buffer_size: int = 100_000_000,
+        max_in_cpu: int = 1_000_000_000,
         offload_optimizer_device: str = "cpu",
         optimizer_buffer_count: int = 4,
         block_size: int = 1048576,
@@ -112,13 +112,13 @@ def __init__(
         overlap_events: bool = True,
         thread_count: int = 1,
         pin_memory: bool = False,
-        sub_group_size: int = 1e12,
+        sub_group_size: int = 1_000_000_000_000,
         contiguous_gradients: bool = True,
         overlap_comm: bool = True,
         allgather_partitions: bool = True,
         reduce_scatter: bool = True,
-        allgather_bucket_size: int = 2e8,
-        reduce_bucket_size: int = 2e8,
+        allgather_bucket_size: int = 200_000_000,
+        reduce_bucket_size: int = 200_000_000,
         zero_allow_untested_optimizer: bool = True,
         logging_batch_size_per_gpu: Union[str, int] = "auto",
         config: Optional[Union[Path, str, dict]] = None,

diff --git a/pytorch_lightning/strategies/fully_sharded.py b/pytorch_lightning/strategies/fully_sharded.py
@@ -50,7 +50,7 @@ def __init__(
         fp32_reduce_scatter: Optional[bool] = None,
         compute_dtype: Optional[torch.dtype] = None,
         bucket_cap_mb: int = 25,
-        min_num_params: int = 1e8,
+        min_num_params: int = 100_000_000,
         state_dict_to_cpu: bool = True,
         parallel_devices: Optional[List[torch.device]] = None,
         cluster_environment: Optional[ClusterEnvironment] = None,