Skip to content

Commit

Permalink
Merge branch 'PyTorchLightning:master' into bug/12768_datamodule_hparam_update
Browse files Browse the repository at this point in the history
  • Loading branch information
tanmoyio authored May 6, 2022
2 parents 0cbc78c + 3d74c90 commit b8c09d8
Show file tree
Hide file tree
Showing 23 changed files with 313 additions and 106 deletions.
9 changes: 7 additions & 2 deletions .azure-pipelines/gpu-benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,18 @@ jobs:
cancelTimeoutInMinutes: "2"
pool: azure-gpus-spot
container:
# TODO: Unpin sha256
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8@sha256:b75de74d4c7c820f442f246be8500c93f8b5797b84aa8531847e5fb317ed3dda"
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g"
workspace:
clean: all

steps:
- bash: |
# TODO: Prepare a docker image with 1.8.2 (LTS) installed and remove manual installation.
pip install torch==1.8.2+cu102 torchvision==0.9.2+cu102 torchtext==0.9.2 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
pip list
displayName: 'Install PyTorch LTS'
- bash: |
python -m pytest tests/benchmarks -v --durations=0
displayName: 'Testing: benchmarks'
Expand Down
5 changes: 3 additions & 2 deletions .azure-pipelines/gpu-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,7 @@ jobs:
container:
# base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04
# run on torch 1.8 as it's the LTS version
# TODO: Unpin sha256
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8@sha256:b75de74d4c7c820f442f246be8500c93f8b5797b84aa8531847e5fb317ed3dda"
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
# default shm size is 64m. Increase it to avoid:
# 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m"
Expand All @@ -55,6 +54,8 @@ jobs:
CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
pip install "bagua-cuda$CUDA_VERSION_MM>=0.9.0"
pip install . --requirement requirements/devel.txt
# TODO: Prepare a docker image with 1.8.2 (LTS) installed and remove manual installation.
pip install torch==1.8.2+cu102 torchvision==0.9.2+cu102 torchtext==0.9.2 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
pip list
displayName: 'Install dependencies'
Expand Down
14 changes: 8 additions & 6 deletions .github/workflows/ci_dockers.yml
Original file line number Diff line number Diff line change
Expand Up @@ -75,13 +75,14 @@ jobs:
matrix:
include:
# the config used in '.azure-pipelines/gpu-tests.yml'
- {python_version: "3.7", pytorch_version: "1.10", cuda_version: "11.1"}
- {python_version: "3.7", pytorch_version: "1.11", cuda_version: "11.3.1"}
- {python_version: "3.7", pytorch_version: "1.8", cuda_version: "10.2", ubuntu_version: "18.04"}
- {python_version: "3.7", pytorch_version: "1.10", cuda_version: "11.1", ubuntu_version: "20.04"}
- {python_version: "3.7", pytorch_version: "1.11", cuda_version: "11.3.1", ubuntu_version: "20.04"}
# latest (used in Tutorials)
- {python_version: "3.8", pytorch_version: "1.8", cuda_version: "11.1"}
- {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1"}
- {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.1"}
- {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
- {python_version: "3.8", pytorch_version: "1.8", cuda_version: "11.1", ubuntu_version: "20.04"}
- {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1", ubuntu_version: "20.04"}
- {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.1", ubuntu_version: "20.04"}
- {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1", ubuntu_version: "20.04"}
steps:
- name: Checkout
uses: actions/checkout@v2
Expand All @@ -93,6 +94,7 @@ jobs:
PYTHON_VERSION=${{ matrix.python_version }}
PYTORCH_VERSION=${{ matrix.pytorch_version }}
CUDA_VERSION=${{ matrix.cuda_version }}
UBUNTU_VERSION=${{ matrix.ubuntu_version }}
file: dockers/base-cuda/Dockerfile
push: false
timeout-minutes: 75
Expand Down
15 changes: 8 additions & 7 deletions .github/workflows/events-nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -115,14 +115,14 @@ jobs:
matrix:
include:
# the config used in '.azure-pipelines/gpu-tests.yml'
- {python_version: "3.7", pytorch_version: "1.10", cuda_version: "11.1"}
- {python_version: "3.7", pytorch_version: "1.11", cuda_version: "11.3.1"}
- {python_version: "3.7", pytorch_version: "1.8", cuda_version: "10.2", ubuntu_version: "18.04"}
- {python_version: "3.7", pytorch_version: "1.10", cuda_version: "11.1", ubuntu_version: "20.04"}
- {python_version: "3.7", pytorch_version: "1.11", cuda_version: "11.3.1", ubuntu_version: "20.04"}
# latest (used in Tutorials)
- {python_version: "3.8", pytorch_version: "1.8", cuda_version: "11.1"}
- {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1"}
- {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.1"}
- {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}

- {python_version: "3.8", pytorch_version: "1.8", cuda_version: "11.1", ubuntu_version: "20.04"}
- {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1", ubuntu_version: "20.04"}
- {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.1", ubuntu_version: "20.04"}
- {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1", ubuntu_version: "20.04"}
steps:
- name: Checkout
uses: actions/checkout@v2
Expand All @@ -142,6 +142,7 @@ jobs:
PYTHON_VERSION=${{ matrix.python_version }}
PYTORCH_VERSION=${{ matrix.pytorch_version }}
CUDA_VERSION=${{ matrix.cuda_version }}
UBUNTU_VERSION=${{ matrix.ubuntu_version }}
file: dockers/base-cuda/Dockerfile
push: ${{ env.PUSH_TO_HUB }}
tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
Expand Down
15 changes: 13 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Added support for `Trainer(deterministic="warn")` to warn instead of fail when a non-deterministic operation is encountered ([#12588](https://github.com/PyTorchLightning/pytorch-lightning/pull/12588))


- Added profiling to the loops' dataloader `__next__` calls ([#12124](https://github.com/PyTorchLightning/pytorch-lightning/pull/12124))


- Added `CollaborativeStrategy` ([#12842](https://github.com/PyTorchLightning/pytorch-lightning/pull/12842))


Expand All @@ -48,9 +51,17 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Added class name prefix to metrics logged by `DeviceStatsMonitor` ([#12228](https://github.com/PyTorchLightning/pytorch-lightning/pull/12228))


- Added profiling of `LightningDataModule` hooks ([#12971](https://github.com/PyTorchLightning/pytorch-lightning/pull/12971))


- Added Native FSDP Strategy ([#12447](https://github.com/PyTorchLightning/pytorch-lightning/pull/12447))


- Added breaking of the lazy graph across training, validation, test and predict steps when training with Habana accelerators to ensure better performance ([#12938](https://github.com/PyTorchLightning/pytorch-lightning/pull/12938))


-

### Changed

- Enable validation during overfitting ([#12527](https://github.com/PyTorchLightning/pytorch-lightning/pull/12527))
Expand Down Expand Up @@ -161,7 +172,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Removed the deprecated `test_transforms` argument from the `LightningDataModule` constructor ([#12773](https://github.com/PyTorchLightning/pytorch-lightning/pull/12773))


- Removed deprecated `dataloader_idx` argument from `on_train_batch_start/end` hooks `Callback` and `LightningModule` ([#12769](https://github.com/PyTorchLightning/pytorch-lightning/pull/12769))
- Removed deprecated `dataloader_idx` argument from `on_train_batch_start/end` hooks `Callback` and `LightningModule` ([#12769](https://github.com/PyTorchLightning/pytorch-lightning/pull/12769), [#12977](https://github.com/PyTorchLightning/pytorch-lightning/pull/12977))


- Removed deprecated `get_progress_bar_dict` property from `LightningModule` ([#12839](https://github.com/PyTorchLightning/pytorch-lightning/pull/12839))
Expand All @@ -171,7 +182,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed an issue causing zero-division error for empty dataloaders ([#12885](https://github.com/PyTorchLightning/pytorch-lightning/pull/12885))


-
- Fixed default values that did not match the annotated types of some arguments in the DeepSpeed and Fully-Sharded strategies, which made the CLI unable to use them ([#12989](https://github.com/PyTorchLightning/pytorch-lightning/pull/12989))


-
Expand Down
12 changes: 11 additions & 1 deletion dockers/base-cuda/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@
# limitations under the License.

ARG CUDA_VERSION=11.3.1
ARG UBUNTU_VERSION=20.04

FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
# TODO: Remove OS arg to always use ubuntu20.04 when dropping CUDA 10.2
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

ARG PYTHON_VERSION=3.9
ARG PYTORCH_VERSION=1.8
Expand Down Expand Up @@ -47,6 +49,8 @@ RUN \
ca-certificates \
software-properties-common \
libopenmpi-dev \
openmpi-bin \
ssh \
&& \

# Install python
Expand Down Expand Up @@ -110,10 +114,14 @@ ENV \
HOROVOD_WITH_MPI=1

RUN \
# CUDA 10.2 doesn't support the Ampere architecture (compute capability 8.0).
if [[ "$CUDA_VERSION" < "11.0" ]]; then export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST//";8.0"/}; echo $TORCH_CUDA_ARCH_LIST; fi && \
HOROVOD_BUILD_CUDA_CC_LIST=${TORCH_CUDA_ARCH_LIST//";"/","} && \
export HOROVOD_BUILD_CUDA_CC_LIST=${HOROVOD_BUILD_CUDA_CC_LIST//"."/""} && \
echo $HOROVOD_BUILD_CUDA_CC_LIST && \
cmake --version && \
pip install --no-cache-dir -r ./requirements/strategies.txt && \
horovodrun --check-build && \
rm -rf requirements/

RUN \
Expand All @@ -127,6 +135,8 @@ RUN \
fi

RUN \
# CUDA 10.2 doesn't support the Ampere architecture (compute capability 8.0).
if [[ "$CUDA_VERSION" < "11.0" ]]; then export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST//";8.0"/}; echo $TORCH_CUDA_ARCH_LIST; fi && \
# install NVIDIA apex
pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" https://github.com/NVIDIA/apex/archive/refs/heads/master.zip && \
python -c "from apex import amp"
Expand Down
15 changes: 2 additions & 13 deletions pytorch_lightning/callbacks/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,23 +107,12 @@ def on_sanity_check_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningMod
"""Called when the validation sanity check ends."""

def on_train_batch_start(
self,
trainer: "pl.Trainer",
pl_module: "pl.LightningModule",
batch: Any,
batch_idx: int,
unused: int = 0,
self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", batch: Any, batch_idx: int
) -> None:
"""Called when the train batch begins."""

def on_train_batch_end(
self,
trainer: "pl.Trainer",
pl_module: "pl.LightningModule",
outputs: STEP_OUTPUT,
batch: Any,
batch_idx: int,
unused: int = 0,
self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", outputs: STEP_OUTPUT, batch: Any, batch_idx: int
) -> None:
"""Called when the train batch ends."""

Expand Down
15 changes: 2 additions & 13 deletions pytorch_lightning/callbacks/device_stats_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,7 @@ def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: O
raise MisconfigurationException("Cannot use DeviceStatsMonitor callback with Trainer that has no logger.")

def on_train_batch_start(
self,
trainer: "pl.Trainer",
pl_module: "pl.LightningModule",
batch: Any,
batch_idx: int,
unused: int = 0,
self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", batch: Any, batch_idx: int
) -> None:
if not trainer.loggers:
raise MisconfigurationException("Cannot use `DeviceStatsMonitor` callback with `Trainer(logger=False)`.")
Expand All @@ -71,13 +66,7 @@ def on_train_batch_start(
logger.log_metrics(prefixed_device_stats, step=trainer.fit_loop.epoch_loop._batches_that_stepped)

def on_train_batch_end(
self,
trainer: "pl.Trainer",
pl_module: "pl.LightningModule",
outputs: STEP_OUTPUT,
batch: Any,
batch_idx: int,
unused: int = 0,
self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", outputs: STEP_OUTPUT, batch: Any, batch_idx: int
) -> None:
if not trainer.loggers:
raise MisconfigurationException("Cannot use `DeviceStatsMonitor` callback with `Trainer(logger=False)`.")
Expand Down
15 changes: 15 additions & 0 deletions pytorch_lightning/loops/epoch/evaluation_epoch_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,21 @@ def on_run_start( # type: ignore[override]
# add the previous `fetched` value to properly track `is_last_batch` with no prefetching
data_fetcher.fetched += self.batch_progress.current.ready

stage = self.trainer.state.stage
assert stage is not None
stage = stage.dataloader_prefix
self._profiler_fetch_action = (
f"[{self.__class__.__name__}].{stage}_dataloader_idx_{kwargs.get('dataloader_idx', 0)}_next"
)
data_fetcher._start_profiler = self._on_before_fetch
data_fetcher._stop_profiler = self._on_after_fetch

def _on_before_fetch(self) -> None:
self.trainer.profiler.start(self._profiler_fetch_action)

def _on_after_fetch(self) -> None:
self.trainer.profiler.stop(self._profiler_fetch_action)

def advance( # type: ignore[override]
self,
data_fetcher: AbstractDataFetcher,
Expand Down
4 changes: 3 additions & 1 deletion pytorch_lightning/loops/epoch/prediction_epoch_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,9 @@ def advance( # type: ignore[override]
num_dataloaders: the total number of dataloaders
return_predictions: whether to return the obtained predictions
"""
batch_idx, batch = next(dataloader_iter)
action_name = f"[{self.__class__.__name__}].predict_dataloader_idx_{dataloader_idx}_next"
with self.trainer.profiler.profile(action_name):
batch_idx, batch = next(dataloader_iter)
self._seen_batch_indices = self._get_batch_indices(dataloader_idx)
# we need to truncate the list of batch indices due to prefetching in the dataloader and Lightning
self._seen_batch_indices = self._seen_batch_indices[: (self.batch_progress.current.completed + 1)]
Expand Down
38 changes: 14 additions & 24 deletions pytorch_lightning/loops/epoch/training_epoch_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,15 @@ def on_run_start(self, data_fetcher: AbstractDataFetcher) -> None: # type: igno
# add the previous `fetched` value to properly track `is_last_batch` with no prefetching
data_fetcher.fetched += self.batch_progress.current.ready

data_fetcher._start_profiler = self._on_before_fetch
data_fetcher._stop_profiler = self._on_after_fetch

def _on_before_fetch(self) -> None:
self.trainer.profiler.start(f"[{self.__class__.__name__}].train_dataloader_next")

def _on_after_fetch(self) -> None:
self.trainer.profiler.stop(f"[{self.__class__.__name__}].train_dataloader_next")

def advance(self, data_fetcher: AbstractDataFetcher) -> None: # type: ignore[override]
"""Runs a single training batch.
Expand Down Expand Up @@ -186,20 +195,10 @@ def advance(self, data_fetcher: AbstractDataFetcher) -> None: # type: ignore[ov
# hook
self.trainer._call_callback_hooks("on_batch_start")

# TODO: Update this in v1.7 (deprecation: #9816)
model_fx = self.trainer.lightning_module.on_train_batch_start
extra_kwargs = (
{"dataloader_idx": 0}
if callable(model_fx) and is_param_in_hook_signature(model_fx, "dataloader_idx", explicit=True)
else {}
)

# hook
self.trainer._call_callback_hooks("on_train_batch_start", batch, batch_idx, **extra_kwargs)
response = self.trainer._call_lightning_module_hook(
"on_train_batch_start", batch, batch_idx, **extra_kwargs
)
self.trainer._call_strategy_hook("on_train_batch_start", batch, batch_idx, **extra_kwargs)
self.trainer._call_callback_hooks("on_train_batch_start", batch, batch_idx)
response = self.trainer._call_lightning_module_hook("on_train_batch_start", batch, batch_idx)
self.trainer._call_strategy_hook("on_train_batch_start", batch, batch_idx)
if response == -1:
self.batch_progress.increment_processed()
raise StopIteration
Expand All @@ -223,17 +222,8 @@ def advance(self, data_fetcher: AbstractDataFetcher) -> None: # type: ignore[ov
num_optimizers=len(self.trainer.optimizers),
)

# TODO: Update this in v1.7 (deprecation: #9816)
model_fx = self.trainer.lightning_module.on_train_batch_end
extra_kwargs = (
{"dataloader_idx": 0}
if callable(model_fx) and is_param_in_hook_signature(model_fx, "dataloader_idx", explicit=True)
else {}
)
self.trainer._call_callback_hooks("on_train_batch_end", batch_end_outputs, batch, batch_idx, **extra_kwargs)
self.trainer._call_lightning_module_hook(
"on_train_batch_end", batch_end_outputs, batch, batch_idx, **extra_kwargs
)
self.trainer._call_callback_hooks("on_train_batch_end", batch_end_outputs, batch, batch_idx)
self.trainer._call_lightning_module_hook("on_train_batch_end", batch_end_outputs, batch, batch_idx)
self.trainer._call_callback_hooks("on_batch_end")
self.trainer._logger_connector.on_batch_end()

Expand Down
10 changes: 5 additions & 5 deletions pytorch_lightning/strategies/deepspeed.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,8 @@ def __init__(
offload_params_device: str = "cpu",
nvme_path: str = "/local_nvme",
params_buffer_count: int = 5,
params_buffer_size: int = 1e8,
max_in_cpu: int = 1e9,
params_buffer_size: int = 100_000_000,
max_in_cpu: int = 1_000_000_000,
offload_optimizer_device: str = "cpu",
optimizer_buffer_count: int = 4,
block_size: int = 1048576,
Expand All @@ -112,13 +112,13 @@ def __init__(
overlap_events: bool = True,
thread_count: int = 1,
pin_memory: bool = False,
sub_group_size: int = 1e12,
sub_group_size: int = 1_000_000_000_000,
contiguous_gradients: bool = True,
overlap_comm: bool = True,
allgather_partitions: bool = True,
reduce_scatter: bool = True,
allgather_bucket_size: int = 2e8,
reduce_bucket_size: int = 2e8,
allgather_bucket_size: int = 200_000_000,
reduce_bucket_size: int = 200_000_000,
zero_allow_untested_optimizer: bool = True,
logging_batch_size_per_gpu: Union[str, int] = "auto",
config: Optional[Union[Path, str, dict]] = None,
Expand Down
2 changes: 1 addition & 1 deletion pytorch_lightning/strategies/fully_sharded.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def __init__(
fp32_reduce_scatter: Optional[bool] = None,
compute_dtype: Optional[torch.dtype] = None,
bucket_cap_mb: int = 25,
min_num_params: int = 1e8,
min_num_params: int = 100_000_000,
state_dict_to_cpu: bool = True,
parallel_devices: Optional[List[torch.device]] = None,
cluster_environment: Optional[ClusterEnvironment] = None,
Expand Down
Loading

0 comments on commit b8c09d8

Please sign in to comment.