From e320a5d581a350010287c5ed5e9ef01d147b06e1 Mon Sep 17 00:00:00 2001 From: Kaushik B <kaushikbokka@gmail.com> Date: Thu, 24 Mar 2022 16:28:38 +0530 Subject: [PATCH 01/14] Update Plugins doc --- docs/source/extensions/plugins.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/source/extensions/plugins.rst b/docs/source/extensions/plugins.rst index 252fb47570fc4..d81ed52fae504 100644 --- a/docs/source/extensions/plugins.rst +++ b/docs/source/extensions/plugins.rst @@ -74,6 +74,20 @@ Precision Plugins IPUPrecisionPlugin +CheckpointIO Plugins +-------------------- + +.. currentmodule:: pytorch_lightning.plugins.io + +.. autosummary:: + :nosignatures: + :template: classtemplate.rst + + CheckpointIO + TorchCheckpointIO + XLACheckpointIO + + Cluster Environments -------------------- From 95e1ffc218e574bb4f800416c27420d0a7019af9 Mon Sep 17 00:00:00 2001 From: Kaushik B <kaushikbokka@gmail.com> Date: Thu, 24 Mar 2022 16:35:23 +0530 Subject: [PATCH 02/14] Update Plugins doc --- docs/source/extensions/plugins.rst | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/docs/source/extensions/plugins.rst b/docs/source/extensions/plugins.rst index d81ed52fae504..51d65931804ee 100644 --- a/docs/source/extensions/plugins.rst +++ b/docs/source/extensions/plugins.rst @@ -6,8 +6,8 @@ Plugins .. include:: ../links.rst -Plugins allow custom integrations to the internals of the Trainer such as a custom precision or -distributed implementation. +Plugins allow custom integrations to the internals of the Trainer such as a custom precision, checkpointing or +cluster environment implementation. Under the hood, the Lightning Trainer is using plugins in the training routine, added automatically depending on the provided Trainer arguments. 
For example: @@ -27,22 +27,11 @@ We expose Accelerators and Plugins mainly for expert users that want to extend L `PyTorch <https://pytorch.org/docs/stable/distributed.html#backends>`_ itself) - Clusters (e.g. customized access to the cluster's environment interface) -There are two types of Plugins in Lightning with different responsibilities: +There are three types of Plugins in Lightning with different responsibilities: -Strategy --------- - -- Launching and teardown of training processes (if applicable) -- Setup communication between processes (NCCL, GLOO, MPI, ...) -- Provide a unified communication interface for reduction, broadcast, etc. -- Provide access to the wrapped LightningModule - - -Furthermore, for multi-node training Lightning provides cluster environment plugins that allow the advanced user -to configure Lightning to integrate with a :ref:`custom-cluster`. - - -.. image:: ../_static/images/accelerator/overview.svg +- Precision Plugins +- CheckpointIO Plugins +- Cluster Environments (e.g. customized access to the cluster's environment interface) The full list of built-in plugins is listed below. From 83c2deffa419f82e535bb0065f137edf898252eb Mon Sep 17 00:00:00 2001 From: Kaushik B <kaushikbokka@gmail.com> Date: Thu, 24 Mar 2022 17:36:03 +0530 Subject: [PATCH 03/14] Update CheckpointIO --- docs/source/extensions/plugins.rst | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/docs/source/extensions/plugins.rst b/docs/source/extensions/plugins.rst index 51d65931804ee..2cfbefbf4a57e 100644 --- a/docs/source/extensions/plugins.rst +++ b/docs/source/extensions/plugins.rst @@ -29,6 +29,17 @@ We expose Accelerators and Plugins mainly for expert users that want to extend L There are three types of Plugins in Lightning with different responsibilities: +Precision Plugins +----------------- + +We expose precision plugins for the users + + +.. 
code-block:: python + + # precision: FP16Plugin + trainer = Trainer(precision=16) + - Precision Plugins - CheckpointIO Plugins - Cluster Environments (e.g. customized access to the cluster's environment interface) @@ -66,6 +77,11 @@ Precision Plugins CheckpointIO Plugins -------------------- +As part of our commitment to extensibility, we have abstracted Lightning's checkpointing logic into the :class:`~pytorch_lightning.plugins.io.CheckpointIO` plugin. +With this, users have the ability to customize the checkpointing logic to match the needs of their infrastructure. + +Below is a list of built-in plugins for checkpointing. + .. currentmodule:: pytorch_lightning.plugins.io .. autosummary:: @@ -76,6 +92,7 @@ CheckpointIO Plugins TorchCheckpointIO XLACheckpointIO +You could learn more about custom checkpointing with Lightning :ref:`here <../common/checkpointing:Customize Checkpointing>`. Cluster Environments -------------------- From de8ed1537bdd58c589e10a33937166aa4dc32f31 Mon Sep 17 00:00:00 2001 From: Kaushik B <kaushikbokka@gmail.com> Date: Tue, 29 Mar 2022 10:55:13 +0530 Subject: [PATCH 04/14] Update plugins doc --- docs/source/common/checkpointing.rst | 2 +- docs/source/extensions/plugins.rst | 41 +++++++--------------------- 2 files changed, 11 insertions(+), 32 deletions(-) diff --git a/docs/source/common/checkpointing.rst b/docs/source/common/checkpointing.rst index 73496052793b6..f6d0aa329363e 100644 --- a/docs/source/common/checkpointing.rst +++ b/docs/source/common/checkpointing.rst @@ -392,7 +392,7 @@ Custom Checkpoint IO Plugin .. note:: - Some ``TrainingTypePlugins`` like ``DeepSpeedStrategy`` do not support custom ``CheckpointIO`` as checkpointing logic is not modifiable. + Some strategies like :class:`~pytorch_lightning.strategies.deepspeed.DeepSpeedStrategy` do not support custom :class:`~pytorch_lightning.plugins.io.checkpoint_plugin.CheckpointIO` as checkpointing logic is not modifiable. 
----------- diff --git a/docs/source/extensions/plugins.rst b/docs/source/extensions/plugins.rst index 2cfbefbf4a57e..ee214eed177ce 100644 --- a/docs/source/extensions/plugins.rst +++ b/docs/source/extensions/plugins.rst @@ -10,50 +10,27 @@ Plugins allow custom integrations to the internals of the Trainer such as a cust cluster environment implementation. Under the hood, the Lightning Trainer is using plugins in the training routine, added automatically -depending on the provided Trainer arguments. For example: +depending on the provided Trainer arguments. -.. code-block:: python - - # accelerator: GPUAccelerator - # training strategy: DDPStrategy - # precision: NativeMixedPrecisionPlugin - trainer = Trainer(accelerator="gpu", devices=4, precision=16) - - -We expose Accelerators and Plugins mainly for expert users that want to extend Lightning for: +There are three types of Plugins in Lightning with different responsibilities: -- New hardware (like TPU plugin) -- Distributed backends (e.g. a backend not yet supported by - `PyTorch <https://pytorch.org/docs/stable/distributed.html#backends>`_ itself) -- Clusters (e.g. customized access to the cluster's environment interface) +- Precision Plugins +- CheckpointIO Plugins +- Cluster Environments (e.g. customized access to the cluster's environment interface) -There are three types of Plugins in Lightning with different responsibilities: Precision Plugins ----------------- -We expose precision plugins for the users - +We provide precision plugins for the users so that they can benefit from numerical representations with lower precision than +32-bit floating-point or higher precision, such as 64-bit floating-point. .. code-block:: python # precision: FP16Plugin trainer = Trainer(precision=16) -- Precision Plugins -- CheckpointIO Plugins -- Cluster Environments (e.g. customized access to the cluster's environment interface) - - -The full list of built-in plugins is listed below. - - -.. 
warning:: The Plugin API is in beta and subject to change. - For help setting up custom plugins/accelerators, please reach out to us at **support@pytorchlightning.ai** - - -Precision Plugins ------------------ +The full list of built-in precision plugins is listed below. .. currentmodule:: pytorch_lightning.plugins.precision @@ -97,6 +74,8 @@ You could learn more about custom checkpointing with Lightning :ref:`here <../co Cluster Environments -------------------- +Clusters (e.g. customized access to the cluster's environment interface) + .. currentmodule:: pytorch_lightning.plugins.environments .. autosummary:: From 426cb06370d091530e8d387b77e53ef00150630d Mon Sep 17 00:00:00 2001 From: Kaushik B <kaushikbokka@gmail.com> Date: Tue, 29 Mar 2022 11:13:32 +0530 Subject: [PATCH 05/14] Update plugins doc --- docs/source/extensions/plugins.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/source/extensions/plugins.rst b/docs/source/extensions/plugins.rst index ee214eed177ce..3dfe6628b6152 100644 --- a/docs/source/extensions/plugins.rst +++ b/docs/source/extensions/plugins.rst @@ -16,7 +16,7 @@ There are three types of Plugins in Lightning with different responsibilities: - Precision Plugins - CheckpointIO Plugins -- Cluster Environments (e.g. customized access to the cluster's environment interface) +- Cluster Environments Precision Plugins @@ -27,7 +27,7 @@ We provide precision plugins for the users so that they can benefit from numeric .. code-block:: python - # precision: FP16Plugin + # Training with 16-bit precision trainer = Trainer(precision=16) The full list of built-in precision plugins is listed below. @@ -50,6 +50,7 @@ The full list of built-in precision plugins is listed below. 
FullyShardedNativeMixedPrecisionPlugin IPUPrecisionPlugin +More information regarding precision with Lightning can be found :doc:`here <../advanced/precision>` CheckpointIO Plugins -------------------- @@ -74,7 +75,7 @@ You could learn more about custom checkpointing with Lightning :ref:`here <../co Cluster Environments -------------------- -Clusters (e.g. customized access to the cluster's environment interface) +Users can define the interface of their own cluster environment based on the requirements of their infrastructure. .. currentmodule:: pytorch_lightning.plugins.environments From 3249d1d7fc743e707d3eaa1faa60458a06d089db Mon Sep 17 00:00:00 2001 From: Kaushik B <kaushikbokka@gmail.com> Date: Thu, 24 Mar 2022 16:28:38 +0530 Subject: [PATCH 06/14] Update Plugins doc --- docs/source/extensions/plugins.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/source/extensions/plugins.rst b/docs/source/extensions/plugins.rst index 3bfa7ad24b29c..dc9261b863b6b 100644 --- a/docs/source/extensions/plugins.rst +++ b/docs/source/extensions/plugins.rst @@ -75,6 +75,20 @@ Precision Plugins TPUPrecisionPlugin +CheckpointIO Plugins +-------------------- + +.. currentmodule:: pytorch_lightning.plugins.io + +.. autosummary:: + :nosignatures: + :template: classtemplate.rst + + CheckpointIO + TorchCheckpointIO + XLACheckpointIO + + Cluster Environments -------------------- From dd575daf8c8f79cf5438c9d75bfc9f298acfca14 Mon Sep 17 00:00:00 2001 From: Kaushik B <kaushikbokka@gmail.com> Date: Thu, 24 Mar 2022 16:35:23 +0530 Subject: [PATCH 07/14] Update Plugins doc --- docs/source/extensions/plugins.rst | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/docs/source/extensions/plugins.rst b/docs/source/extensions/plugins.rst index dc9261b863b6b..b03b7e51bbc89 100644 --- a/docs/source/extensions/plugins.rst +++ b/docs/source/extensions/plugins.rst @@ -6,8 +6,8 @@ Plugins .. 
include:: ../links.rst -Plugins allow custom integrations to the internals of the Trainer such as a custom precision or -distributed implementation. +Plugins allow custom integrations to the internals of the Trainer such as a custom precision, checkpointing or +cluster environment implementation. Under the hood, the Lightning Trainer is using plugins in the training routine, added automatically depending on the provided Trainer arguments. For example: @@ -27,22 +27,11 @@ We expose Accelerators and Plugins mainly for expert users that want to extend L `PyTorch <https://pytorch.org/docs/stable/distributed.html#backends>`_ itself) - Clusters (e.g. customized access to the cluster's environment interface) -There are two types of Plugins in Lightning with different responsibilities: +There are three types of Plugins in Lightning with different responsibilities: -Strategy --------- - -- Launching and teardown of training processes (if applicable) -- Setup communication between processes (NCCL, GLOO, MPI, ...) -- Provide a unified communication interface for reduction, broadcast, etc. -- Provide access to the wrapped LightningModule - - -Furthermore, for multi-node training Lightning provides cluster environment plugins that allow the advanced user -to configure Lightning to integrate with a :ref:`custom-cluster`. - - -.. image:: ../_static/images/accelerator/overview.svg +- Precision Plugins +- CheckpointIO Plugins +- Cluster Environments (e.g. customized access to the cluster's environment interface) The full list of built-in plugins is listed below. 
From e0d2595133e879632c41affbfcf12a7fca7d40ed Mon Sep 17 00:00:00 2001 From: Kaushik B <kaushikbokka@gmail.com> Date: Thu, 24 Mar 2022 17:36:03 +0530 Subject: [PATCH 08/14] Update CheckpointIO --- docs/source/extensions/plugins.rst | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/docs/source/extensions/plugins.rst b/docs/source/extensions/plugins.rst index b03b7e51bbc89..a71e747659cb9 100644 --- a/docs/source/extensions/plugins.rst +++ b/docs/source/extensions/plugins.rst @@ -29,6 +29,17 @@ We expose Accelerators and Plugins mainly for expert users that want to extend L There are three types of Plugins in Lightning with different responsibilities: +Precision Plugins +----------------- + +We expose precision plugins for the users + + +.. code-block:: python + + # precision: FP16Plugin + trainer = Trainer(precision=16) + - Precision Plugins - CheckpointIO Plugins - Cluster Environments (e.g. customized access to the cluster's environment interface) @@ -67,6 +78,11 @@ Precision Plugins CheckpointIO Plugins -------------------- +As part of our commitment to extensibility, we have abstracted Lightning's checkpointing logic into the :class:`~pytorch_lightning.plugins.io.CheckpointIO` plugin. +With this, users have the ability to customize the checkpointing logic to match the needs of their infrastructure. + +Below is a list of built-in plugins for checkpointing. + .. currentmodule:: pytorch_lightning.plugins.io .. autosummary:: @@ -77,6 +93,7 @@ CheckpointIO Plugins TorchCheckpointIO XLACheckpointIO +You could learn more about custom checkpointing with Lightning :ref:`here <../common/checkpointing:Customize Checkpointing>`. 
Cluster Environments -------------------- From 622b3126964186783302f96f1888778712b0cca5 Mon Sep 17 00:00:00 2001 From: Kaushik B <kaushikbokka@gmail.com> Date: Tue, 29 Mar 2022 10:55:13 +0530 Subject: [PATCH 09/14] Update plugins doc --- docs/source/common/checkpointing.rst | 2 +- docs/source/extensions/plugins.rst | 41 +++++++--------------------- 2 files changed, 11 insertions(+), 32 deletions(-) diff --git a/docs/source/common/checkpointing.rst b/docs/source/common/checkpointing.rst index 2371964d1f278..b08a77040df53 100644 --- a/docs/source/common/checkpointing.rst +++ b/docs/source/common/checkpointing.rst @@ -392,7 +392,7 @@ Custom Checkpoint IO Plugin .. note:: - Some ``TrainingTypePlugins`` like ``DeepSpeedStrategy`` do not support custom ``CheckpointIO`` as checkpointing logic is not modifiable. + Some strategies like :class:`~pytorch_lightning.strategies.deepspeed.DeepSpeedStrategy` do not support custom :class:`~pytorch_lightning.plugins.io.checkpoint_plugin.CheckpointIO` as checkpointing logic is not modifiable. ----------- diff --git a/docs/source/extensions/plugins.rst b/docs/source/extensions/plugins.rst index a71e747659cb9..c41c540d6719f 100644 --- a/docs/source/extensions/plugins.rst +++ b/docs/source/extensions/plugins.rst @@ -10,50 +10,27 @@ Plugins allow custom integrations to the internals of the Trainer such as a cust cluster environment implementation. Under the hood, the Lightning Trainer is using plugins in the training routine, added automatically -depending on the provided Trainer arguments. For example: +depending on the provided Trainer arguments. -.. 
code-block:: python - - # accelerator: GPUAccelerator - # training strategy: DDPStrategy - # precision: NativeMixedPrecisionPlugin - trainer = Trainer(accelerator="gpu", devices=4, precision=16) - - -We expose Accelerators and Plugins mainly for expert users that want to extend Lightning for: +There are three types of Plugins in Lightning with different responsibilities: -- New hardware (like TPU plugin) -- Distributed backends (e.g. a backend not yet supported by - `PyTorch <https://pytorch.org/docs/stable/distributed.html#backends>`_ itself) -- Clusters (e.g. customized access to the cluster's environment interface) +- Precision Plugins +- CheckpointIO Plugins +- Cluster Environments (e.g. customized access to the cluster's environment interface) -There are three types of Plugins in Lightning with different responsibilities: Precision Plugins ----------------- -We expose precision plugins for the users - +We provide precision plugins for the users so that they can benefit from numerical representations with lower precision than +32-bit floating-point or higher precision, such as 64-bit floating-point. .. code-block:: python # precision: FP16Plugin trainer = Trainer(precision=16) -- Precision Plugins -- CheckpointIO Plugins -- Cluster Environments (e.g. customized access to the cluster's environment interface) - - -The full list of built-in plugins is listed below. - - -.. warning:: The Plugin API is in beta and subject to change. - For help setting up custom plugins/accelerators, please reach out to us at **support@pytorchlightning.ai** - - -Precision Plugins ------------------ +The full list of built-in precision plugins is listed below. .. currentmodule:: pytorch_lightning.plugins.precision @@ -98,6 +75,8 @@ You could learn more about custom checkpointing with Lightning :ref:`here <../co Cluster Environments -------------------- +Clusters (e.g. customized access to the cluster's environment interface) + .. 
currentmodule:: pytorch_lightning.plugins.environments .. autosummary:: From 4aa05f35e5d4853438a3e3711bd9dd2feec6a834 Mon Sep 17 00:00:00 2001 From: Kaushik B <kaushikbokka@gmail.com> Date: Tue, 29 Mar 2022 11:13:32 +0530 Subject: [PATCH 10/14] Update plugins doc --- docs/source/extensions/plugins.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/source/extensions/plugins.rst b/docs/source/extensions/plugins.rst index c41c540d6719f..40ea79e2c6c0e 100644 --- a/docs/source/extensions/plugins.rst +++ b/docs/source/extensions/plugins.rst @@ -16,7 +16,7 @@ There are three types of Plugins in Lightning with different responsibilities: - Precision Plugins - CheckpointIO Plugins -- Cluster Environments (e.g. customized access to the cluster's environment interface) +- Cluster Environments Precision Plugins @@ -27,7 +27,7 @@ We provide precision plugins for the users so that they can benefit from numeric .. code-block:: python - # precision: FP16Plugin + # Training with 16-bit precision trainer = Trainer(precision=16) The full list of built-in precision plugins is listed below. @@ -51,6 +51,7 @@ The full list of built-in precision plugins is listed below. TPUBf16PrecisionPlugin TPUPrecisionPlugin +More information regarding precision with Lightning can be found :doc:`here <../advanced/precision>` CheckpointIO Plugins -------------------- @@ -75,7 +76,7 @@ You could learn more about custom checkpointing with Lightning :ref:`here <../co Cluster Environments -------------------- -Clusters (e.g. customized access to the cluster's environment interface) +Users can define the interface of their own cluster environment based on the requirements of their infrastructure. .. 
currentmodule:: pytorch_lightning.plugins.environments From 6ce3a1e47511c32b708ae87c3a861b9984cf13d2 Mon Sep 17 00:00:00 2001 From: rohitgr7 <rohitgr1998@gmail.com> Date: Tue, 29 Mar 2022 13:49:28 +0530 Subject: [PATCH 11/14] add tag --- docs/source/advanced/model_parallel.rst | 10 ++-------- docs/source/common/checkpointing.rst | 1 + docs/source/common/lightning_module.rst | 2 +- docs/source/extensions/plugins.rst | 2 +- 4 files changed, 5 insertions(+), 10 deletions(-) diff --git a/docs/source/advanced/model_parallel.rst b/docs/source/advanced/model_parallel.rst index 18c83bde743c8..5cf7556be1efd 100644 --- a/docs/source/advanced/model_parallel.rst +++ b/docs/source/advanced/model_parallel.rst @@ -296,7 +296,6 @@ Below we show an example of running `ZeRO-Offload <https://www.deepspeed.ai/tuto .. code-block:: python from pytorch_lightning import Trainer - from pytorch_lightning.strategies import DeepSpeedStrategy model = MyModel() trainer = Trainer(accelerator="gpu", devices=4, strategy="deepspeed_stage_2_offload", precision=16) @@ -341,7 +340,6 @@ For even more speed benefit, DeepSpeed offers an optimized CPU version of ADAM c import pytorch_lightning from pytorch_lightning import Trainer - from pytorch_lightning.strategies import DeepSpeedStrategy from deepspeed.ops.adam import DeepSpeedCPUAdam @@ -385,7 +383,6 @@ Also please have a look at our :ref:`deepspeed-zero-stage-3-tips` which contains .. code-block:: python from pytorch_lightning import Trainer - from pytorch_lightning.strategies import DeepSpeedStrategy from deepspeed.ops.adam import FusedAdam @@ -409,7 +406,6 @@ You can also use the Lightning Trainer to run predict or evaluate with DeepSpeed .. 
code-block:: python from pytorch_lightning import Trainer - from pytorch_lightning.strategies import DeepSpeedStrategy class MyModel(pl.LightningModule): @@ -435,7 +431,6 @@ This reduces the time taken to initialize very large models, as well as ensure w import torch.nn as nn from pytorch_lightning import Trainer - from pytorch_lightning.strategies import DeepSpeedStrategy from deepspeed.ops.adam import FusedAdam @@ -549,7 +544,6 @@ This saves memory when training larger models, however requires using a checkpoi .. code-block:: python from pytorch_lightning import Trainer - from pytorch_lightning.strategies import DeepSpeedStrategy import deepspeed @@ -686,7 +680,7 @@ In some cases you may want to define your own DeepSpeed Config, to access all pa } model = MyModel() - trainer = Trainer(accelerator="gpu", devices=4, strategy=DeepSpeedStrategy(deepspeed_config), precision=16) + trainer = Trainer(accelerator="gpu", devices=4, strategy=DeepSpeedStrategy(config=deepspeed_config), precision=16) trainer.fit(model) @@ -699,7 +693,7 @@ We support taking the config as a json formatted file: model = MyModel() trainer = Trainer( - accelerator="gpu", devices=4, strategy=DeepSpeedStrategy("/path/to/deepspeed_config.json"), precision=16 + accelerator="gpu", devices=4, strategy=DeepSpeedStrategy(config="/path/to/deepspeed_config.json"), precision=16 ) trainer.fit(model) diff --git a/docs/source/common/checkpointing.rst b/docs/source/common/checkpointing.rst index b08a77040df53..31824e828cc7d 100644 --- a/docs/source/common/checkpointing.rst +++ b/docs/source/common/checkpointing.rst @@ -315,6 +315,7 @@ and the Lightning Team will be happy to integrate/help integrate it. ----------- +.. 
_customize_checkpointing: *********************** Customize Checkpointing diff --git a/docs/source/common/lightning_module.rst b/docs/source/common/lightning_module.rst index 935e788310d7c..fd9de11f601d8 100644 --- a/docs/source/common/lightning_module.rst +++ b/docs/source/common/lightning_module.rst @@ -1056,7 +1056,7 @@ automatic_optimization When set to ``False``, Lightning does not automate the optimization process. This means you are responsible for handling your optimizers. However, we do take care of precision and any accelerators used. -See :ref:`manual optimization<common/optimization:Manual optimization>` for details. +See :ref:`manual optimization <common/optimization:Manual optimization>` for details. .. code-block:: python diff --git a/docs/source/extensions/plugins.rst b/docs/source/extensions/plugins.rst index 40ea79e2c6c0e..11468e86e9d2e 100644 --- a/docs/source/extensions/plugins.rst +++ b/docs/source/extensions/plugins.rst @@ -71,7 +71,7 @@ Below is a list of built-in plugins for checkpointing. TorchCheckpointIO XLACheckpointIO -You could learn more about custom checkpointing with Lightning :ref:`here <../common/checkpointing:Customize Checkpointing>`. +You could learn more about custom checkpointing with Lightning :ref:`here <customize_checkpointing>`. 
Cluster Environments -------------------- From 9f63704e7f619284b035d2e8524904d3773f13af Mon Sep 17 00:00:00 2001 From: rohitgr7 <rohitgr1998@gmail.com> Date: Tue, 29 Mar 2022 13:57:44 +0530 Subject: [PATCH 12/14] add missing plugins and fix structure --- docs/source/extensions/plugins.rst | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/docs/source/extensions/plugins.rst b/docs/source/extensions/plugins.rst index 11468e86e9d2e..4537582c645e5 100644 --- a/docs/source/extensions/plugins.rst +++ b/docs/source/extensions/plugins.rst @@ -19,8 +19,9 @@ There are three types of Plugins in Lightning with different responsibilities: - Cluster Environments +***************** Precision Plugins ------------------ +***************** We provide precision plugins for the users so that they can benefit from numerical representations with lower precision than 32-bit floating-point or higher precision, such as 64-bit floating-point. @@ -53,8 +54,13 @@ The full list of built-in precision plugins is listed below. More information regarding precision with Lightning can be found :doc:`here <../advanced/precision>` + +----------- + + +******************** CheckpointIO Plugins --------------------- +******************** As part of our commitment to extensibility, we have abstracted Lightning's checkpointing logic into the :class:`~pytorch_lightning.plugins.io.CheckpointIO` plugin. With this, users have the ability to customize the checkpointing logic to match the needs of their infrastructure. @@ -68,13 +74,19 @@ Below is a list of built-in plugins for checkpointing. :template: classtemplate.rst CheckpointIO + HPUCheckpointIO TorchCheckpointIO XLACheckpointIO You could learn more about custom checkpointing with Lightning :ref:`here <customize_checkpointing>`. 
+ +----------- + + +******************** Cluster Environments --------------------- +******************** Users can define the interface of their own cluster environment based on the requirements of their infrastructure. @@ -85,8 +97,8 @@ Users can define the interface of their own cluster environment based on the req :template: classtemplate.rst ClusterEnvironment + KubeflowEnvironment LightningEnvironment LSFEnvironment - TorchElasticEnvironment - KubeflowEnvironment SLURMEnvironment + TorchElasticEnvironment From 16da186dad57ec89e411d920a065463d7958647d Mon Sep 17 00:00:00 2001 From: Rohit Gupta <rohitgr1998@gmail.com> Date: Tue, 29 Mar 2022 14:05:25 +0530 Subject: [PATCH 13/14] Apply suggestions from code review --- docs/source/extensions/plugins.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/extensions/plugins.rst b/docs/source/extensions/plugins.rst index 4537582c645e5..227cc24254b89 100644 --- a/docs/source/extensions/plugins.rst +++ b/docs/source/extensions/plugins.rst @@ -6,7 +6,7 @@ Plugins .. include:: ../links.rst -Plugins allow custom integrations to the internals of the Trainer such as a custom precision, checkpointing or +Plugins allow custom integrations to the internals of the Trainer such as custom precision, checkpointing or cluster environment implementation. Under the hood, the Lightning Trainer is using plugins in the training routine, added automatically @@ -23,7 +23,7 @@ There are three types of Plugins in Lightning with different responsibilities: Precision Plugins ***************** -We provide precision plugins for the users so that they can benefit from numerical representations with lower precision than +We provide precision plugins so that you can benefit from numerical representations with lower precision than 32-bit floating-point or higher precision, such as 64-bit floating-point. .. 
code-block:: python @@ -63,7 +63,7 @@ CheckpointIO Plugins ******************** As part of our commitment to extensibility, we have abstracted Lightning's checkpointing logic into the :class:`~pytorch_lightning.plugins.io.CheckpointIO` plugin. -With this, users have the ability to customize the checkpointing logic to match the needs of their infrastructure. +With this, you have the ability to customize the checkpointing logic to match the needs of your infrastructure. Below is a list of built-in plugins for checkpointing. @@ -88,7 +88,7 @@ You could learn more about custom checkpointing with Lightning :ref:`here <custo Cluster Environments ******************** -Users can define the interface of their own cluster environment based on the requirements of their infrastructure. +You can define the interface of your own cluster environment based on the requirements of your infrastructure. .. currentmodule:: pytorch_lightning.plugins.environments From f27f13087f5e29761d59a9e95b11a3ac4897ff03 Mon Sep 17 00:00:00 2001 From: Kaushik B <kaushikbokka@gmail.com> Date: Tue, 29 Mar 2022 15:15:08 +0530 Subject: [PATCH 14/14] Replace training type instances with strategy --- docs/source/advanced/training_tricks.rst | 3 +-- docs/source/common/trainer.rst | 4 ++-- docs/source/starter/lightning_lite.rst | 2 +- pytorch_lightning/loops/optimization/optimizer_loop.py | 2 +- pytorch_lightning/strategies/strategy.py | 3 +-- pytorch_lightning/trainer/trainer.py | 10 +++++----- tests/models/test_amp.py | 4 ++-- tests/strategies/test_ddp_spawn_strategy.py | 2 +- tests/strategies/test_ddp_strategy.py | 2 +- tests/trainer/test_trainer.py | 2 +- 10 files changed, 16 insertions(+), 18 deletions(-) diff --git a/docs/source/advanced/training_tricks.rst b/docs/source/advanced/training_tricks.rst index ddfc72e24ef75..103eb34c3fe8e 100644 --- a/docs/source/advanced/training_tricks.rst +++ b/docs/source/advanced/training_tricks.rst @@ -331,8 +331,7 @@ However, for in-memory datasets, that means 
that each process will hold a (redun For example, when training Graph Neural Networks, a common strategy is to load the entire graph into CPU memory for fast access to the entire graph structure and its features, and to then perform neighbor sampling to obtain mini-batches that fit onto the GPU. A simple way to prevent redundant dataset replicas is to rely on :obj:`torch.multiprocessing` to share the `data automatically between spawned processes via shared memory <https://pytorch.org/docs/stable/notes/multiprocessing.html>`_. -For this, all data pre-loading should be done on the main process inside :meth:`DataModule.__init__`. As a result, all tensor-data will get automatically shared when using the :class:`~pytorch_lightning.plugins.strategies.ddp_spawn.DDPSpawnStrategy` -training type strategy: +For this, all data pre-loading should be done on the main process inside :meth:`DataModule.__init__`. As a result, all tensor-data will get automatically shared when using the :class:`~pytorch_lightning.strategies.ddp_spawn.DDPSpawnStrategy` strategy. .. warning:: diff --git a/docs/source/common/trainer.rst b/docs/source/common/trainer.rst index 7b00832846d4b..5f1bd9dc2fa93 100644 --- a/docs/source/common/trainer.rst +++ b/docs/source/common/trainer.rst @@ -1448,7 +1448,7 @@ checkpoint, training will start from the beginning of the next epoch. strategy ^^^^^^^^ -Supports passing different training strategies with aliases (ddp, ddp_spawn, etc) as well as custom training type plugins. +Supports passing different training strategies with aliases (ddp, ddp_spawn, etc) as well as custom strategies. .. code-block:: python @@ -1458,7 +1458,7 @@ Supports passing different training strategies with aliases (ddp, ddp_spawn, etc # Training with the DDP Spawn strategy using 4 cpu processes trainer = Trainer(strategy="ddp_spawn", accelerator="cpu", devices=4) -.. note:: Additionally, you can pass your custom training type plugins to the ``strategy`` argument. +.. 
note:: Additionally, you can pass your custom strategy to the ``strategy`` argument. .. code-block:: python diff --git a/docs/source/starter/lightning_lite.rst b/docs/source/starter/lightning_lite.rst index 2a838d75a4fa4..860bd60511efd 100644 --- a/docs/source/starter/lightning_lite.rst +++ b/docs/source/starter/lightning_lite.rst @@ -387,7 +387,7 @@ Choose a training strategy: ``"dp"``, ``"ddp"``, ``"ddp_spawn"``, ``"tpu_spawn"` lite = Lite(strategy="ddp_spawn", accelerator="cpu", devices=4) -Additionally, you can pass in your custom training type strategy by configuring additional parameters. +Additionally, you can pass in your custom strategy by configuring additional parameters. .. code-block:: python diff --git a/pytorch_lightning/loops/optimization/optimizer_loop.py b/pytorch_lightning/loops/optimization/optimizer_loop.py index bab025466789a..f9068b87b653d 100644 --- a/pytorch_lightning/loops/optimization/optimizer_loop.py +++ b/pytorch_lightning/loops/optimization/optimizer_loop.py @@ -235,7 +235,7 @@ def _run_optimization( closure = self._make_closure(split_batch, batch_idx, opt_idx, optimizer) if ( - # when the training type plugin handles accumulation, we want to always call the optimizer step + # when the strategy handles accumulation, we want to always call the optimizer step not self.trainer.strategy.handles_gradient_accumulation and self.trainer.fit_loop._should_accumulate() ): diff --git a/pytorch_lightning/strategies/strategy.py b/pytorch_lightning/strategies/strategy.py index db33c4ec72d72..87c5c171d0ece 100644 --- a/pytorch_lightning/strategies/strategy.py +++ b/pytorch_lightning/strategies/strategy.py @@ -40,8 +40,7 @@ class Strategy(ABC): - """Base class for all training type plugins that change the behaviour of the training, validation and test- - loop.""" + """Base class for all strategies that change the behaviour of the training, validation and test loops.""" def __init__( self, diff --git a/pytorch_lightning/trainer/trainer.py
b/pytorch_lightning/trainer/trainer.py index 53b16af117e34..c0ea6f6f38dbd 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -401,7 +401,7 @@ def __init__( Please pass the path to ``Trainer.fit(..., ckpt_path=...)`` instead. strategy: Supports different training strategies with aliases - as well custom training type plugins. + as well as custom strategies. Default: ``None``. sync_batchnorm: Synchronize batch norm layers between process groups/whole world. @@ -1152,7 +1152,7 @@ def _run( if hasattr(model, "hparams"): parsing.clean_namespace(model.hparams) - # attach model to the training type plugin + # attach model to the strategy self.strategy.connect(model) self._callback_connector._attach_model_callbacks() @@ -2035,17 +2035,17 @@ def global_rank(self) -> int: @property def local_rank(self) -> int: - # some training types define a local rank + # some strategies define a local rank return getattr(self.strategy, "local_rank", 0) @property def node_rank(self) -> int: - # some training types define a node rank + # some strategies define a node rank return getattr(self.strategy, "node_rank", 0) @property def world_size(self) -> int: - # some training types define a world size + # some strategies define a world size return getattr(self.strategy, "world_size", 1) @property diff --git a/tests/models/test_amp.py b/tests/models/test_amp.py index 3fb42fb0ce29e..0130270a5ac78 100644 --- a/tests/models/test_amp.py +++ b/tests/models/test_amp.py @@ -79,7 +79,7 @@ def _assert_autocast_enabled(self): @pytest.mark.parametrize("precision", [16, "bf16"]) @pytest.mark.parametrize("devices", [1, 2]) def test_amp_cpus(tmpdir, strategy, precision, devices): - """Make sure combinations of AMP and training types work if supported.""" + """Make sure combinations of AMP and strategies work if supported.""" tutils.reset_seed() trainer = Trainer( @@ -104,7 +104,7 @@ def test_amp_cpus(tmpdir, strategy, precision, devices):
@pytest.mark.parametrize("precision", [16, "bf16"]) @pytest.mark.parametrize("devices", [1, 2]) def test_amp_gpus(tmpdir, strategy, precision, devices): - """Make sure combinations of AMP and training types work if supported.""" + """Make sure combinations of AMP and strategies work if supported.""" tutils.reset_seed() trainer = Trainer( diff --git a/tests/strategies/test_ddp_spawn_strategy.py b/tests/strategies/test_ddp_spawn_strategy.py index 74ceb08058eb4..c7ce848376e0d 100644 --- a/tests/strategies/test_ddp_spawn_strategy.py +++ b/tests/strategies/test_ddp_spawn_strategy.py @@ -55,7 +55,7 @@ def get_from_queue(self, queue) -> None: def test_ddp_cpu(): """Tests if device is set correctly when training for DDPSpawnStrategy.""" trainer = Trainer(devices=2, accelerator="cpu", fast_dev_run=True) - # assert training type plugin attributes for device setting + # assert strategy attributes for device setting assert isinstance(trainer.strategy, DDPSpawnStrategy) assert trainer.strategy.root_device == torch.device("cpu") diff --git a/tests/strategies/test_ddp_strategy.py b/tests/strategies/test_ddp_strategy.py index d34617b4b2664..3e62c17bc4ecd 100644 --- a/tests/strategies/test_ddp_strategy.py +++ b/tests/strategies/test_ddp_strategy.py @@ -37,7 +37,7 @@ def on_train_start(self) -> None: def test_ddp_with_2_gpus(): """Tests if device is set correctly when training and after teardown for DDPStrategy.""" trainer = Trainer(accelerator="gpu", devices=2, strategy="ddp", fast_dev_run=True) - # assert training type plugin attributes for device setting + # assert strategy attributes for device setting assert isinstance(trainer.strategy, DDPStrategy) local_rank = trainer.strategy.local_rank assert trainer.strategy.root_device == torch.device(f"cuda:{local_rank}") diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 6f76e79bd284c..18083b2868889 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1496,7 +1496,7 @@ def 
write_on_batch_end(self, trainer, pl_module, prediction, batch_indices, *arg def test_spawn_predict_return_predictions(tmpdir): - """Test that `return_predictions=True` raise a MisconfigurationException with spawn training type plugins.""" + """Test that `return_predictions=True` raises a MisconfigurationException with spawn strategies.""" model = BoringModel() trainer = Trainer(default_root_dir=tmpdir, accelerator="cpu", strategy="ddp_spawn", devices=2, fast_dev_run=True) assert isinstance(trainer.strategy, DDPSpawnStrategy)