Skip to content

Commit

Permalink
Fix ddp_spawn -> ddp fallback logic when on LSF cluster (#15657)
Browse files Browse the repository at this point in the history
Co-authored-by: awaelchli <[email protected]>
  • Loading branch information
Atharva-Phatak and awaelchli authored Nov 12, 2022
1 parent 61ee3fa commit cdb7006
Show file tree
Hide file tree
Showing 6 changed files with 81 additions and 35 deletions.
4 changes: 3 additions & 1 deletion src/lightning_lite/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Fixed

-

- Fixed the automatic fallback from `LightningLite(strategy="ddp_spawn", ...)` to `LightningLite(strategy="ddp", ...)` when on an LSF cluster ([#15103](https://github.com/PyTorchLightning/pytorch-lightning/issues/15103))



## [1.8.1] - 2022-11-10
Expand Down
5 changes: 4 additions & 1 deletion src/lightning_lite/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,7 +403,10 @@ def _check_strategy_and_fallback(self) -> None:
strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag

if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and (
TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or SLURMEnvironment.detect()
TorchElasticEnvironment.detect()
or KubeflowEnvironment.detect()
or SLURMEnvironment.detect()
or LSFEnvironment.detect()
):
strategy_flag = "ddp"
if strategy_flag == "dp" and self._accelerator_flag == "cpu":
Expand Down
2 changes: 2 additions & 0 deletions src/pytorch_lightning/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed manual optimization raising `AttributeError` with Bagua Strategy ([#12534](https://github.com/PyTorchLightning/pytorch-lightning/issues/12534))
- Fixed the import of `pytorch_lightning` causing a warning 'Redirects are currently not supported in Windows or MacOs' ([#15610](https://github.com/PyTorchLightning/pytorch-lightning/issues/15610))

- Fixed the automatic fallback from `Trainer(strategy="ddp_spawn", ...)` to `Trainer(strategy="ddp", ...)` when on an LSF cluster ([#15103](https://github.com/PyTorchLightning/pytorch-lightning/issues/15103))


## [1.8.0] - 2022-11-01

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -614,7 +614,10 @@ def _check_strategy_and_fallback(self) -> None:
strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag

if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and (
TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or SLURMEnvironment.detect()
TorchElasticEnvironment.detect()
or KubeflowEnvironment.detect()
or SLURMEnvironment.detect()
or LSFEnvironment.detect()
):
strategy_flag = "ddp"
if strategy_flag == "dp" and self._accelerator_flag == "cpu":
Expand Down
52 changes: 35 additions & 17 deletions tests/tests_lite/test_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from lightning_lite.plugins.environments import (
KubeflowEnvironment,
LightningEnvironment,
LSFEnvironment,
SLURMEnvironment,
TorchElasticEnvironment,
)
Expand Down Expand Up @@ -201,24 +202,41 @@ class Strat(DDPStrategy):
assert connector.strategy is strategy


@mock.patch.dict(
os.environ,
{
"SLURM_NTASKS": "2",
"SLURM_NTASKS_PER_NODE": "1",
"SLURM_JOB_NAME": "SOME_NAME",
"SLURM_NODEID": "0",
"LOCAL_RANK": "0",
"SLURM_PROCID": "0",
"SLURM_LOCALID": "0",
},
@pytest.mark.parametrize(
"env_vars,expected_environment",
[
(
{
"SLURM_NTASKS": "2",
"SLURM_NTASKS_PER_NODE": "1",
"SLURM_JOB_NAME": "SOME_NAME",
"SLURM_NODEID": "0",
"LOCAL_RANK": "0",
"SLURM_PROCID": "0",
"SLURM_LOCALID": "0",
},
SLURMEnvironment,
),
(
{
"LSB_JOBID": "1",
"LSB_DJOB_RANKFILE": "SOME_RANK_FILE",
"JSM_NAMESPACE_LOCAL_RANK": "1",
"JSM_NAMESPACE_SIZE": "20",
"JSM_NAMESPACE_RANK": "1",
},
LSFEnvironment,
),
],
)
@mock.patch("lightning_lite.accelerators.cuda.num_cuda_devices", return_value=0)
def test_dist_backend_accelerator_mapping(*_):
connector = _Connector(strategy="ddp_spawn", accelerator="cpu", devices=2)
assert isinstance(connector.accelerator, CPUAccelerator)
assert isinstance(connector.strategy, DDPStrategy)
assert connector.strategy.local_rank == 0
@mock.patch("lightning_lite.plugins.environments.lsf.LSFEnvironment._read_hosts", return_value=["node0", "node1"])
@mock.patch("lightning_lite.plugins.environments.lsf.LSFEnvironment._get_node_rank", return_value=0)
def test_fallback_from_ddp_spawn_to_ddp_on_cluster(_, __, env_vars, expected_environment):
with mock.patch.dict(os.environ, env_vars, clear=True):
trainer = _Connector(strategy="ddp_spawn", accelerator="cpu", devices=2)
assert isinstance(trainer.accelerator, CPUAccelerator)
assert isinstance(trainer.strategy, DDPStrategy)
assert isinstance(trainer.strategy.cluster_environment, expected_environment)


@RunIf(mps=False)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from lightning_lite.plugins.environments import (
KubeflowEnvironment,
LightningEnvironment,
LSFEnvironment,
SLURMEnvironment,
TorchElasticEnvironment,
)
Expand Down Expand Up @@ -193,24 +194,41 @@ class Strat(DDPStrategy):
assert trainer._accelerator_connector.strategy is strategy


@mock.patch.dict(
os.environ,
{
"SLURM_NTASKS": "2",
"SLURM_NTASKS_PER_NODE": "1",
"SLURM_JOB_NAME": "SOME_NAME",
"SLURM_NODEID": "0",
"LOCAL_RANK": "0",
"SLURM_PROCID": "0",
"SLURM_LOCALID": "0",
},
@pytest.mark.parametrize(
"env_vars,expected_environment",
[
(
{
"SLURM_NTASKS": "2",
"SLURM_NTASKS_PER_NODE": "1",
"SLURM_JOB_NAME": "SOME_NAME",
"SLURM_NODEID": "0",
"LOCAL_RANK": "0",
"SLURM_PROCID": "0",
"SLURM_LOCALID": "0",
},
SLURMEnvironment,
),
(
{
"LSB_JOBID": "1",
"LSB_DJOB_RANKFILE": "SOME_RANK_FILE",
"JSM_NAMESPACE_LOCAL_RANK": "1",
"JSM_NAMESPACE_SIZE": "20",
"JSM_NAMESPACE_RANK": "1",
},
LSFEnvironment,
),
],
)
@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True)
def test_dist_backend_accelerator_mapping(cuda_count_0):
trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", accelerator="cpu", devices=2)
@mock.patch("lightning_lite.plugins.environments.lsf.LSFEnvironment._read_hosts", return_value=["node0", "node1"])
@mock.patch("lightning_lite.plugins.environments.lsf.LSFEnvironment._get_node_rank", return_value=0)
def test_fallback_from_ddp_spawn_to_ddp_on_cluster(_, __, env_vars, expected_environment):
with mock.patch.dict(os.environ, env_vars, clear=True):
trainer = Trainer(strategy="ddp_spawn", accelerator="cpu", devices=2)
assert isinstance(trainer.accelerator, CPUAccelerator)
assert isinstance(trainer.strategy, DDPStrategy)
assert trainer.strategy.local_rank == 0
assert isinstance(trainer.strategy.cluster_environment, expected_environment)


def test_interactive_incompatible_backend_error(mps_count_2, cuda_count_2, monkeypatch):
Expand Down

0 comments on commit cdb7006

Please sign in to comment.