diff --git a/src/lightning_fabric/CHANGELOG.md b/src/lightning_fabric/CHANGELOG.md index 8c9b3084e2151..4c5c406db5747 100644 --- a/src/lightning_fabric/CHANGELOG.md +++ b/src/lightning_fabric/CHANGELOG.md @@ -35,7 +35,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed -- +- Fixed error handling for `accelerator="mps"` and `ddp` strategy pairing ([#16455](https://github.com/Lightning-AI/lightning/pull/16455)) + ## [1.9.0] - 2023-01-17 diff --git a/src/lightning_fabric/connector.py b/src/lightning_fabric/connector.py index 622df289db6aa..cc14d9d0162cd 100644 --- a/src/lightning_fabric/connector.py +++ b/src/lightning_fabric/connector.py @@ -44,6 +44,7 @@ from lightning_fabric.plugins.precision.precision import _PRECISION_INPUT, _PRECISION_INPUT_INT, _PRECISION_INPUT_STR from lightning_fabric.strategies import ( DeepSpeedStrategy, + ParallelStrategy, SingleDeviceStrategy, SingleTPUStrategy, Strategy, @@ -201,6 +202,20 @@ def _check_config_and_set_final_flags( f" Available names are: {', '.join(self._registered_accelerators)}." ) + # MPS accelerator is incompatible with DDP family of strategies. It supports single-device operation only. + is_ddp_str = isinstance(strategy, str) and "ddp" in strategy + is_dp_str = isinstance(strategy, str) and "dp" in strategy + is_deepspeed_str = isinstance(strategy, str) and "deepspeed" in strategy + is_parallel_strategy = isinstance(strategy, ParallelStrategy) or is_ddp_str or is_dp_str or is_deepspeed_str + is_mps_accelerator = MPSAccelerator.is_available() and ( + accelerator in ("mps", "auto", "gpu", None) or isinstance(accelerator, MPSAccelerator) + ) + if is_mps_accelerator and is_parallel_strategy: + raise ValueError( + f"You set `strategy={strategy}` but strategies from the DDP family are not supported on the" + f" MPS accelerator. Either explicitly set `accelerator='cpu'` or change the strategy." + ) + self._accelerator_flag = accelerator supported_precision = get_args(_PRECISION_INPUT_STR) + get_args(_PRECISION_INPUT_INT) diff --git a/tests/tests_fabric/plugins/precision/test_deepspeed_integration.py b/tests/tests_fabric/plugins/precision/test_deepspeed_integration.py index 75c04fe2e9d6f..822cf0be0b7f4 100644 --- a/tests/tests_fabric/plugins/precision/test_deepspeed_integration.py +++ b/tests/tests_fabric/plugins/precision/test_deepspeed_integration.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from unittest import mock + import pytest from tests_fabric.helpers.runif import RunIf @@ -21,7 +23,8 @@ @RunIf(deepspeed=True) @pytest.mark.parametrize("precision", ["bf16", 16, 32]) -def test_deepspeed_precision_choice(precision): +@mock.patch("lightning_fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False) +def test_deepspeed_precision_choice(_, precision): """Test to ensure precision plugin is correctly chosen. DeepSpeed handles precision via custom DeepSpeedPrecision. diff --git a/tests/tests_fabric/strategies/test_deepspeed_integration.py b/tests/tests_fabric/strategies/test_deepspeed_integration.py index 0330c6a6f7b5e..2c5b3b09cd19a 100644 --- a/tests/tests_fabric/strategies/test_deepspeed_integration.py +++ b/tests/tests_fabric/strategies/test_deepspeed_integration.py @@ -254,8 +254,9 @@ def test_deepspeed_multigpu_stage_3(tmpdir): @RunIf(deepspeed=True) @mock.patch("deepspeed.init_distributed", autospec=True) +@mock.patch("lightning_fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False) @pytest.mark.parametrize("platform", ["Linux", "Windows"]) -def test_deepspeed_env_variables_on_platforms(deepspeed_dist_mock, tmpdir, platform): +def test_deepspeed_env_variables_on_platforms(_, deepspeed_dist_mock, platform): """Test to ensure that we set up distributed communication correctly. When using Windows, ranks environment variables should not be set, and DeepSpeed should handle this. diff --git a/tests/tests_fabric/test_connector.py b/tests/tests_fabric/test_connector.py index 29cfcc91a9de2..983003bef522f 100644 --- a/tests/tests_fabric/test_connector.py +++ b/tests/tests_fabric/test_connector.py @@ -125,6 +125,7 @@ def creates_processes_externally(self) -> bool: assert isinstance(connector.strategy.cluster_environment, CustomCluster) +@RunIf(mps=False) @mock.patch.dict( os.environ, { @@ -246,7 +247,8 @@ def test_interactive_incompatible_backend_error(_, monkeypatch): @mock.patch("lightning_fabric.accelerators.cuda.num_cuda_devices", return_value=2) -def test_interactive_compatible_dp_strategy_gpu(_, monkeypatch): +@mock.patch("lightning_fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False) +def test_interactive_compatible_dp_strategy_gpu(_, __, monkeypatch): monkeypatch.setattr(lightning_fabric.utilities.imports, "_IS_INTERACTIVE", True) connector = _Connector(strategy="dp", accelerator="gpu") assert connector.strategy.launcher is None @@ -266,6 +268,24 @@ def test_interactive_compatible_strategy_ddp_fork(monkeypatch): assert connector.strategy.launcher.is_interactive_compatible +@RunIf(mps=True) +@pytest.mark.parametrize( + ["strategy", "strategy_class"], + ( + ("ddp", DDPStrategy), + ("dp", DataParallelStrategy), + pytest.param("deepspeed", DeepSpeedStrategy, marks=RunIf(deepspeed=True)), + ), +) +@pytest.mark.parametrize("accelerator", ["mps", "auto", "gpu", None, MPSAccelerator()]) +def test_invalid_ddp_strategy_with_mps(accelerator, strategy, strategy_class): + with pytest.raises(ValueError, match="strategies from the DDP family are not supported"): + _Connector(accelerator=accelerator, strategy=strategy) + + with pytest.raises(ValueError, match="strategies from the DDP family are not supported"): + _Connector(accelerator="mps", strategy=strategy_class()) + + @RunIf(mps=False) @pytest.mark.parametrize( ["strategy", "strategy_class"], @@ -353,6 +373,7 @@ def test_set_devices_if_none_cpu(): assert connector._parallel_devices == [torch.device("cpu")] * 3 +@RunIf(mps=False) def test_unsupported_strategy_types_on_cpu_and_fallback(): with pytest.warns(UserWarning, match="is not supported on CPUs, hence setting `strategy='ddp"): connector = _Connector(strategy="dp", devices=2) @@ -607,7 +628,8 @@ def test_strategy_choice_ddp_cpu_slurm(strategy): @mock.patch.dict(os.environ, {}, clear=True) -def test_unsupported_tpu_choice(tpu_available): +@mock.patch("lightning_fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False) +def test_unsupported_tpu_choice(_, tpu_available): with pytest.raises(NotImplementedError, match=r"accelerator='tpu', precision=64\)` is not implemented"): _Connector(accelerator="tpu", precision=64) @@ -729,7 +751,8 @@ def test_gpu_accelerator_no_gpu_backend_found_error(*_): "lightning_fabric.connector.torch.multiprocessing.get_all_start_methods", return_value=[], ) -def test_ddp_fork_on_unsupported_platform(_, strategy): +@mock.patch("lightning_fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False) +def test_ddp_fork_on_unsupported_platform(_, __, strategy): with pytest.raises(ValueError, match="process forking is not supported on this platform"): _Connector(strategy=strategy) @@ -765,7 +788,8 @@ def test_precision_selection_amp_ddp(strategy, devices, is_custom_plugin, plugin @pytest.mark.parametrize(["strategy", "strategy_cls"], [("DDP", DDPStrategy), ("Ddp", DDPStrategy)]) -def test_strategy_str_passed_being_case_insensitive(strategy, strategy_cls): +@mock.patch("lightning_fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False) +def test_strategy_str_passed_being_case_insensitive(_, strategy, strategy_cls): connector = _Connector(strategy=strategy) assert isinstance(connector.strategy, strategy_cls) @@ -838,7 +862,8 @@ def test_arguments_from_environment_collision(): @RunIf(min_torch="1.12") -def test_fsdp_unsupported_on_cpu(): +@mock.patch("lightning_fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False) +def test_fsdp_unsupported_on_cpu(_): """Test that we raise an error if attempting to run FSDP without GPU.""" with pytest.raises(ValueError, match="You selected the FSDP strategy but FSDP is only available on GPU"): _Connector(strategy="fsdp") diff --git a/tests/tests_fabric/test_fabric.py b/tests/tests_fabric/test_fabric.py index 356e2133087ca..8570db2bc500c 100644 --- a/tests/tests_fabric/test_fabric.py +++ b/tests/tests_fabric/test_fabric.py @@ -556,7 +556,7 @@ def test_backward(): fabric._precision.backward.assert_called_with(loss, None, "arg", keyword="kwarg") -@RunIf(deepspeed=True) +@RunIf(deepspeed=True, mps=False) def test_backward_model_input_required(): """Test that when using deepspeed and multiple models, backward() requires the model as input.""" fabric = EmptyFabric(strategy="deepspeed")