From 1a660e3cb1f0c3a502a91eca3cda0e78e03247b5 Mon Sep 17 00:00:00 2001 From: "Kang, Harim" Date: Wed, 31 Jul 2024 15:24:56 +0900 Subject: [PATCH 1/5] Add num_devices for multi-gpus --- src/otx/engine/engine.py | 15 +++++++++++++++ tests/unit/engine/test_engine.py | 13 +++++++++++++ 2 files changed, 28 insertions(+) diff --git a/src/otx/engine/engine.py b/src/otx/engine/engine.py index f9e233359ed..c1d4ea15b80 100644 --- a/src/otx/engine/engine.py +++ b/src/otx/engine/engine.py @@ -119,6 +119,7 @@ def __init__( model: OTXModel | str | None = None, checkpoint: PathLike | None = None, device: DeviceType = DeviceType.auto, + num_devices: int = 1, **kwargs, ): """Initializes the OTX Engine. @@ -131,12 +132,14 @@ def __init__( model (OTXModel | str | None, optional): The model for the engine. Defaults to None. checkpoint (PathLike | None, optional): Path to the checkpoint file. Defaults to None. device (DeviceType, optional): The device type to use. Defaults to DeviceType.auto. + num_devices (int, optional): The number of devices to use. If it is 2 or more, it will behave as multi-gpu. **kwargs: Additional keyword arguments for pl.Trainer. 
""" self._cache = TrainerArgumentsCache(**kwargs) self.checkpoint = checkpoint self.work_dir = work_dir self.device = device # type: ignore[assignment] + self.num_devices = num_devices self._auto_configurator = AutoConfigurator( data_root=data_root, task=datamodule.task if datamodule is not None else task, @@ -946,6 +949,18 @@ def device(self, device: DeviceType) -> None: self._cache.update(accelerator=self._device.accelerator, devices=self._device.devices) self._cache.is_trainer_args_identical = False + @property + def num_devices(self) -> int: + """Device engine uses.""" + return self._device.devices + + @num_devices.setter + def num_devices(self, num_devices: int) -> None: + """Number of GPUs for multi-gpu.""" + self._device.devices = num_devices + self._cache.update(devices=self._device.devices) + self._cache.is_trainer_args_identical = False + @property def trainer(self) -> Trainer: """Returns the trainer object associated with the engine. diff --git a/tests/unit/engine/test_engine.py b/tests/unit/engine/test_engine.py index 1f1c6275988..b3f72255aba 100644 --- a/tests/unit/engine/test_engine.py +++ b/tests/unit/engine/test_engine.py @@ -367,3 +367,16 @@ def test_from_config(self, tmp_path) -> None: assert engine is not None assert engine.datamodule.train_subset.batch_size == 3 assert engine.datamodule.test_subset.subset_name == "TESTING" + + def test_num_devices(self, fxt_engine, tmp_path) -> None: + assert fxt_engine.num_devices == 1 + assert fxt_engine._cache.args.get("devices") == 1 + + fxt_engine.num_devices = 2 + assert fxt_engine.num_devices == 2 + assert fxt_engine._cache.args.get("devices") == 2 + + data_root = "tests/assets/classification_dataset" + engine = Engine(work_dir=tmp_path, data_root=data_root, num_devices=3) + assert engine.num_devices == 3 + assert engine._cache.args.get("devices") == 3 From d4e51ed246d0fba2b4f83662fb5d8338f813f82d Mon Sep 17 00:00:00 2001 From: "Kang, Harim" Date: Wed, 31 Jul 2024 15:46:01 +0900 Subject: [PATCH 2/5] Add 
docs --- CHANGELOG.md | 2 + .../guide/tutorials/advanced/multi_gpu.rst | 51 +++++++++++++++++++ 2 files changed, 53 insertions(+) create mode 100644 docs/source/guide/tutorials/advanced/multi_gpu.rst diff --git a/CHANGELOG.md b/CHANGELOG.md index 0a815b70f5c..d182f815157 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,8 @@ All notable changes to this project will be documented in this file. () - Enable to use polygon and bitmap mask as prompt inputs for zero-shot learning () +- Add num_devices in Engine for multi-gpu training + () ### Bug fixes diff --git a/docs/source/guide/tutorials/advanced/multi_gpu.rst b/docs/source/guide/tutorials/advanced/multi_gpu.rst new file mode 100644 index 00000000000..98d3fb3dde4 --- /dev/null +++ b/docs/source/guide/tutorials/advanced/multi_gpu.rst @@ -0,0 +1,51 @@ +Multi-GPU Support +================= + +Overview +-------- + +OpenVINO™ Training Extensions now supports operations in a multi-GPU environment, offering faster computation speeds and enhanced performance. With this new feature, users can efficiently process large datasets and complex models, significantly reducing the time required for machine learning and deep learning tasks. + +Benefits of Multi-GPU Support +----------------------------- + +- **Speed Improvement**: Training times can be greatly reduced by utilizing multiple GPUs in parallel. +- **Large Dataset Handling**: Load larger datasets into memory and work with larger batch sizes. +- **Efficient Resource Utilization**: Maximize the computational efficiency by fully utilizing the GPU resources of the system. + +How to Set Up Multi-GPU +----------------------- + +Setting up multi-GPU in OpenVINO™ Training Extensions is straightforward. Follow these steps to complete the setup: + +1. **Environment Check**: Ensure that multiple GPUs are installed in your system and that all GPUs are compatible with OpenVINO™ Training Extensions. +2. 
**Driver Installation**: Install the latest GPU drivers to ensure all GPUs are properly recognized and available for use. +3. **Configuration**: Set the number of GPUs to use with the ``num_devices`` option of the Engine, either on the command line (``--engine.num_devices``) or through the Python API, as shown below. + +Using Multi-GPU +--------------- + +Once the number of devices is set to 2 or more, you can run multi-GPU model training as follows: + +.. tab-set:: + + .. tab-item:: CLI + + .. code-block:: shell + + (otx) ...$ otx train \ + ... \ + --engine.num_devices 2 + + .. tab-item:: API + + .. code-block:: python + + from otx.engine import Engine + + engine = Engine.from_config( + ... + num_devices=2, + ) + + engine.train(...) From d189e33e079cffeed5458b351c761d2f8ff0deac Mon Sep 17 00:00:00 2001 From: "Kang, Harim" Date: Wed, 31 Jul 2024 15:47:42 +0900 Subject: [PATCH 3/5] Add index --- docs/source/guide/tutorials/advanced/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/guide/tutorials/advanced/index.rst b/docs/source/guide/tutorials/advanced/index.rst index 9fcf48635d8..8524b3a8200 100644 --- a/docs/source/guide/tutorials/advanced/index.rst +++ b/docs/source/guide/tutorials/advanced/index.rst @@ -7,5 +7,6 @@ Advanced Tutorials configuration semi_supervised_learning huggingface_model + multi_gpu .. Once we have enough material, we might need to categorize these into `data`, `model learning` sections. 
\ No newline at end of file From 12baa4b546c91eb64ddea16a037084f08ca87250 Mon Sep 17 00:00:00 2001 From: "Kang, Harim" Date: Wed, 31 Jul 2024 15:52:05 +0900 Subject: [PATCH 4/5] Update docstrings --- src/otx/engine/engine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/otx/engine/engine.py b/src/otx/engine/engine.py index c1d4ea15b80..a730d77f572 100644 --- a/src/otx/engine/engine.py +++ b/src/otx/engine/engine.py @@ -951,12 +951,12 @@ def device(self, device: DeviceType) -> None: @property def num_devices(self) -> int: - """Device engine uses.""" + """Number of devices for Engine use.""" return self._device.devices @num_devices.setter def num_devices(self, num_devices: int) -> None: - """Number of GPUs for multi-gpu.""" + """Setter function for multi-gpu.""" self._device.devices = num_devices self._cache.update(devices=self._device.devices) self._cache.is_trainer_args_identical = False From b6ad495c275325a80aebc80278fd518103e30518 Mon Sep 17 00:00:00 2001 From: "Kang, Harim" Date: Wed, 31 Jul 2024 16:25:48 +0900 Subject: [PATCH 5/5] Update CHANGELOG --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d182f815157..48aea3c4e23 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,8 +27,6 @@ All notable changes to this project will be documented in this file. () - Enable to use polygon and bitmap mask as prompt inputs for zero-shot learning () -- Add num_devices in Engine for multi-gpu training - () ### Bug fixes @@ -36,6 +34,8 @@ All notable changes to this project will be documented in this file. (https://github.com/openvinotoolkit/training_extensions/pull/3723) - Revert #3579 to fix issues with replacing coco_instance with a different format in some dataset (https://github.com/openvinotoolkit/training_extensions/pull/3753) +- Add num_devices in Engine for multi-gpu training + (https://github.com/openvinotoolkit/training_extensions/pull/3778) ## \[v2.1.0\]