diff --git a/docs/source/guide/get_started/quick_start_guide/cli_commands.rst b/docs/source/guide/get_started/quick_start_guide/cli_commands.rst index e6ed0619c3d..155faf5ec6e 100644 --- a/docs/source/guide/get_started/quick_start_guide/cli_commands.rst +++ b/docs/source/guide/get_started/quick_start_guide/cli_commands.rst @@ -248,7 +248,7 @@ As can be seen from the parameters list, the model can be trained using multiple .. note:: - Multi-GPU training is currently supported for all tasks except for action tasks and semi/self-supervised learning methods. We'll add support for them in the near future. + Multi-GPU training is currently supported for all tasks except for action tasks. We'll add support for them in the near future. ********** Exporting diff --git a/otx/algorithms/classification/adapters/mmcls/models/classifiers/byol.py b/otx/algorithms/classification/adapters/mmcls/models/classifiers/byol.py index a8ad3642b9a..f88cb1dd248 100644 --- a/otx/algorithms/classification/adapters/mmcls/models/classifiers/byol.py +++ b/otx/algorithms/classification/adapters/mmcls/models/classifiers/byol.py @@ -4,6 +4,8 @@ - 'Bootstrap Your Own Latent: A New Approach to Self-Supervised Learning', https://arxiv.org/abs/2006.07733 """ +import copy + # Copyright (C) 2022 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # @@ -50,7 +52,11 @@ def __init__( # build backbone self.online_backbone = build_backbone(backbone) - self.target_backbone = build_backbone(backbone) + + target_backbone_cfg = copy.deepcopy(backbone) + target_backbone_cfg["pretrained"] = None + + self.target_backbone = build_backbone(target_backbone_cfg) # build projector self.online_projector = build_neck(neck) diff --git a/otx/mpa/modules/models/detectors/unbiased_teacher.py b/otx/mpa/modules/models/detectors/unbiased_teacher.py index c72c2d4a296..a926d612d25 100644 --- a/otx/mpa/modules/models/detectors/unbiased_teacher.py +++ b/otx/mpa/modules/models/detectors/unbiased_teacher.py @@ -93,12 +93,13 @@ def forward_train(self, img, img_metas, img0, gt_bboxes, gt_labels, gt_bboxes_ig [ul_img_metas], rescale=False, # easy augmentation ) + current_device = ul_img0[0].device pseudo_bboxes, pseudo_labels, pseudo_ratio = self.generate_pseudo_labels( - teacher_outputs, device=ul_img0.device, **kwargs + teacher_outputs, device=current_device, **kwargs ) ps_recall = self.eval_pseudo_label_recall(pseudo_bboxes, ul_args.get("gt_bboxes", [])) - losses.update(ps_recall=ps_recall) - losses.update(ps_ratio=torch.Tensor([pseudo_ratio])) + losses.update(ps_recall=torch.tensor(ps_recall, device=current_device)) + losses.update(ps_ratio=torch.tensor([pseudo_ratio], device=current_device)) if not self.unlabeled_loss_enabled or self.unlabeled_loss_weight <= 0.001: # TODO: move back return losses @@ -147,7 +148,7 @@ def eval_pseudo_label_recall(self, all_pseudo_bboxes, all_gt_bboxes): img_num = len(all_gt_bboxes) if img_num == 0: - return torch.Tensor([0.0]) + return [0.0] all_ious = np.ndarray((img_num,), dtype=object) for i in range(img_num): ps_bboxes = all_pseudo_bboxes[i] @@ -162,7 +163,7 @@ def eval_pseudo_label_recall(self, all_pseudo_bboxes, all_gt_bboxes): ious = bbox_overlaps(gt_bboxes.detach().cpu().numpy(), ps_bboxes.detach().cpu().numpy()[:prop_num, :4]) all_ious[i] = ious recall = _recalls(all_ious, np.array([100]), np.array([0.5])) - return torch.Tensor(recall) + return recall @staticmethod def state_dict_hook(module, state_dict, prefix, *args, **kwargs): diff --git a/otx/recipes/stages/detection/semisl.py b/otx/recipes/stages/detection/semisl.py index 737beb90da6..3a0e6097e20 100644 --- a/otx/recipes/stages/detection/semisl.py +++ b/otx/recipes/stages/detection/semisl.py @@ -29,3 +29,5 @@ priority=75, ), ] + +find_unused_parameters = True diff --git a/otx/recipes/stages/segmentation/semisl.py b/otx/recipes/stages/segmentation/semisl.py index 3dc376531dd..d7763650f5c 100644 --- a/otx/recipes/stages/segmentation/semisl.py +++ b/otx/recipes/stages/segmentation/semisl.py @@ -42,5 +42,5 @@ ignore = True -find_unused_parameters = False +find_unused_parameters = True seed = 42 diff --git a/tests/e2e/cli/classification/test_classification.py b/tests/e2e/cli/classification/test_classification.py index 42d2efe4406..403c23024bf 100644 --- a/tests/e2e/cli/classification/test_classification.py +++ b/tests/e2e/cli/classification/test_classification.py @@ -305,6 +305,19 @@ def test_otx_eval(self, template, tmp_dir_path): tmp_dir_path = tmp_dir_path / "multi_class_cls/test_semisl" otx_eval_testing(template, tmp_dir_path, otx_dir, args0) + @e2e_pytest_component + @pytest.mark.skip(reason="CVS-101246 Multi-GPU tests are stuck while CI is running") + @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS") + @pytest.mark.skipif(MULTI_GPU_UNAVAILABLE, reason="The number of gpu is insufficient") + @pytest.mark.parametrize("template", templates, ids=templates_ids) + def test_otx_multi_gpu_train_semisl(self, template, tmp_dir_path): + tmp_dir_path = tmp_dir_path / "multi_class_cls/test_multi_gpu_semisl" + args_semisl_multigpu = copy.deepcopy(args0) + args_semisl_multigpu["--unlabeled-data-roots"] = args["--train-data-roots"] + args_semisl_multigpu["train_params"].extend(["--algo_backend.train_type", "SEMISUPERVISED"]) + args_semisl_multigpu["--gpus"] = "0,1" + otx_train_testing(template, tmp_dir_path, otx_dir, args_semisl_multigpu) + # Pre-train w/ 'car', 'tree' classes args0_m = { @@ -744,3 +757,14 @@ def test_otx_selfsl_train(self, template, tmp_dir_path): def test_otx_selfsl_eval(self, template, tmp_dir_path): tmp_dir_path = tmp_dir_path / "multi_class_cls/test_selfsl_sl" otx_eval_testing(template, tmp_dir_path, otx_dir, args) + + @e2e_pytest_component + @pytest.mark.skip(reason="CVS-101246 Multi-GPU tests are stuck while CI is running") + @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS") + @pytest.mark.skipif(MULTI_GPU_UNAVAILABLE, reason="The number of gpu is insufficient") + @pytest.mark.parametrize("template", templates, ids=templates_ids) + def test_otx_multi_gpu_train_selfsl(self, template, tmp_dir_path): + tmp_dir_path = tmp_dir_path / "multi_class_cls/test_multi_gpu_selfsl" + args_selfsl_multigpu = copy.deepcopy(args_selfsl) + args_selfsl_multigpu["--gpus"] = "0,1" + otx_train_testing(template, tmp_dir_path, otx_dir, args_selfsl_multigpu) diff --git a/tests/e2e/cli/detection/test_detection.py b/tests/e2e/cli/detection/test_detection.py index 938a77da612..09020631eb3 100644 --- a/tests/e2e/cli/detection/test_detection.py +++ b/tests/e2e/cli/detection/test_detection.py @@ -291,3 +291,14 @@ def test_otx_train(self, template, tmp_dir_path): def test_otx_eval(self, template, tmp_dir_path): tmp_dir_path = tmp_dir_path / "detection/test_semisl" otx_eval_testing(template, tmp_dir_path, otx_dir, args) + + @e2e_pytest_component + @pytest.mark.skip(reason="CVS-101246 Multi-GPU tests are stuck while CI is running") + @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS") + @pytest.mark.skipif(MULTI_GPU_UNAVAILABLE, reason="The number of gpu is insufficient") + @pytest.mark.parametrize("template", templates, ids=templates_ids) + def test_otx_multi_gpu_train_semisl(self, template, tmp_dir_path): + tmp_dir_path = tmp_dir_path / "detection/test_multi_gpu_semisl" + args_semisl_multigpu = copy.deepcopy(args_semisl) + args_semisl_multigpu["--gpus"] = "0,1" + otx_train_testing(template, tmp_dir_path, otx_dir, args_semisl_multigpu) diff --git a/tests/e2e/cli/segmentation/test_segmentation.py b/tests/e2e/cli/segmentation/test_segmentation.py index 1f259a4cafe..d69af0724c9 100644 --- a/tests/e2e/cli/segmentation/test_segmentation.py +++ b/tests/e2e/cli/segmentation/test_segmentation.py @@ -287,6 +287,17 @@ def test_otx_eval(self, template, tmp_dir_path): tmp_dir_path = tmp_dir_path / "segmentation/test_semisl" otx_eval_testing(template, tmp_dir_path, otx_dir, args_semisl) + @e2e_pytest_component + @pytest.mark.skip(reason="CVS-101246 Multi-GPU tests are stuck while CI is running") + @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS") + @pytest.mark.skipif(MULTI_GPU_UNAVAILABLE, reason="The number of gpu is insufficient") + @pytest.mark.parametrize("template", templates, ids=templates_ids) + def test_otx_multi_gpu_train_semisl(self, template, tmp_dir_path): + tmp_dir_path = tmp_dir_path / "segmentation/test_multi_gpu_semisl" + args_semisl_multigpu = copy.deepcopy(args_semisl) + args_semisl_multigpu["--gpus"] = "0,1" + otx_train_testing(template, tmp_dir_path, otx_dir, args_semisl_multigpu) + args_selfsl = { "--train-data-roots": "tests/assets/common_semantic_segmentation_dataset/train", @@ -321,3 +332,14 @@ def test_otx_train(self, template, tmp_dir_path): def test_otx_eval(self, template, tmp_dir_path): tmp_dir_path = tmp_dir_path / "segmentation/test_selfsl_sl" otx_eval_testing(template, tmp_dir_path, otx_dir, args) + + @e2e_pytest_component + @pytest.mark.skip(reason="CVS-101246 Multi-GPU tests are stuck while CI is running") + @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS") + @pytest.mark.skipif(MULTI_GPU_UNAVAILABLE, reason="The number of gpu is insufficient") + @pytest.mark.parametrize("template", templates, ids=templates_ids) + def test_otx_multi_gpu_train_selfsl(self, template, tmp_dir_path): + tmp_dir_path = tmp_dir_path / "segmentation/test_multi_gpu_selfsl" + args_selfsl_multigpu = copy.deepcopy(args_selfsl) + args_selfsl_multigpu["--gpus"] = "0,1" + otx_train_testing(template, tmp_dir_path, otx_dir, args_selfsl_multigpu) diff --git a/tests/integration/cli/classification/test_classification.py b/tests/integration/cli/classification/test_classification.py index 5b1af23bd4a..19c8212df8b 100644 --- a/tests/integration/cli/classification/test_classification.py +++ b/tests/integration/cli/classification/test_classification.py @@ -186,12 +186,34 @@ def test_otx_train_semisl(self, template, tmp_dir_path): args_semisl["train_params"].extend(["--algo_backend.train_type", "SEMISUPERVISED"]) otx_train_testing(template, tmp_dir_path, otx_dir, args_semisl) + @e2e_pytest_component + @pytest.mark.skip(reason="CVS-101246 Multi-GPU tests are stuck while CI is running") + @pytest.mark.skipif(MULTI_GPU_UNAVAILABLE, reason="The number of gpu is insufficient") + @pytest.mark.parametrize("template", default_templates, ids=default_templates_ids) + def test_otx_multi_gpu_train_semisl(self, template, tmp_dir_path): + tmp_dir_path = tmp_dir_path / "multi_class_cls/test_multi_gpu_semisl" + args_semisl_multigpu = copy.deepcopy(args) + args_semisl_multigpu["--unlabeled-data-roots"] = args["--train-data-roots"] + args_semisl_multigpu["train_params"].extend(["--algo_backend.train_type", "SEMISUPERVISED"]) + args_semisl_multigpu["--gpus"] = "0,1" + otx_train_testing(template, tmp_dir_path, otx_dir, args_semisl_multigpu) + @e2e_pytest_component @pytest.mark.parametrize("template", default_templates, ids=default_templates_ids) def test_otx_train_selfsl(self, template, tmp_dir_path): tmp_dir_path = tmp_dir_path / "multi_class_cls/test_selfsl" otx_train_testing(template, tmp_dir_path, otx_dir, args_selfsl) + @e2e_pytest_component + @pytest.mark.skip(reason="CVS-101246 Multi-GPU tests are stuck while CI is running") + @pytest.mark.skipif(MULTI_GPU_UNAVAILABLE, reason="The number of gpu is insufficient") + @pytest.mark.parametrize("template", default_templates, ids=default_templates_ids) + def test_otx_multi_gpu_train_selfsl(self, template, tmp_dir_path): + tmp_dir_path = tmp_dir_path / "multi_class_cls/test_multi_gpu_selfsl" + args_selfsl_multigpu = copy.deepcopy(args_selfsl) + args_selfsl_multigpu["--gpus"] = "0,1" + otx_train_testing(template, tmp_dir_path, otx_dir, args_selfsl_multigpu) + # Multi-label training w/ 'car', 'tree', 'bug' classes args_m = { diff --git a/tests/integration/cli/detection/test_detection.py b/tests/integration/cli/detection/test_detection.py index ac3ad13fd0d..77e6c130a9c 100644 --- a/tests/integration/cli/detection/test_detection.py +++ b/tests/integration/cli/detection/test_detection.py @@ -163,3 +163,13 @@ def test_otx_multi_gpu_train(self, template, tmp_dir_path): def test_otx_train_semisl(self, template, tmp_dir_path): tmp_dir_path = tmp_dir_path / "detection/test_semisl" otx_train_testing(template, tmp_dir_path, otx_dir, args_semisl) + + @e2e_pytest_component + @pytest.mark.skip(reason="CVS-101246 Multi-GPU tests are stuck while CI is running") + @pytest.mark.skipif(MULTI_GPU_UNAVAILABLE, reason="The number of gpu is insufficient") + @pytest.mark.parametrize("template", default_templates, ids=default_templates_ids) + def test_otx_multi_gpu_train_semisl(self, template, tmp_dir_path): + tmp_dir_path = tmp_dir_path / "detection/test_multi_gpu_semisl" + args_semisl_multigpu = copy.deepcopy(args_semisl) + args_semisl_multigpu["--gpus"] = "0,1" + otx_train_testing(template, tmp_dir_path, otx_dir, args_semisl_multigpu) diff --git a/tests/integration/cli/segmentation/test_segmentation.py b/tests/integration/cli/segmentation/test_segmentation.py index d77c37e36d0..f18b0cb394a 100644 --- a/tests/integration/cli/segmentation/test_segmentation.py +++ b/tests/integration/cli/segmentation/test_segmentation.py @@ -176,8 +176,28 @@ def test_otx_train_semisl(self, template, tmp_dir_path): tmp_dir_path = tmp_dir_path / "segmentation/test_semisl" otx_train_testing(template, tmp_dir_path, otx_dir, args_semisl) + @e2e_pytest_component + @pytest.mark.skip(reason="CVS-101246 Multi-GPU tests are stuck while CI is running") + @pytest.mark.skipif(MULTI_GPU_UNAVAILABLE, reason="The number of gpu is insufficient") + @pytest.mark.parametrize("template", templates, ids=templates_ids) + def test_otx_multi_gpu_train_semisl(self, template, tmp_dir_path): + tmp_dir_path = tmp_dir_path / "segmentation/test_multi_gpu_semisl" + args_semisl_multigpu = copy.deepcopy(args_semisl) + args_semisl_multigpu["--gpus"] = "0,1" + otx_train_testing(template, tmp_dir_path, otx_dir, args_semisl_multigpu) + @e2e_pytest_component @pytest.mark.parametrize("template", templates, ids=templates_ids) def test_otx_train_selfsl(self, template, tmp_dir_path): tmp_dir_path = tmp_dir_path / "segmentation/test_selfsl" otx_train_testing(template, tmp_dir_path, otx_dir, args_selfsl) + + @e2e_pytest_component + @pytest.mark.skip(reason="CVS-101246 Multi-GPU tests are stuck while CI is running") + @pytest.mark.skipif(MULTI_GPU_UNAVAILABLE, reason="The number of gpu is insufficient") + @pytest.mark.parametrize("template", templates, ids=templates_ids) + def test_otx_multi_gpu_train_selfsl(self, template, tmp_dir_path): + tmp_dir_path = tmp_dir_path / "segmentation/test_multi_gpu_selfsl" + args_selfsl_multigpu = copy.deepcopy(args_selfsl) + args_selfsl_multigpu["--gpus"] = "0,1" + otx_train_testing(template, tmp_dir_path, otx_dir, args_selfsl_multigpu)