diff --git a/docs/source/guide/get_started/quick_start_guide/cli_commands.rst b/docs/source/guide/get_started/quick_start_guide/cli_commands.rst
index e6ed0619c3d..155faf5ec6e 100644
--- a/docs/source/guide/get_started/quick_start_guide/cli_commands.rst
+++ b/docs/source/guide/get_started/quick_start_guide/cli_commands.rst
@@ -248,7 +248,7 @@ As can be seen from the parameters list, the model can be trained using multiple
 
 .. note::
 
-    Multi-GPU training is currently supported for all tasks except for action tasks and semi/self-supervised learning methods. We'll add support for them in the near future.
+    Multi-GPU training is currently supported for all tasks except action tasks. Support for action tasks will be added in the near future.
 
 **********
 Exporting
diff --git a/otx/algorithms/classification/adapters/mmcls/models/classifiers/byol.py b/otx/algorithms/classification/adapters/mmcls/models/classifiers/byol.py
index a8ad3642b9a..f88cb1dd248 100644
--- a/otx/algorithms/classification/adapters/mmcls/models/classifiers/byol.py
+++ b/otx/algorithms/classification/adapters/mmcls/models/classifiers/byol.py
@@ -4,6 +4,8 @@
 - 'Bootstrap Your Own Latent: A New Approach to Self-Supervised Learning', https://arxiv.org/abs/2006.07733
 """
 
+import copy
+
 # Copyright (C) 2022 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -50,7 +52,11 @@ def __init__(
 
         # build backbone
         self.online_backbone = build_backbone(backbone)
-        self.target_backbone = build_backbone(backbone)
+
+        target_backbone_cfg = copy.deepcopy(backbone)
+        target_backbone_cfg["pretrained"] = None
+
+        self.target_backbone = build_backbone(target_backbone_cfg)
 
         # build projector
         self.online_projector = build_neck(neck)
diff --git a/otx/mpa/modules/models/detectors/unbiased_teacher.py b/otx/mpa/modules/models/detectors/unbiased_teacher.py
index c72c2d4a296..a926d612d25 100644
--- a/otx/mpa/modules/models/detectors/unbiased_teacher.py
+++ b/otx/mpa/modules/models/detectors/unbiased_teacher.py
@@ -93,12 +93,13 @@ def forward_train(self, img, img_metas, img0, gt_bboxes, gt_labels, gt_bboxes_ig
                 [ul_img_metas],
                 rescale=False,  # easy augmentation
             )
+        current_device = ul_img0[0].device
         pseudo_bboxes, pseudo_labels, pseudo_ratio = self.generate_pseudo_labels(
-            teacher_outputs, device=ul_img0.device, **kwargs
+            teacher_outputs, device=current_device, **kwargs
         )
         ps_recall = self.eval_pseudo_label_recall(pseudo_bboxes, ul_args.get("gt_bboxes", []))
-        losses.update(ps_recall=ps_recall)
-        losses.update(ps_ratio=torch.Tensor([pseudo_ratio]))
+        losses.update(ps_recall=torch.tensor(ps_recall, device=current_device))
+        losses.update(ps_ratio=torch.tensor([pseudo_ratio], device=current_device))
 
         if not self.unlabeled_loss_enabled or self.unlabeled_loss_weight <= 0.001:  # TODO: move back
             return losses
@@ -147,7 +148,7 @@ def eval_pseudo_label_recall(self, all_pseudo_bboxes, all_gt_bboxes):
 
         img_num = len(all_gt_bboxes)
         if img_num == 0:
-            return torch.Tensor([0.0])
+            return [0.0]
         all_ious = np.ndarray((img_num,), dtype=object)
         for i in range(img_num):
             ps_bboxes = all_pseudo_bboxes[i]
@@ -162,7 +163,7 @@ def eval_pseudo_label_recall(self, all_pseudo_bboxes, all_gt_bboxes):
                 ious = bbox_overlaps(gt_bboxes.detach().cpu().numpy(), ps_bboxes.detach().cpu().numpy()[:prop_num, :4])
             all_ious[i] = ious
         recall = _recalls(all_ious, np.array([100]), np.array([0.5]))
-        return torch.Tensor(recall)
+        return recall
 
     @staticmethod
     def state_dict_hook(module, state_dict, prefix, *args, **kwargs):
diff --git a/otx/recipes/stages/detection/semisl.py b/otx/recipes/stages/detection/semisl.py
index 737beb90da6..3a0e6097e20 100644
--- a/otx/recipes/stages/detection/semisl.py
+++ b/otx/recipes/stages/detection/semisl.py
@@ -29,3 +29,5 @@
         priority=75,
     ),
 ]
+
+find_unused_parameters = True  # NOTE(review): presumably forwarded to the distributed (DDP) wrapper for multi-GPU semi-SL training - confirm
diff --git a/otx/recipes/stages/segmentation/semisl.py b/otx/recipes/stages/segmentation/semisl.py
index 3dc376531dd..d7763650f5c 100644
--- a/otx/recipes/stages/segmentation/semisl.py
+++ b/otx/recipes/stages/segmentation/semisl.py
@@ -42,5 +42,5 @@
 ignore = True
 
 
-find_unused_parameters = False
+find_unused_parameters = True  # NOTE(review): presumably forwarded to the distributed (DDP) wrapper for multi-GPU semi-SL training - confirm
 seed = 42
diff --git a/tests/e2e/cli/classification/test_classification.py b/tests/e2e/cli/classification/test_classification.py
index 42d2efe4406..403c23024bf 100644
--- a/tests/e2e/cli/classification/test_classification.py
+++ b/tests/e2e/cli/classification/test_classification.py
@@ -305,6 +305,19 @@ def test_otx_eval(self, template, tmp_dir_path):
         tmp_dir_path = tmp_dir_path / "multi_class_cls/test_semisl"
         otx_eval_testing(template, tmp_dir_path, otx_dir, args0)
 
+    @e2e_pytest_component
+    @pytest.mark.skip(reason="CVS-101246 Multi-GPU tests are stuck while CI is running")  # unconditional: skipped even if both skipif conditions are false
+    @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
+    @pytest.mark.skipif(MULTI_GPU_UNAVAILABLE, reason="The number of gpu is insufficient")
+    @pytest.mark.parametrize("template", templates, ids=templates_ids)
+    def test_otx_multi_gpu_train_semisl(self, template, tmp_dir_path):
+        tmp_dir_path = tmp_dir_path / "multi_class_cls/test_multi_gpu_semisl"  # per-test sub-directory handed to otx_train_testing
+        args_semisl_multigpu = copy.deepcopy(args0)  # independent copy; the edits below never touch args0
+        args_semisl_multigpu["--unlabeled-data-roots"] = args["--train-data-roots"]  # NOTE(review): read from args although the copy is of args0 - confirm intended
+        args_semisl_multigpu["train_params"].extend(["--algo_backend.train_type", "SEMISUPERVISED"])  # mutates the nested list, hence the deepcopy above
+        args_semisl_multigpu["--gpus"] = "0,1"  # presumably GPU ids 0 and 1 - confirm against the CLI
+        otx_train_testing(template, tmp_dir_path, otx_dir, args_semisl_multigpu)
+
 
 # Pre-train w/ 'car', 'tree' classes
 args0_m = {
@@ -744,3 +757,14 @@ def test_otx_selfsl_train(self, template, tmp_dir_path):
     def test_otx_selfsl_eval(self, template, tmp_dir_path):
         tmp_dir_path = tmp_dir_path / "multi_class_cls/test_selfsl_sl"
         otx_eval_testing(template, tmp_dir_path, otx_dir, args)
+
+    @e2e_pytest_component
+    @pytest.mark.skip(reason="CVS-101246 Multi-GPU tests are stuck while CI is running")  # unconditional: skipped even if both skipif conditions are false
+    @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
+    @pytest.mark.skipif(MULTI_GPU_UNAVAILABLE, reason="The number of gpu is insufficient")
+    @pytest.mark.parametrize("template", templates, ids=templates_ids)
+    def test_otx_multi_gpu_train_selfsl(self, template, tmp_dir_path):
+        tmp_dir_path = tmp_dir_path / "multi_class_cls/test_multi_gpu_selfsl"  # per-test sub-directory handed to otx_train_testing
+        args_selfsl_multigpu = copy.deepcopy(args_selfsl)  # independent copy; the edit below never touches args_selfsl
+        args_selfsl_multigpu["--gpus"] = "0,1"  # presumably GPU ids 0 and 1 - confirm against the CLI
+        otx_train_testing(template, tmp_dir_path, otx_dir, args_selfsl_multigpu)
diff --git a/tests/e2e/cli/detection/test_detection.py b/tests/e2e/cli/detection/test_detection.py
index 938a77da612..09020631eb3 100644
--- a/tests/e2e/cli/detection/test_detection.py
+++ b/tests/e2e/cli/detection/test_detection.py
@@ -291,3 +291,14 @@ def test_otx_train(self, template, tmp_dir_path):
     def test_otx_eval(self, template, tmp_dir_path):
         tmp_dir_path = tmp_dir_path / "detection/test_semisl"
         otx_eval_testing(template, tmp_dir_path, otx_dir, args)
+
+    @e2e_pytest_component
+    @pytest.mark.skip(reason="CVS-101246 Multi-GPU tests are stuck while CI is running")  # unconditional: skipped even if both skipif conditions are false
+    @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
+    @pytest.mark.skipif(MULTI_GPU_UNAVAILABLE, reason="The number of gpu is insufficient")
+    @pytest.mark.parametrize("template", templates, ids=templates_ids)
+    def test_otx_multi_gpu_train_semisl(self, template, tmp_dir_path):
+        tmp_dir_path = tmp_dir_path / "detection/test_multi_gpu_semisl"  # per-test sub-directory handed to otx_train_testing
+        args_semisl_multigpu = copy.deepcopy(args_semisl)  # independent copy; the edit below never touches args_semisl
+        args_semisl_multigpu["--gpus"] = "0,1"  # presumably GPU ids 0 and 1 - confirm against the CLI
+        otx_train_testing(template, tmp_dir_path, otx_dir, args_semisl_multigpu)
diff --git a/tests/e2e/cli/segmentation/test_segmentation.py b/tests/e2e/cli/segmentation/test_segmentation.py
index 1f259a4cafe..d69af0724c9 100644
--- a/tests/e2e/cli/segmentation/test_segmentation.py
+++ b/tests/e2e/cli/segmentation/test_segmentation.py
@@ -287,6 +287,17 @@ def test_otx_eval(self, template, tmp_dir_path):
         tmp_dir_path = tmp_dir_path / "segmentation/test_semisl"
         otx_eval_testing(template, tmp_dir_path, otx_dir, args_semisl)
 
+    @e2e_pytest_component
+    @pytest.mark.skip(reason="CVS-101246 Multi-GPU tests are stuck while CI is running")  # unconditional: skipped even if both skipif conditions are false
+    @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
+    @pytest.mark.skipif(MULTI_GPU_UNAVAILABLE, reason="The number of gpu is insufficient")
+    @pytest.mark.parametrize("template", templates, ids=templates_ids)
+    def test_otx_multi_gpu_train_semisl(self, template, tmp_dir_path):
+        tmp_dir_path = tmp_dir_path / "segmentation/test_multi_gpu_semisl"  # per-test sub-directory handed to otx_train_testing
+        args_semisl_multigpu = copy.deepcopy(args_semisl)  # independent copy; the edit below never touches args_semisl
+        args_semisl_multigpu["--gpus"] = "0,1"  # presumably GPU ids 0 and 1 - confirm against the CLI
+        otx_train_testing(template, tmp_dir_path, otx_dir, args_semisl_multigpu)
+
 
 args_selfsl = {
     "--train-data-roots": "tests/assets/common_semantic_segmentation_dataset/train",
@@ -321,3 +332,14 @@ def test_otx_train(self, template, tmp_dir_path):
     def test_otx_eval(self, template, tmp_dir_path):
         tmp_dir_path = tmp_dir_path / "segmentation/test_selfsl_sl"
         otx_eval_testing(template, tmp_dir_path, otx_dir, args)
+
+    @e2e_pytest_component
+    @pytest.mark.skip(reason="CVS-101246 Multi-GPU tests are stuck while CI is running")  # unconditional: skipped even if both skipif conditions are false
+    @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
+    @pytest.mark.skipif(MULTI_GPU_UNAVAILABLE, reason="The number of gpu is insufficient")
+    @pytest.mark.parametrize("template", templates, ids=templates_ids)
+    def test_otx_multi_gpu_train_selfsl(self, template, tmp_dir_path):
+        tmp_dir_path = tmp_dir_path / "segmentation/test_multi_gpu_selfsl"  # per-test sub-directory handed to otx_train_testing
+        args_selfsl_multigpu = copy.deepcopy(args_selfsl)  # independent copy; the edit below never touches args_selfsl
+        args_selfsl_multigpu["--gpus"] = "0,1"  # presumably GPU ids 0 and 1 - confirm against the CLI
+        otx_train_testing(template, tmp_dir_path, otx_dir, args_selfsl_multigpu)
diff --git a/tests/integration/cli/classification/test_classification.py b/tests/integration/cli/classification/test_classification.py
index 5b1af23bd4a..19c8212df8b 100644
--- a/tests/integration/cli/classification/test_classification.py
+++ b/tests/integration/cli/classification/test_classification.py
@@ -186,12 +186,34 @@ def test_otx_train_semisl(self, template, tmp_dir_path):
         args_semisl["train_params"].extend(["--algo_backend.train_type", "SEMISUPERVISED"])
         otx_train_testing(template, tmp_dir_path, otx_dir, args_semisl)
 
+    @e2e_pytest_component
+    @pytest.mark.skip(reason="CVS-101246 Multi-GPU tests are stuck while CI is running")  # unconditional: skipped even if the skipif condition is false
+    @pytest.mark.skipif(MULTI_GPU_UNAVAILABLE, reason="The number of gpu is insufficient")
+    @pytest.mark.parametrize("template", default_templates, ids=default_templates_ids)
+    def test_otx_multi_gpu_train_semisl(self, template, tmp_dir_path):
+        tmp_dir_path = tmp_dir_path / "multi_class_cls/test_multi_gpu_semisl"  # per-test sub-directory handed to otx_train_testing
+        args_semisl_multigpu = copy.deepcopy(args)  # independent copy; the edits below never touch args
+        args_semisl_multigpu["--unlabeled-data-roots"] = args["--train-data-roots"]  # unlabeled root is set to the train data root from args
+        args_semisl_multigpu["train_params"].extend(["--algo_backend.train_type", "SEMISUPERVISED"])  # mutates the nested list, hence the deepcopy above
+        args_semisl_multigpu["--gpus"] = "0,1"  # presumably GPU ids 0 and 1 - confirm against the CLI
+        otx_train_testing(template, tmp_dir_path, otx_dir, args_semisl_multigpu)
+
     @e2e_pytest_component
     @pytest.mark.parametrize("template", default_templates, ids=default_templates_ids)
     def test_otx_train_selfsl(self, template, tmp_dir_path):
         tmp_dir_path = tmp_dir_path / "multi_class_cls/test_selfsl"
         otx_train_testing(template, tmp_dir_path, otx_dir, args_selfsl)
 
+    @e2e_pytest_component
+    @pytest.mark.skip(reason="CVS-101246 Multi-GPU tests are stuck while CI is running")  # unconditional: skipped even if the skipif condition is false
+    @pytest.mark.skipif(MULTI_GPU_UNAVAILABLE, reason="The number of gpu is insufficient")
+    @pytest.mark.parametrize("template", default_templates, ids=default_templates_ids)
+    def test_otx_multi_gpu_train_selfsl(self, template, tmp_dir_path):
+        tmp_dir_path = tmp_dir_path / "multi_class_cls/test_multi_gpu_selfsl"  # per-test sub-directory handed to otx_train_testing
+        args_selfsl_multigpu = copy.deepcopy(args_selfsl)  # independent copy; the edit below never touches args_selfsl
+        args_selfsl_multigpu["--gpus"] = "0,1"  # presumably GPU ids 0 and 1 - confirm against the CLI
+        otx_train_testing(template, tmp_dir_path, otx_dir, args_selfsl_multigpu)
+
 
 # Multi-label training w/ 'car', 'tree', 'bug' classes
 args_m = {
diff --git a/tests/integration/cli/detection/test_detection.py b/tests/integration/cli/detection/test_detection.py
index ac3ad13fd0d..77e6c130a9c 100644
--- a/tests/integration/cli/detection/test_detection.py
+++ b/tests/integration/cli/detection/test_detection.py
@@ -163,3 +163,13 @@ def test_otx_multi_gpu_train(self, template, tmp_dir_path):
     def test_otx_train_semisl(self, template, tmp_dir_path):
         tmp_dir_path = tmp_dir_path / "detection/test_semisl"
         otx_train_testing(template, tmp_dir_path, otx_dir, args_semisl)
+
+    @e2e_pytest_component
+    @pytest.mark.skip(reason="CVS-101246 Multi-GPU tests are stuck while CI is running")  # unconditional: skipped even if the skipif condition is false
+    @pytest.mark.skipif(MULTI_GPU_UNAVAILABLE, reason="The number of gpu is insufficient")
+    @pytest.mark.parametrize("template", default_templates, ids=default_templates_ids)
+    def test_otx_multi_gpu_train_semisl(self, template, tmp_dir_path):
+        tmp_dir_path = tmp_dir_path / "detection/test_multi_gpu_semisl"  # per-test sub-directory handed to otx_train_testing
+        args_semisl_multigpu = copy.deepcopy(args_semisl)  # independent copy; the edit below never touches args_semisl
+        args_semisl_multigpu["--gpus"] = "0,1"  # presumably GPU ids 0 and 1 - confirm against the CLI
+        otx_train_testing(template, tmp_dir_path, otx_dir, args_semisl_multigpu)
diff --git a/tests/integration/cli/segmentation/test_segmentation.py b/tests/integration/cli/segmentation/test_segmentation.py
index d77c37e36d0..f18b0cb394a 100644
--- a/tests/integration/cli/segmentation/test_segmentation.py
+++ b/tests/integration/cli/segmentation/test_segmentation.py
@@ -176,8 +176,28 @@ def test_otx_train_semisl(self, template, tmp_dir_path):
         tmp_dir_path = tmp_dir_path / "segmentation/test_semisl"
         otx_train_testing(template, tmp_dir_path, otx_dir, args_semisl)
 
+    @e2e_pytest_component
+    @pytest.mark.skip(reason="CVS-101246 Multi-GPU tests are stuck while CI is running")  # unconditional: skipped even if the skipif condition is false
+    @pytest.mark.skipif(MULTI_GPU_UNAVAILABLE, reason="The number of gpu is insufficient")
+    @pytest.mark.parametrize("template", templates, ids=templates_ids)
+    def test_otx_multi_gpu_train_semisl(self, template, tmp_dir_path):
+        tmp_dir_path = tmp_dir_path / "segmentation/test_multi_gpu_semisl"  # per-test sub-directory handed to otx_train_testing
+        args_semisl_multigpu = copy.deepcopy(args_semisl)  # independent copy; the edit below never touches args_semisl
+        args_semisl_multigpu["--gpus"] = "0,1"  # presumably GPU ids 0 and 1 - confirm against the CLI
+        otx_train_testing(template, tmp_dir_path, otx_dir, args_semisl_multigpu)
+
     @e2e_pytest_component
     @pytest.mark.parametrize("template", templates, ids=templates_ids)
     def test_otx_train_selfsl(self, template, tmp_dir_path):
         tmp_dir_path = tmp_dir_path / "segmentation/test_selfsl"
         otx_train_testing(template, tmp_dir_path, otx_dir, args_selfsl)
+
+    @e2e_pytest_component
+    @pytest.mark.skip(reason="CVS-101246 Multi-GPU tests are stuck while CI is running")  # unconditional: skipped even if the skipif condition is false
+    @pytest.mark.skipif(MULTI_GPU_UNAVAILABLE, reason="The number of gpu is insufficient")
+    @pytest.mark.parametrize("template", templates, ids=templates_ids)
+    def test_otx_multi_gpu_train_selfsl(self, template, tmp_dir_path):
+        tmp_dir_path = tmp_dir_path / "segmentation/test_multi_gpu_selfsl"  # per-test sub-directory handed to otx_train_testing
+        args_selfsl_multigpu = copy.deepcopy(args_selfsl)  # independent copy; the edit below never touches args_selfsl
+        args_selfsl_multigpu["--gpus"] = "0,1"  # presumably GPU ids 0 and 1 - confirm against the CLI
+        otx_train_testing(template, tmp_dir_path, otx_dir, args_selfsl_multigpu)