From d7739dbe46caafb10d2519bd9c0f6a544bfb5d07 Mon Sep 17 00:00:00 2001 From: jaegukhyun Date: Thu, 22 Jun 2023 17:52:47 +0900 Subject: [PATCH 01/11] Add DINO --- .../adapters/mmdet/models/__init__.py | 4 +- .../mmdet/models/detectors/__init__.py | 2 + .../models/detectors/custom_dino_detector.py | 94 +++ .../adapters/mmdet/models/heads/__init__.py | 4 + .../mmdet/models/heads/custom_dino_head.py | 629 ++++++++++++++++++ .../adapters/mmdet/models/heads/detr_head.py | 263 ++++++++ .../adapters/mmdet/models/layers/__init__.py | 9 + .../adapters/mmdet/models/layers/dino.py | 169 +++++ .../mmdet/models/layers/dino_layers.py | 609 +++++++++++++++++ .../detection/resnet50_dino/data_pipeline.py | 115 ++++ .../detection/resnet50_dino/deployment.py | 12 + .../configs/detection/resnet50_dino/model.py | 117 ++++ .../resnet50_dino/template_experimental.yaml | 64 ++ 13 files changed, 2089 insertions(+), 2 deletions(-) create mode 100644 otx/algorithms/detection/adapters/mmdet/models/detectors/custom_dino_detector.py create mode 100644 otx/algorithms/detection/adapters/mmdet/models/heads/custom_dino_head.py create mode 100644 otx/algorithms/detection/adapters/mmdet/models/heads/detr_head.py create mode 100644 otx/algorithms/detection/adapters/mmdet/models/layers/__init__.py create mode 100644 otx/algorithms/detection/adapters/mmdet/models/layers/dino.py create mode 100644 otx/algorithms/detection/adapters/mmdet/models/layers/dino_layers.py create mode 100644 otx/algorithms/detection/configs/detection/resnet50_dino/data_pipeline.py create mode 100644 otx/algorithms/detection/configs/detection/resnet50_dino/deployment.py create mode 100644 otx/algorithms/detection/configs/detection/resnet50_dino/model.py create mode 100644 otx/algorithms/detection/configs/detection/resnet50_dino/template_experimental.yaml diff --git a/otx/algorithms/detection/adapters/mmdet/models/__init__.py b/otx/algorithms/detection/adapters/mmdet/models/__init__.py index c73e3d4247e..c59b3e97b84 100644 --- a/otx/algorithms/detection/adapters/mmdet/models/__init__.py +++ b/otx/algorithms/detection/adapters/mmdet/models/__init__.py @@ -3,6 +3,6 @@ # SPDX-License-Identifier: Apache-2.0 # -from . import assigners, backbones, dense_heads, detectors, heads, losses, necks, roi_heads +from . 
import assigners, backbones, dense_heads, detectors, heads, layers, losses, necks, roi_heads
-__all__ = ["assigners", "backbones", "dense_heads", "detectors", "heads", "losses", "necks", "roi_heads"]
+__all__ = ["assigners", "backbones", "dense_heads", "detectors", "heads", "layers", "losses", "necks", "roi_heads"]
diff --git a/otx/algorithms/detection/adapters/mmdet/models/detectors/__init__.py b/otx/algorithms/detection/adapters/mmdet/models/detectors/__init__.py
index 962407bb091..0dc0e8e4079 100644
--- a/otx/algorithms/detection/adapters/mmdet/models/detectors/__init__.py
+++ b/otx/algorithms/detection/adapters/mmdet/models/detectors/__init__.py
@@ -5,6 +5,7 @@
 from .custom_atss_detector import CustomATSS
 from .custom_deformable_detr_detector import CustomDeformableDETR
+from .custom_dino_detector import CustomDINO
 from .custom_maskrcnn_detector import CustomMaskRCNN
 from .custom_maskrcnn_tile_optimized import CustomMaskRCNNTileOptimized
 from .custom_single_stage_detector import CustomSingleStageDetector
@@ -18,6 +19,7 @@ __all__ = [
     "CustomATSS",
     "CustomDeformableDETR",
+    "CustomDINO",
     "CustomMaskRCNN",
     "CustomSingleStageDetector",
     "CustomTwoStageDetector",
diff --git a/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_dino_detector.py b/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_dino_detector.py
new file mode 100644
index 00000000000..3bfd97bfa05
--- /dev/null
+++ b/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_dino_detector.py
@@ -0,0 +1,94 @@
+"""OTX DINO Class for mmdetection detectors."""
+
+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+import functools
+
+from mmdet.models.builder import DETECTORS
+
+from otx.algorithms.common.adapters.mmcv.hooks.recording_forward_hook import (
+    ActivationMapHook,
+    FeatureVectorHook,
+)
+from otx.algorithms.common.adapters.mmdeploy.utils import is_mmdeploy_enabled
+from otx.algorithms.common.utils.logger import get_logger
+from otx.algorithms.detection.adapters.mmdet.models.detectors import CustomDeformableDETR
+
+logger = get_logger()
+
+
+@DETECTORS.register_module()
+class CustomDINO(CustomDeformableDETR):
+    """Custom DINO detector."""
+
+    def __init__(self, *args, task_adapt=None, **kwargs):
+        super().__init__(*args, task_adapt=task_adapt, **kwargs)
+        self._register_load_state_dict_pre_hook(
+            functools.partial(
+                self.load_state_dict_pre_hook,
+                self,
+            )
+        )
+
+    @staticmethod
+    def load_state_dict_pre_hook(model, ckpt_dict, *args, **kwargs):
+        """Modify mmdet 3.x version's weights before weight loading."""
+
+        if list(ckpt_dict.keys())[0] == "level_embed":
+            logger.info("----------------- CustomDINO.load_state_dict_pre_hook() called")
+            # This ckpt_dict comes from mmdet 3.x
+            ckpt_dict["bbox_head.transformer.level_embeds"] = ckpt_dict.pop("level_embed")
+            replaced_params = {}
+            for param in ckpt_dict:
+                new_param = None
+                if "encoder" in param or "decoder" in param:
+                    new_param = "bbox_head.transformer." + param
+                    new_param = new_param.replace("self_attn", "attentions.0")
+                    new_param = new_param.replace("cross_attn", "attentions.1")
+                    new_param = new_param.replace("ffn", "ffns.0")
+                elif param == "query_embedding.weight":
+                    new_param = "bbox_head." + param
+                elif param == "dn_query_generator.label_embedding.weight":
+                    new_param = "bbox_head.transformer." + param
+                elif "memory_trans" in param:
+                    new_param = "bbox_head.transformer." + param
+                    new_param = new_param.replace("memory_trans_fc", "enc_output")
+                    new_param = new_param.replace("memory_trans_norm", "enc_output_norm")
+                if new_param is not None:
+                    replaced_params[param] = new_param
+
+            for origin, new in replaced_params.items():
+                ckpt_dict[new] = ckpt_dict.pop(origin)
+
+
+if is_mmdeploy_enabled():
+    from mmdeploy.core import FUNCTION_REWRITER
+
+    @FUNCTION_REWRITER.register_rewriter(
+        "otx.algorithms.detection.adapters.mmdet.models.detectors.custom_dino_detector.CustomDINO.simple_test"
+    )
+    def custom_dino__simple_test(ctx, self, img, img_metas, **kwargs):
+        """Function for custom_dino__simple_test."""
+        height = int(img_metas[0]["img_shape"][0])
+        width = int(img_metas[0]["img_shape"][1])
+        img_metas[0]["batch_input_shape"] = (height, width)
+        img_metas[0]["img_shape"] = (height, width, 3)
+        feats = self.extract_feat(img)
+        gt_bboxes = [None] * len(feats)
+        gt_labels = [None] * len(feats)
+        hidden_states, references, enc_output_class, enc_output_coord, _ = self.bbox_head.forward_transformer(
+            feats, gt_bboxes, gt_labels, img_metas
+        )
+        cls_scores, bbox_preds = self.bbox_head(hidden_states, references)
+        bbox_results = self.bbox_head.get_bboxes(
+            cls_scores, bbox_preds, enc_output_class, enc_output_coord, img_metas=img_metas, **kwargs
+        )
+
+        if ctx.cfg["dump_features"]:
+            feature_vector = FeatureVectorHook.func(feats)
+            saliency_map = ActivationMapHook.func(cls_scores)
+            return (*bbox_results, feature_vector, saliency_map)
+
+        return bbox_results
diff --git a/otx/algorithms/detection/adapters/mmdet/models/heads/__init__.py b/otx/algorithms/detection/adapters/mmdet/models/heads/__init__.py
index 521dd14e83e..28da39d0a1b 100644
--- a/otx/algorithms/detection/adapters/mmdet/models/heads/__init__.py
+++ b/otx/algorithms/detection/adapters/mmdet/models/heads/__init__.py
@@ -6,23 +6,27 @@
 from .cross_dataset_detector_head import CrossDatasetDetectorHead
 from .custom_anchor_generator import SSDAnchorGeneratorClustered
 from .custom_atss_head import CustomATSSHead, CustomATSSHeadTrackingLossDynamics
+from .custom_dino_head import CustomDINOHead
 from .custom_fcn_mask_head import CustomFCNMaskHead
 from .custom_retina_head import CustomRetinaHead
 from .custom_roi_head import CustomRoIHead
 from .custom_ssd_head import CustomSSDHead
 from .custom_vfnet_head import CustomVFNetHead
 from .custom_yolox_head import CustomYOLOXHead
+from .detr_head import DETRHeadExtension
 
 __all__ = [
     "CrossDatasetDetectorHead",
     "SSDAnchorGeneratorClustered",
     "CustomATSSHead",
+    "CustomDINOHead",
     "CustomFCNMaskHead",
     "CustomRetinaHead",
     "CustomSSDHead",
     "CustomRoIHead",
     "CustomVFNetHead",
     "CustomYOLOXHead",
+    "DETRHeadExtension",
     # Loss dynamics tracking
     "CustomATSSHeadTrackingLossDynamics",
 ]
diff --git a/otx/algorithms/detection/adapters/mmdet/models/heads/custom_dino_head.py b/otx/algorithms/detection/adapters/mmdet/models/heads/custom_dino_head.py
new file mode 100644
index 00000000000..e17ec30bf55
--- /dev/null
+++ b/otx/algorithms/detection/adapters/mmdet/models/heads/custom_dino_head.py
@@ -0,0 +1,629 @@
+"""Custom DINO head for OTX template."""
+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+from typing import Dict, List, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from mmcv.utils import Config
+from mmdet.core import bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh, multi_apply, reduce_mean
+from mmdet.models.builder import HEADS
+from mmdet.models.dense_heads import DeformableDETRHead
+from mmdet.models.utils.transformer import inverse_sigmoid
+from torch import Tensor
+
+from otx.algorithms.detection.adapters.mmdet.models.heads.detr_head import DETRHeadExtension
+from otx.algorithms.detection.adapters.mmdet.models.layers import CdnQueryGenerator
+
+
+@HEADS.register_module()
+class CustomDINOHead(DeformableDETRHead, DETRHeadExtension):
+    """Head of DINO."""
+
+    def __init__(self, *args, dn_cfg: Optional[Config] = None, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        if dn_cfg is not None:
+            assert "num_classes" not in dn_cfg and "embed_dims" not in dn_cfg and "num_matching_queries" not in dn_cfg, (
+                "The three keyword args `num_classes`, `embed_dims`, and "
+                "`num_matching_queries` are set in `detector.__init__()`, "
+                "users should not set them in `dn_cfg` config."
+            )
+            dn_cfg["num_classes"] = self.num_classes
+            dn_cfg["embed_dims"] = self.embed_dims
+            dn_cfg["num_matching_queries"] = self.num_query
+        self.transformer.dn_query_generator = CdnQueryGenerator(**dn_cfg)
+        self.transformer.two_stage_num_proposals = self.num_query
+
+    def _init_layers(self):
+        """Initialize classification branch and regression branch of head."""
+        super()._init_layers()
+        self.query_embedding = torch.nn.Embedding(self.num_query, self.embed_dims)
+
+    def forward_train(self, x, img_metas, gt_bboxes, gt_labels=None, gt_bboxes_ignore=None, proposal_cfg=None):
+        """Forward function for training mode.
+
+        Args:
+            x (list[Tensor]): Features from backbone.
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes (Tensor): Ground truth bboxes of the image,
+                shape (num_gts, 4).
+            gt_labels (Tensor): Ground truth labels of each box,
+                shape (num_gts,).
+            gt_bboxes_ignore (Tensor): Ground truth bboxes to be
+                ignored, shape (num_ignored_gts, 4).
+            proposal_cfg (mmcv.Config): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        assert proposal_cfg is None, '"proposal_cfg" must be None'
+        outs = self.forward_transformer(x, gt_bboxes, gt_labels, img_metas)
+        batch_data_samples = []
+        for img_meta, gt_bbox, gt_label in zip(img_metas, gt_bboxes, gt_labels):
+            info = Config({"metainfo": img_meta, "gt_instances": {"bboxes": gt_bbox, "labels": gt_label}})
+            batch_data_samples.append(info)
+        loss_inputs = outs + (batch_data_samples,)
+        losses = self.loss(*loss_inputs)
+        return losses
+
+    def forward_transformer(self, mlvl_feats, gt_bboxes, gt_labels, img_metas):
+        """Forward function.
+
+        Args:
+            mlvl_feats (tuple[Tensor]): Features from the upstream
+                network, each is a 4D-tensor with shape
+                (N, C, H, W).
+            gt_bboxes (List[Tensor | None]): List of ground truth bboxes.
+                When model is evaluated, it will be list of None.
+            gt_labels (List[Tensor | None]): List of ground truth labels.
+                When model is evaluated, it will be list of None.
+            img_metas (list[dict]): List of image information.
+
+        Returns:
+            all_cls_scores (Tensor): Outputs from the classification head, \
+                shape [nb_dec, bs, num_query, cls_out_channels]. Note \
+                cls_out_channels should include background.
+            all_bbox_preds (Tensor): Sigmoid outputs from the regression \
+                head with normalized coordinate format (cx, cy, w, h). \
+                Shape [nb_dec, bs, num_query, 4].
+            enc_outputs_class (Tensor): The score of each point on encode \
+                feature map, has shape (N, h*w, num_class). Only when \
+                as_two_stage is True it would be returned, otherwise \
+                `None` would be returned.
+            enc_outputs_coord (Tensor): The proposal generate from the \
+                encode feature map, has shape (N, h*w, 4). Only when \
+                as_two_stage is True it would be returned, otherwise \
+                `None` would be returned.
+        """
+
+        batch_size = mlvl_feats[0].size(0)
+        input_img_h, input_img_w = img_metas[0]["batch_input_shape"]
+        img_masks = mlvl_feats[0].new_ones((batch_size, input_img_h, input_img_w))
+        for img_id in range(batch_size):
+            img_h, img_w, _ = img_metas[img_id]["img_shape"]
+            img_masks[img_id, :img_h, :img_w] = 0
+
+        mlvl_masks = []
+        mlvl_positional_encodings = []
+        for feat in mlvl_feats:
+            mlvl_masks.append(F.interpolate(img_masks[None], size=feat.shape[-2:]).to(torch.bool).squeeze(0))
+            mlvl_positional_encodings.append(self.positional_encoding(mlvl_masks[-1]))
+
+        query_embeds = self.query_embedding.weight
+        batch_info = []
+        for img_meta, gt_bbox, gt_label in zip(img_metas, gt_bboxes, gt_labels):
+            info = {
+                "img_shape": img_meta["img_shape"][:2],
+                "bboxes": gt_bbox,
+                "labels": gt_label,
+            }
+            batch_info.append(info)
+        return self.transformer(
+            batch_info,
+            mlvl_feats,
+            mlvl_masks,
+            query_embeds,
+            mlvl_positional_encodings,
+            reg_branches=self.reg_branches,
+            cls_branches=self.cls_branches,
+        )
+
+    def loss(
+        self,
+        hidden_states: Tensor,
+        references: List[Tensor],
+        enc_outputs_class: Tensor,
+        enc_outputs_coord: Tensor,
+        dn_meta: Dict[str, int],
+        batch_data_samples,
+    ) -> dict:
+        """Perform forward propagation and loss calculation.
+
+        Args:
+            hidden_states (Tensor): Hidden states output from each decoder
+                layer, has shape (num_decoder_layers, bs, num_queries_total,
+                dim), where `num_queries_total` is the sum of
+                `num_denoising_queries` and `num_matching_queries` when
+                `self.training` is `True`, else `num_matching_queries`.
+            references (list[Tensor]): List of the reference from the decoder.
+                The first reference is the `init_reference` (initial) and the
+                other num_decoder_layers(6) references are `inter_references`
+                (intermediate). The `init_reference` has shape (bs,
+                num_queries_total, 4) and each `inter_reference` has shape
+                (bs, num_queries, 4) with the last dimension arranged as
+                (cx, cy, w, h).
+            enc_outputs_class (Tensor): The score of each point on encode
+                feature map, has shape (bs, num_feat_points, cls_out_channels).
+            enc_outputs_coord (Tensor): The proposal generate from the
+                encode feature map, has shape (bs, num_feat_points, 4) with the
+                last dimension arranged as (cx, cy, w, h).
+            batch_data_samples (list[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            dn_meta (Dict[str, int]): The dictionary saves information about
+                group collation, including 'num_denoising_queries' and
+                'num_denoising_groups'. It will be used for split outputs of
+                denoising and matching parts and loss calculation.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        batch_gt_instances = []
+        batch_img_metas = []
+        for data_sample in batch_data_samples:
+            batch_img_metas.append(data_sample.metainfo)
+            batch_gt_instances.append(data_sample.gt_instances)
+
+        outs = self(hidden_states, references)
+        loss_inputs = outs + (enc_outputs_class, enc_outputs_coord, batch_gt_instances, batch_img_metas, dn_meta)
+        losses = self.loss_by_feat_two_stage(*loss_inputs)
+        return losses
+
+    def forward(self, hidden_states, references):
+        """Forward function.
+
+        Args:
+            hidden_states (Tensor): Hidden states output from each decoder
+                layer, has shape (num_decoder_layers, bs, num_queries, dim).
+            references (list[Tensor]): List of the reference coordinates from
+                the decoder, each has shape (bs, num_queries, 4) with the last
+                dimension arranged as (cx, cy, w, h), or (bs, num_queries, 2)
+                when only center points are refined.
+
+        Returns:
+            tuple[Tensor]: Stacked classification scores and bbox predictions
+                of all decoder layers, with shapes (num_decoder_layers, bs,
+                num_queries, cls_out_channels) and (num_decoder_layers, bs,
+                num_queries, 4), respectively.
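+
+        Example (illustrative shapes only, assuming a 6-layer decoder, batch
+        size 2, 900 queries, 256 dims and 80 classes on a hypothetical
+        ``head`` instance):
+
+            >>> cls_scores, bbox_preds = head(hidden_states, references)
+            >>> cls_scores.shape, bbox_preds.shape
+            (torch.Size([6, 2, 900, 80]), torch.Size([6, 2, 900, 4]))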
+ """ + all_layers_outputs_classes = [] + all_layers_outputs_coords = [] + + for layer_id in range(hidden_states.shape[0]): + reference = inverse_sigmoid(references[layer_id]) + # NOTE The last reference will not be used. + hidden_state = hidden_states[layer_id] + outputs_class = self.cls_branches[layer_id](hidden_state) + tmp_reg_preds = self.reg_branches[layer_id](hidden_state) + if reference.shape[-1] == 4: + # When `layer` is 0 and `as_two_stage` of the detector + # is `True`, or when `layer` is greater than 0 and + # `with_box_refine` of the detector is `True`. + tmp_reg_preds += reference + else: + # When `layer` is 0 and `as_two_stage` of the detector + # is `False`, or when `layer` is greater than 0 and + # `with_box_refine` of the detector is `False`. + assert reference.shape[-1] == 2 + tmp_reg_preds[..., :2] += reference + outputs_coord = tmp_reg_preds.sigmoid() + all_layers_outputs_classes.append(outputs_class) + all_layers_outputs_coords.append(outputs_coord) + + all_layers_outputs_classes = torch.stack(all_layers_outputs_classes) + all_layers_outputs_coords = torch.stack(all_layers_outputs_coords) + + return all_layers_outputs_classes, all_layers_outputs_coords + + def loss_by_feat_two_stage( + self, + all_layers_cls_scores: Tensor, + all_layers_bbox_preds: Tensor, + enc_cls_scores: Tensor, + enc_bbox_preds: Tensor, + batch_gt_instances, + batch_img_metas: List[dict], + dn_meta: Dict[str, int], + batch_gt_instances_ignore=None, + ) -> Dict[str, Tensor]: + """Loss function. + + Args: + all_layers_cls_scores (Tensor): Classification scores of all + decoder layers, has shape (num_decoder_layers, bs, + num_queries_total, cls_out_channels), where + `num_queries_total` is the sum of `num_denoising_queries` + and `num_matching_queries`. + all_layers_bbox_preds (Tensor): Regression outputs of all decoder + layers. Each is a 4D-tensor with normalized coordinate format + (cx, cy, w, h) and has shape (num_decoder_layers, bs, + num_queries_total, 4). + enc_cls_scores (Tensor): The score of each point on encode + feature map, has shape (bs, num_feat_points, cls_out_channels). + enc_bbox_preds (Tensor): The proposal generate from the encode + feature map, has shape (bs, num_feat_points, 4) with the last + dimension arranged as (cx, cy, w, h). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
+ """ + # extract denoising and matching part of outputs + ( + all_layers_matching_cls_scores, + all_layers_matching_bbox_preds, + all_layers_denoising_cls_scores, + all_layers_denoising_bbox_preds, + ) = self.split_outputs(all_layers_cls_scores, all_layers_bbox_preds, dn_meta) + + loss_dict = super(DeformableDETRHead, self).loss_by_feat( + all_layers_matching_cls_scores, + all_layers_matching_bbox_preds, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore, + ) + + # loss of proposal generated from encode feature map. + if enc_cls_scores is not None: + # NOTE The enc_loss calculation of the DINO is + # different from that of Deformable DETR. + enc_loss_cls, enc_losses_bbox, enc_losses_iou = self.loss_by_feat_single( + enc_cls_scores, enc_bbox_preds, batch_gt_instances=batch_gt_instances, batch_img_metas=batch_img_metas + ) + loss_dict["enc_loss_cls"] = enc_loss_cls + loss_dict["enc_loss_bbox"] = enc_losses_bbox + loss_dict["enc_loss_iou"] = enc_losses_iou + + if all_layers_denoising_cls_scores is not None: + # calculate denoising loss from all decoder layers + dn_losses_cls, dn_losses_bbox, dn_losses_iou = self.loss_dn( + all_layers_denoising_cls_scores, + all_layers_denoising_bbox_preds, + batch_gt_instances=batch_gt_instances, + batch_img_metas=batch_img_metas, + dn_meta=dn_meta, + ) + # collate denoising loss + loss_dict["dn_loss_cls"] = dn_losses_cls[-1] + loss_dict["dn_loss_bbox"] = dn_losses_bbox[-1] + loss_dict["dn_loss_iou"] = dn_losses_iou[-1] + for num_dec_layer, (loss_cls_i, loss_bbox_i, loss_iou_i) in enumerate( + zip(dn_losses_cls[:-1], dn_losses_bbox[:-1], dn_losses_iou[:-1]) + ): + loss_dict[f"d{num_dec_layer}.dn_loss_cls"] = loss_cls_i + loss_dict[f"d{num_dec_layer}.dn_loss_bbox"] = loss_bbox_i + loss_dict[f"d{num_dec_layer}.dn_loss_iou"] = loss_iou_i + return loss_dict + + def loss_dn( + self, + all_layers_denoising_cls_scores: Tensor, + all_layers_denoising_bbox_preds: Tensor, + batch_gt_instances, + batch_img_metas: List[dict], + dn_meta: Dict[str, int], + ) -> Tuple[List[Tensor], ...]: + """Calculate denoising loss. + + Args: + all_layers_denoising_cls_scores (Tensor): Classification scores of + all decoder layers in denoising part, has shape ( + num_decoder_layers, bs, num_denoising_queries, + cls_out_channels). + all_layers_denoising_bbox_preds (Tensor): Regression outputs of all + decoder layers in denoising part. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and has shape + (num_decoder_layers, bs, num_denoising_queries, 4). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + + Returns: + Tuple[List[Tensor]]: The loss_dn_cls, loss_dn_bbox, and loss_dn_iou + of each decoder layers. 
+ """ + return multi_apply( + self._loss_dn_single, + all_layers_denoising_cls_scores, + all_layers_denoising_bbox_preds, + batch_gt_instances=batch_gt_instances, + batch_img_metas=batch_img_metas, + dn_meta=dn_meta, + ) + + def _loss_dn_single( + self, + dn_cls_scores: Tensor, + dn_bbox_preds: Tensor, + batch_gt_instances, + batch_img_metas: List[dict], + dn_meta: Dict[str, int], + ) -> Tuple[Tensor, ...]: + """Denoising loss for outputs from a single decoder layer. + + Args: + dn_cls_scores (Tensor): Classification scores of a single decoder + layer in denoising part, has shape (bs, num_denoising_queries, + cls_out_channels). + dn_bbox_preds (Tensor): Regression outputs of a single decoder + layer in denoising part. Each is a 4D-tensor with normalized + coordinate format (cx, cy, w, h) and has shape + (bs, num_denoising_queries, 4). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + + Returns: + Tuple[Tensor]: A tuple including `loss_cls`, `loss_box` and + `loss_iou`. + """ + cls_reg_targets = self.get_dn_targets(batch_gt_instances, batch_img_metas, dn_meta) + ( + labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + num_total_pos, + num_total_neg, + ) = cls_reg_targets + labels = torch.cat(labels_list, 0) + label_weights = torch.cat(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + + # classification loss + cls_scores = dn_cls_scores.reshape(-1, self.cls_out_channels) + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = num_total_pos * 1.0 + num_total_neg * self.bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean(cls_scores.new_tensor([cls_avg_factor])) + cls_avg_factor = max(cls_avg_factor, 1) + + if len(cls_scores) > 0: + loss_cls = self.loss_cls(cls_scores, labels, label_weights, avg_factor=cls_avg_factor) + else: + loss_cls = torch.zeros(1, dtype=cls_scores.dtype, device=cls_scores.device) + + # Compute the average number of gt boxes across all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # construct factors used for rescale bboxes + factors = [] + for img_meta, bbox_pred in zip(batch_img_metas, dn_bbox_preds): + img_h, img_w = img_meta["img_shape"][:2] + factor = bbox_pred.new_tensor([img_w, img_h, img_w, img_h]).unsqueeze(0).repeat(bbox_pred.size(0), 1) + factors.append(factor) + factors = torch.cat(factors) + + # DETR regress the relative position of boxes (cxcywh) in the image, + # thus the learning target is normalized by the image size. 
So here + # we need to re-scale them for calculating IoU loss + bbox_preds = dn_bbox_preds.reshape(-1, 4) + bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors + bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors + + # regression IoU loss, defaultly GIoU loss + loss_iou = self.loss_iou(bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos) + + # regression L1 loss + loss_bbox = self.loss_bbox(bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos) + return loss_cls, loss_bbox, loss_iou + + def get_dn_targets(self, batch_gt_instances, batch_img_metas: List[Dict], dn_meta: Dict[str, int]) -> tuple: + """Get targets in denoising part for a batch of images. + + Args: + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + + Returns: + tuple: a tuple containing the following targets. + + - labels_list (list[Tensor]): Labels for all images. + - label_weights_list (list[Tensor]): Label weights for all images. + - bbox_targets_list (list[Tensor]): BBox targets for all images. + - bbox_weights_list (list[Tensor]): BBox weights for all images. + - num_total_pos (int): Number of positive samples in all images. + - num_total_neg (int): Number of negative samples in all images. + """ + ( + labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + pos_inds_list, + neg_inds_list, + ) = multi_apply(self._get_dn_targets_single, batch_gt_instances, batch_img_metas, dn_meta=dn_meta) + num_total_pos = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg = sum((inds.numel() for inds in neg_inds_list)) + return (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, num_total_pos, num_total_neg) + + def _get_dn_targets_single(self, gt_instances, img_meta: dict, dn_meta: Dict[str, int]) -> tuple: + """Get targets in denoising part for one image. + + Args: + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for one image. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + + Returns: + tuple[Tensor]: a tuple containing the following for one image. + + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. 
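+
+        Example (illustrative indices, assuming 2 GT boxes and
+        ``num_denoising_groups=2``, i.e. 8 denoising queries per image):
+
+            >>> pos_inds
+            tensor([0, 1, 4, 5])
+            >>> neg_inds  # pos_inds shifted by half a group
+            tensor([2, 3, 6, 7])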
+ """ + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + num_groups = dn_meta["num_denoising_groups"] + num_denoising_queries = dn_meta["num_denoising_queries"] + num_queries_each_group = int(num_denoising_queries / num_groups) + device = gt_bboxes.device + + if len(gt_labels) > 0: + t = torch.arange(len(gt_labels), dtype=torch.long, device=device) + t = t.unsqueeze(0).repeat(num_groups, 1) + pos_assigned_gt_inds = t.flatten() + pos_inds = torch.arange(num_groups, dtype=torch.long, device=device) + pos_inds = pos_inds.unsqueeze(1) * num_queries_each_group + t + pos_inds = pos_inds.flatten() + else: + pos_inds = pos_assigned_gt_inds = gt_bboxes.new_tensor([], dtype=torch.long) + + neg_inds = pos_inds + num_queries_each_group // 2 + + # label targets + labels = gt_bboxes.new_full((num_denoising_queries,), self.num_classes, dtype=torch.long) + labels[pos_inds] = gt_labels[pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_denoising_queries) + + # bbox targets + bbox_targets = torch.zeros(num_denoising_queries, 4, device=device) + bbox_weights = torch.zeros(num_denoising_queries, 4, device=device) + bbox_weights[pos_inds] = 1.0 + img_h, img_w = img_meta["img_shape"][:2] + + # DETR regress the relative position of boxes (cxcywh) in the image. + # Thus the learning target should be normalized by the image size, also + # the box format should be converted from defaultly x1y1x2y2 to cxcywh. + factor = gt_bboxes.new_tensor([img_w, img_h, img_w, img_h]).unsqueeze(0) + gt_bboxes_normalized = gt_bboxes / factor + gt_bboxes_targets = bbox_xyxy_to_cxcywh(gt_bboxes_normalized) + bbox_targets[pos_inds] = gt_bboxes_targets.repeat([num_groups, 1]) + + return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, neg_inds) + + @staticmethod + def split_outputs( + all_layers_cls_scores: Tensor, all_layers_bbox_preds: Tensor, dn_meta: Dict[str, int] + ) -> Tuple[Tensor, ...]: + """Split outputs of the denoising part and the matching part. + + For the total outputs of `num_queries_total` length, the former + `num_denoising_queries` outputs are from denoising queries, and + the rest `num_matching_queries` ones are from matching queries, + where `num_queries_total` is the sum of `num_denoising_queries` and + `num_matching_queries`. + + Args: + all_layers_cls_scores (Tensor): Classification scores of all + decoder layers, has shape (num_decoder_layers, bs, + num_queries_total, cls_out_channels). + all_layers_bbox_preds (Tensor): Regression outputs of all decoder + layers. Each is a 4D-tensor with normalized coordinate format + (cx, cy, w, h) and has shape (num_decoder_layers, bs, + num_queries_total, 4). + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. + + Returns: + Tuple[Tensor]: a tuple containing the following outputs. + + - all_layers_matching_cls_scores (Tensor): Classification scores + of all decoder layers in matching part, has shape + (num_decoder_layers, bs, num_matching_queries, cls_out_channels). + - all_layers_matching_bbox_preds (Tensor): Regression outputs of + all decoder layers in matching part. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and has shape + (num_decoder_layers, bs, num_matching_queries, 4). + - all_layers_denoising_cls_scores (Tensor): Classification scores + of all decoder layers in denoising part, has shape + (num_decoder_layers, bs, num_denoising_queries, + cls_out_channels). 
+            - all_layers_denoising_bbox_preds (Tensor): Regression outputs of
+              all decoder layers in denoising part. Each is a 4D-tensor with
+              normalized coordinate format (cx, cy, w, h) and has shape
+              (num_decoder_layers, bs, num_denoising_queries, 4).
+        """
+        if dn_meta is not None:
+            num_denoising_queries = dn_meta["num_denoising_queries"]
+            all_layers_denoising_cls_scores = all_layers_cls_scores[:, :, :num_denoising_queries, :]
+            all_layers_denoising_bbox_preds = all_layers_bbox_preds[:, :, :num_denoising_queries, :]
+            all_layers_matching_cls_scores = all_layers_cls_scores[:, :, num_denoising_queries:, :]
+            all_layers_matching_bbox_preds = all_layers_bbox_preds[:, :, num_denoising_queries:, :]
+        else:
+            all_layers_denoising_cls_scores = None
+            all_layers_denoising_bbox_preds = None
+            all_layers_matching_cls_scores = all_layers_cls_scores
+            all_layers_matching_bbox_preds = all_layers_bbox_preds
+        return (
+            all_layers_matching_cls_scores,
+            all_layers_matching_bbox_preds,
+            all_layers_denoising_cls_scores,
+            all_layers_denoising_bbox_preds,
+        )
+
+    def simple_test_bboxes(self, feats, img_metas, rescale=False):
+        """Test det bboxes without test-time augmentation.
+
+        Args:
+            feats (tuple[torch.Tensor]): Multi-level features from the
+                upstream network, each is a 4D-tensor.
+            img_metas (list[dict]): List of image information.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to False.
+
+        Returns:
+            list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple.
+                The first item is ``bboxes`` with shape (n, 5),
+                where 5 represent (tl_x, tl_y, br_x, br_y, score).
+                The shape of the second tensor in the tuple is ``labels``
+                with shape (n,)
+        """
+        # forward of this head requires img_metas
+        gt_bboxes = [None] * len(feats)
+        gt_labels = [None] * len(feats)
+        hidden_states, references, enc_output_class, enc_output_coord, _ = self.forward_transformer(
+            feats, gt_bboxes, gt_labels, img_metas
+        )
+        cls_scores, bbox_preds = self(hidden_states, references)
+        results_list = self.get_bboxes(
+            cls_scores, bbox_preds, enc_output_class, enc_output_coord, img_metas, rescale=rescale
+        )
+        return results_list
diff --git a/otx/algorithms/detection/adapters/mmdet/models/heads/detr_head.py b/otx/algorithms/detection/adapters/mmdet/models/heads/detr_head.py
new file mode 100644
index 00000000000..86841e12022
--- /dev/null
+++ b/otx/algorithms/detection/adapters/mmdet/models/heads/detr_head.py
@@ -0,0 +1,263 @@
+"""DETR Head extension for OTX DINO."""
+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+from typing import Dict, List, Tuple
+
+import torch
+from mmcv.runner import BaseModule
+from mmdet.core import bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh, multi_apply, reduce_mean
+from torch import Tensor
+
+
+class DETRHeadExtension(BaseModule):
+    """Head of DETR. DETR: End-to-End Object Detection with Transformers."""
+
+    def loss_by_feat(
+        self,
+        all_layers_cls_scores: Tensor,
+        all_layers_bbox_preds: Tensor,
+        batch_gt_instances,
+        batch_img_metas: List[dict],
+        batch_gt_instances_ignore=None,
+    ) -> Dict[str, Tensor]:
+        """Loss function.
+
+        Only outputs from the last feature level are used for computing
+        losses by default.
+
+        Args:
+            all_layers_cls_scores (Tensor): Classification outputs
+                of each decoder layers. Each is a 4D-tensor, has shape
+                (num_decoder_layers, bs, num_queries, cls_out_channels).
+            all_layers_bbox_preds (Tensor): Sigmoid regression
+                outputs of each decoder layers.
Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and shape + (num_decoder_layers, bs, num_queries, 4). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert batch_gt_instances_ignore is None, ( + f"{self.__class__.__name__} only supports " "for batch_gt_instances_ignore setting to None." + ) + + losses_cls, losses_bbox, losses_iou = multi_apply( + self.loss_by_feat_single, + all_layers_cls_scores, + all_layers_bbox_preds, + batch_gt_instances=batch_gt_instances, + batch_img_metas=batch_img_metas, + ) + + loss_dict = dict() + # loss from the last decoder layer + loss_dict["loss_cls"] = losses_cls[-1] + loss_dict["loss_bbox"] = losses_bbox[-1] + loss_dict["loss_iou"] = losses_iou[-1] + # loss from other decoder layers + num_dec_layer = 0 + for loss_cls_i, loss_bbox_i, loss_iou_i in zip(losses_cls[:-1], losses_bbox[:-1], losses_iou[:-1]): + loss_dict[f"d{num_dec_layer}.loss_cls"] = loss_cls_i + loss_dict[f"d{num_dec_layer}.loss_bbox"] = loss_bbox_i + loss_dict[f"d{num_dec_layer}.loss_iou"] = loss_iou_i + num_dec_layer += 1 + return loss_dict + + def loss_by_feat_single( + self, cls_scores: Tensor, bbox_preds: Tensor, batch_gt_instances, batch_img_metas: List[dict] + ) -> Tuple[Tensor, Tensor, Tensor]: + """Loss function for outputs from a single decoder layer of a single feature level. + + Args: + cls_scores (Tensor): Box score logits from a single decoder layer + for all images, has shape (bs, num_queries, cls_out_channels). + bbox_preds (Tensor): Sigmoid outputs from a single decoder layer + for all images, with normalized coordinate (cx, cy, w, h) and + shape (bs, num_queries, 4). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + + Returns: + Tuple[Tensor]: A tuple including `loss_cls`, `loss_box` and + `loss_iou`. 
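+
+        Example (illustrative rescaling of one normalized (cx, cy, w, h)
+        prediction on an assumed 100x100 image, mirroring the IoU-loss
+        preparation in the method body):
+
+            >>> pred = torch.tensor([[0.5, 0.5, 0.2, 0.4]])
+            >>> bbox_cxcywh_to_xyxy(pred) * pred.new_tensor([100, 100, 100, 100])
+            tensor([[40., 30., 60., 70.]])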
+ """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] + cls_reg_targets = self._get_targets(cls_scores_list, bbox_preds_list, batch_gt_instances, batch_img_metas) + ( + labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + num_total_pos, + num_total_neg, + ) = cls_reg_targets + labels = torch.cat(labels_list, 0) + label_weights = torch.cat(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + + # classification loss + cls_scores = cls_scores.reshape(-1, self.cls_out_channels) + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = num_total_pos * 1.0 + num_total_neg * self.bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean(cls_scores.new_tensor([cls_avg_factor])) + cls_avg_factor = max(cls_avg_factor, 1) + + loss_cls = self.loss_cls(cls_scores, labels, label_weights, avg_factor=cls_avg_factor) + + # Compute the average number of gt boxes across all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # construct factors used for rescale bboxes + factors = [] + for img_meta, bbox_pred in zip(batch_img_metas, bbox_preds): + (img_h, img_w,) = img_meta[ + "img_shape" + ][:2] + factor = bbox_pred.new_tensor([img_w, img_h, img_w, img_h]).unsqueeze(0).repeat(bbox_pred.size(0), 1) + factors.append(factor) + factors = torch.cat(factors, 0) + + # DETR regress the relative position of boxes (cxcywh) in the image, + # thus the learning target is normalized by the image size. So here + # we need to re-scale them for calculating IoU loss + bbox_preds = bbox_preds.reshape(-1, 4) + bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors + bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors + + # regression IoU loss, defaultly GIoU loss + loss_iou = self.loss_iou(bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos) + + # regression L1 loss + loss_bbox = self.loss_bbox(bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos) + return loss_cls, loss_bbox, loss_iou + + def _get_targets( + self, + cls_scores_list: List[Tensor], + bbox_preds_list: List[Tensor], + batch_gt_instances, + batch_img_metas: List[dict], + ) -> tuple: + """Compute regression and classification targets for a batch image. + + Outputs from a single decoder layer of a single feature level are used. + + Args: + cls_scores_list (list[Tensor]): Box score logits from a single + decoder layer for each image, has shape [num_queries, + cls_out_channels]. + bbox_preds_list (list[Tensor]): Sigmoid outputs from a single + decoder layer for each image, with normalized coordinate + (cx, cy, w, h) and shape [num_queries, 4]. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + + Returns: + tuple: a tuple containing the following targets. + + - labels_list (list[Tensor]): Labels for all images. + - label_weights_list (list[Tensor]): Label weights for all images. + - bbox_targets_list (list[Tensor]): BBox targets for all images. + - bbox_weights_list (list[Tensor]): BBox weights for all images. + - num_total_pos (int): Number of positive samples in all images. 
+ - num_total_neg (int): Number of negative samples in all images. + """ + ( + labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + pos_inds_list, + neg_inds_list, + ) = multi_apply( + self.__get_targets_single, cls_scores_list, bbox_preds_list, batch_gt_instances, batch_img_metas + ) + num_total_pos = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg = sum((inds.numel() for inds in neg_inds_list)) + return (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, num_total_pos, num_total_neg) + + def __get_targets_single(self, cls_score: Tensor, bbox_pred: Tensor, gt_instances, img_meta: dict) -> tuple: + """Compute regression and classification targets for one image. + + Outputs from a single decoder layer of a single feature level are used. + + Args: + cls_score (Tensor): Box score logits from a single decoder layer + for one image. Shape [num_queries, cls_out_channels]. + bbox_pred (Tensor): Sigmoid outputs from a single decoder layer + for one image, with normalized coordinate (cx, cy, w, h) and + shape [num_queries, 4]. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for one image. + + Returns: + tuple[Tensor]: a tuple containing the following for one image. + + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. + """ + img_h, img_w = img_meta["img_shape"][:2] + factor = bbox_pred.new_tensor([img_w, img_h, img_w, img_h]).unsqueeze(0) + num_bboxes = bbox_pred.size(0) + # # convert bbox_pred from xywh, normalized to xyxy, unnormalized + # bbox_pred = bbox_cxcywh_to_xyxy(bbox_pred) + # bbox_pred = bbox_pred * factor + + # assigner and sampler + assign_result = self.assigner.assign( + bbox_pred, cls_score, gt_instances.bboxes, gt_instances.labels, img_meta=img_meta + ) + + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + pos_inds = torch.nonzero(assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique() + neg_inds = torch.nonzero(assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique() + pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 + pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds.long(), :] + + # label targets + labels = gt_bboxes.new_full((num_bboxes,), self.num_classes, dtype=torch.long) + labels[pos_inds] = gt_labels[pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_bboxes) + + # bbox targets + bbox_targets = torch.zeros_like(bbox_pred) + bbox_weights = torch.zeros_like(bbox_pred) + bbox_weights[pos_inds] = 1.0 + + # DETR regress the relative position of boxes (cxcywh) in the image. + # Thus the learning target should be normalized by the image size, also + # the box format should be converted from defaultly x1y1x2y2 to cxcywh. 
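+        # Illustrative example (assumed 100x100 image): an x1y1x2y2 box
+        # [10, 20, 30, 60] is normalized to [0.1, 0.2, 0.3, 0.6] and then
+        # converted to cxcywh [0.2, 0.4, 0.2, 0.4].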
+        pos_gt_bboxes_normalized = pos_gt_bboxes / factor
+        pos_gt_bboxes_targets = bbox_xyxy_to_cxcywh(pos_gt_bboxes_normalized)
+        bbox_targets[pos_inds] = pos_gt_bboxes_targets
+        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, neg_inds)
diff --git a/otx/algorithms/detection/adapters/mmdet/models/layers/__init__.py b/otx/algorithms/detection/adapters/mmdet/models/layers/__init__.py
new file mode 100644
index 00000000000..4ded67b4b79
--- /dev/null
+++ b/otx/algorithms/detection/adapters/mmdet/models/layers/__init__.py
@@ -0,0 +1,9 @@
+"""Initial file for mmdetection layers for models."""
+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+from .dino import CustomDINOTransformer
+from .dino_layers import CdnQueryGenerator, DINOTransformerDecoder
+
+__all__ = ["CustomDINOTransformer", "DINOTransformerDecoder", "CdnQueryGenerator"]
diff --git a/otx/algorithms/detection/adapters/mmdet/models/layers/dino.py b/otx/algorithms/detection/adapters/mmdet/models/layers/dino.py
new file mode 100644
index 00000000000..f942b5b1717
--- /dev/null
+++ b/otx/algorithms/detection/adapters/mmdet/models/layers/dino.py
@@ -0,0 +1,169 @@
+"""Custom DINO transformer for OTX template."""
+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+import torch
+from mmdet.models.utils.builder import TRANSFORMER
+from mmdet.models.utils.transformer import DeformableDetrTransformer
+
+
+@TRANSFORMER.register_module()
+class CustomDINOTransformer(DeformableDetrTransformer):
+    """Custom DINO transformer."""
+
+    def init_layers(self):
+        """Initialize layers of the DINO."""
+        self.level_embeds = torch.nn.Parameter(torch.Tensor(self.num_feature_levels, self.embed_dims))
+
+        self.enc_output = torch.nn.Linear(self.embed_dims, self.embed_dims)
+        self.enc_output_norm = torch.nn.LayerNorm(self.embed_dims)
+
+    def forward(
+        self,
+        batch_info,
+        mlvl_feats,
+        mlvl_masks,
+        query_embed,
+        mlvl_pos_embeds,
+        reg_branches=None,
+        cls_branches=None,
+        **kwargs
+    ):
+        """Forward function for `Transformer`.
+
+        Args:
+            batch_info(list(dict(str, union(tuple, tensor)))):
+                Information about batch such as image shape and
+                gt information.
+            mlvl_feats (list(Tensor)): Input queries from
+                different level. Each element has shape
+                [bs, embed_dims, h, w].
+            mlvl_masks (list(Tensor)): The key_padding_mask from
+                different level used for encoder and decoder,
+                each element has shape [bs, h, w].
+            query_embed (Tensor): The query embedding for decoder,
+                with shape [num_query, c].
+            mlvl_pos_embeds (list(Tensor)): The positional encoding
+                of feats from different level, has the shape
+                [bs, embed_dims, h, w].
+            reg_branches (obj:`nn.ModuleList`): Regression heads for
+                feature maps from each decoder layer. Only would
+                be passed when
+                `with_box_refine` is True. Default to None.
+            cls_branches (obj:`nn.ModuleList`): Classification heads
+                for feature maps from each decoder layer. Only would
+                be passed when `as_two_stage`
+                is True. Default to None.
+            kwargs: Additional argument for forward_transformer function.
+
+
+        Returns:
+            tuple[Tensor]: results of decoder containing the following tensor.
+
+            - inter_states: Outputs from decoder. If
+                return_intermediate_dec is True output has shape \
+                (num_dec_layers, bs, num_query, embed_dims), else has \
+                shape (1, bs, num_query, embed_dims).
+            - init_reference_out: The initial value of reference \
+                points, has shape (bs, num_queries, 4).
+ - inter_references_out: The internal value of reference \ + points in decoder, has shape \ + (num_dec_layers, bs,num_query, embed_dims) + - enc_outputs_class: The classification score of \ + proposals generated from \ + encoder's feature maps, has shape \ + (batch, h*w, num_classes). \ + Only would be returned when `as_two_stage` is True, \ + otherwise None. + - enc_outputs_coord_unact: The regression results \ + generated from encoder's feature maps., has shape \ + (batch, h*w, 4). Only would \ + be returned when `as_two_stage` is True, \ + otherwise None. + """ + feat_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes = [] + for lvl, (feat, mask, pos_embed) in enumerate(zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)): + bs, c, h, w = feat.shape + spatial_shape = (h, w) + spatial_shapes.append(spatial_shape) + feat = feat.flatten(2).transpose(1, 2) + mask = mask.flatten(1) + pos_embed = pos_embed.flatten(2).transpose(1, 2) + lvl_pos_embed = pos_embed + self.level_embeds[lvl].view(1, 1, -1) + lvl_pos_embed_flatten.append(lvl_pos_embed) + feat_flatten.append(feat) + mask_flatten.append(mask) + feat_flatten = torch.cat(feat_flatten, 1) + mask_flatten = torch.cat(mask_flatten, 1) + lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) + spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=feat_flatten.device) + level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + valid_ratios = torch.stack([self.get_valid_ratio(m) for m in mlvl_masks], 1) + + reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=feat.device) + + feat_flatten = feat_flatten.permute(1, 0, 2) # (H*W, bs, embed_dims) + lvl_pos_embed_flatten = lvl_pos_embed_flatten.permute(1, 0, 2) # (H*W, bs, embed_dims) + memory = self.encoder( + query=feat_flatten, + key=None, + value=None, + query_pos=lvl_pos_embed_flatten, + query_key_padding_mask=mask_flatten, + spatial_shapes=spatial_shapes, + reference_points=reference_points, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + **kwargs + ) + + # pre_decoder part at mmdet 3.x version + memory = memory.permute(1, 0, 2) + bs, _, c = memory.shape + cls_out_features = cls_branches[self.decoder.num_layers].out_features + output_memory, output_proposals = self.gen_encoder_output_proposals(memory, mask_flatten, spatial_shapes) + enc_outputs_class = cls_branches[self.decoder.num_layers](output_memory) + enc_outputs_coord_unact = reg_branches[self.decoder.num_layers](output_memory) + output_proposals + + topk_indices = torch.topk(enc_outputs_class.max(-1)[0], k=self.two_stage_num_proposals, dim=1)[1] + topk_scores = torch.gather(enc_outputs_class, 1, topk_indices.unsqueeze(-1).repeat(1, 1, cls_out_features)) + topk_coords_unact = torch.gather(enc_outputs_coord_unact, 1, topk_indices.unsqueeze(-1).repeat(1, 1, 4)) + topk_coords = topk_coords_unact.sigmoid() + topk_coords_unact = topk_coords_unact.detach() + + query = query_embed[:, None, :] + query = query.repeat(1, bs, 1).transpose(0, 1) + if self.training: + dn_label_query, dn_bbox_query, dn_mask, dn_meta = self.dn_query_generator(batch_info) + query = torch.cat([dn_label_query, query], dim=1) + reference_points = torch.cat([dn_bbox_query, topk_coords_unact], dim=1) + else: + reference_points = topk_coords_unact + dn_mask, dn_meta = None, None + reference_points = reference_points.sigmoid() + + # forward_decoder part in mmdet 3.x + inter_states, references = self.decoder( + query=query, + 
value=memory,
+            key_padding_mask=mask_flatten,
+            self_attn_mask=dn_mask,
+            reference_points=reference_points,
+            spatial_shapes=spatial_shapes,
+            level_start_index=level_start_index,
+            valid_ratios=valid_ratios,
+            reg_branches=reg_branches,
+        )
+
+        if query.size(1) == self.two_stage_num_proposals:
+            # NOTE: This is to make sure label_embeding can be involved to
+            # produce loss even if there is no denoising query (no ground truth
+            # target in this GPU), otherwise, this will raise runtime error in
+            # distributed training.
+            inter_states[0] += self.dn_query_generator.label_embedding.weight[0, 0] * 0.0
+
+        return inter_states, list(references), topk_scores, topk_coords, dn_meta
diff --git a/otx/algorithms/detection/adapters/mmdet/models/layers/dino_layers.py b/otx/algorithms/detection/adapters/mmdet/models/layers/dino_layers.py
new file mode 100644
index 00000000000..fb82a1febb1
--- /dev/null
+++ b/otx/algorithms/detection/adapters/mmdet/models/layers/dino_layers.py
@@ -0,0 +1,609 @@
+"""DINO transformer layers for mmdetection models."""
+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+import math
+import warnings
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from mmcv.cnn.bricks.registry import TRANSFORMER_LAYER_SEQUENCE
+from mmcv.runner import BaseModule
+from mmcv.utils import Config
+from mmdet.core import bbox_xyxy_to_cxcywh
+from mmdet.models.utils.transformer import DeformableDetrTransformerDecoder, inverse_sigmoid
+from torch import Tensor, nn
+
+
+@TRANSFORMER_LAYER_SEQUENCE.register_module()
+class DINOTransformerDecoder(DeformableDetrTransformerDecoder):
+    """Transformer decoder of DINO."""
+
+    def __init__(self, *args, return_intermediate=False, **kwargs):
+        super().__init__(*args, return_intermediate=return_intermediate, **kwargs)
+        self.ref_point_head = MLP(self.embed_dims * 2, self.embed_dims, self.embed_dims, 2)
+        self.norm = nn.LayerNorm(self.embed_dims)
+
+    def forward(
+        self,
+        query: Tensor,
+        value: Tensor,
+        key_padding_mask: Tensor,
+        self_attn_mask: Tensor,
+        reference_points: Tensor,
+        spatial_shapes: Tensor,
+        level_start_index: Tensor,
+        valid_ratios: Tensor,
+        reg_branches: nn.ModuleList,
+        **kwargs,
+    ) -> Tensor:
+        """Forward function of Transformer decoder.
+
+        Args:
+            query (Tensor): The input query, has shape (num_queries, bs, dim).
+            value (Tensor): The input values, has shape (num_value, bs, dim).
+            key_padding_mask (Tensor): The `key_padding_mask` of `self_attn`
+                input. ByteTensor, has shape (num_queries, bs).
+            self_attn_mask (Tensor): The attention mask to prevent information
+                leakage from different denoising groups and matching parts, has
+                shape (num_queries_total, num_queries_total). It is `None` when
+                `self.training` is `False`.
+            reference_points (Tensor): The initial reference, has shape
+                (bs, num_queries, 4) with the last dimension arranged as
+                (cx, cy, w, h).
+            spatial_shapes (Tensor): Spatial shapes of features in all levels,
+                has shape (num_levels, 2), last dimension represents (h, w).
+            level_start_index (Tensor): The start index of each level.
+                A tensor has shape (num_levels, ) and can be represented
+                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
+            valid_ratios (Tensor): The ratios of the valid width and the valid
+                height relative to the width and the height of features in all
+                levels, has shape (bs, num_levels, 2).
+            reg_branches: (obj:`nn.ModuleList`): Used for refining the
+                regression results.
+            kwargs: Additional argument for attention layers.
+
+        Returns:
+            tuple[Tensor]: Output queries of the Transformer decoder and the
+            corresponding reference points. When `return_intermediate` is
+            `True`, the queries are stacked over all decoder layers with shape
+            (num_dec_layers, bs, num_queries, dim) and the reference points
+            with shape (num_dec_layers + 1, bs, num_queries, 4).
+        """
+        intermediate = []
+        intermediate_reference_points = [reference_points]
+        for lid, layer in enumerate(self.layers):
+            if reference_points.shape[-1] == 4:
+                reference_points_input = (
+                    reference_points[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None]
+                )
+            else:
+                assert reference_points.shape[-1] == 2
+                reference_points_input = reference_points[:, :, None] * valid_ratios[:, None]
+
+            query_sine_embed = coordinate_to_encoding(reference_points_input[:, :, 0, :])
+            query_pos = self.ref_point_head(query_sine_embed)
+
+            query = layer(
+                query.permute(1, 0, 2),
+                query_pos=query_pos.permute(1, 0, 2),
+                value=value.permute(1, 0, 2),
+                key_padding_mask=key_padding_mask,
+                attn_masks=[self_attn_mask, None],
+                spatial_shapes=spatial_shapes,
+                level_start_index=level_start_index,
+                valid_ratios=valid_ratios,
+                reference_points=reference_points_input,
+                **kwargs,
+            )
+
+            query = query.permute(1, 0, 2)
+            if reg_branches is not None:
+                tmp = reg_branches[lid](query)
+                assert reference_points.shape[-1] == 4
+                new_reference_points = tmp + inverse_sigmoid(reference_points, eps=1e-3)
+                new_reference_points = new_reference_points.sigmoid()
+                reference_points = new_reference_points.detach()
+
+            if self.return_intermediate:
+                intermediate.append(self.norm(query))
+                intermediate_reference_points.append(new_reference_points)
+                # NOTE this is for the "Look Forward Twice" module,
+                # in the DeformDETR, reference_points was appended.
+
+        if self.return_intermediate:
+            return torch.stack(intermediate), torch.stack(intermediate_reference_points)
+
+        return query, reference_points
+
+
+class CdnQueryGenerator(BaseModule):
+    """Implement query generator of the Contrastive denoising (CDN).
+
+    Proposed in `DINO: DETR with Improved DeNoising Anchor Boxes for End-to-End Object
+    Detection <https://arxiv.org/abs/2203.03605>`_.
+
+    Code is modified from the `official github repo
+    <https://github.com/IDEA-Research/DINO>`_.
+
+    Args:
+        num_classes (int): Number of object classes.
+        embed_dims (int): The embedding dimensions of the generated queries.
+        num_matching_queries (int): The queries number of the matching part.
+            Used for generating dn_mask.
+        label_noise_scale (float): The scale of label noise, defaults to 0.5.
+        box_noise_scale (float): The scale of box noise, defaults to 1.0.
+        group_cfg (:obj:`ConfigDict` or dict, optional): The config of the
+            denoising queries grouping, includes `dynamic`, `num_dn_queries`,
+            and `num_groups`. Two grouping strategies, 'static dn groups' and
+            'dynamic dn groups', are supported. When `dynamic` is `False`,
+            the `num_groups` should be set, and the number of denoising query
+            groups will always be `num_groups`. When `dynamic` is `True`, the
+            `num_dn_queries` should be set, and the group number will be
+            dynamic to ensure that the denoising queries number will not exceed
+            `num_dn_queries` to prevent large fluctuations of memory. Defaults
+            to `None`.
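+
+    Example (illustrative ``group_cfg`` values for the two strategies):
+
+        >>> dynamic_cfg = dict(dynamic=True, num_dn_queries=100)
+        >>> static_cfg = dict(dynamic=False, num_groups=5)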
+ """ + + def __init__( + self, + num_classes: int, + embed_dims: int, + num_matching_queries: int, + label_noise_scale: float = 0.5, + box_noise_scale: float = 1.0, + group_cfg: Optional[Config] = None, + ) -> None: + super().__init__() + self.num_classes = num_classes + self.embed_dims = embed_dims + self.num_matching_queries = num_matching_queries + self.label_noise_scale = label_noise_scale + self.box_noise_scale = box_noise_scale + + # prepare grouping strategy + group_cfg = {} if group_cfg is None else group_cfg + self.dynamic_dn_groups = group_cfg.get("dynamic", True) + if self.dynamic_dn_groups: + if "num_dn_queries" not in group_cfg: + warnings.warn("'num_dn_queries' should be set when using " "dynamic dn groups, use 100 as default.") + self.num_dn_queries = group_cfg.get("num_dn_queries", 100) + assert isinstance(self.num_dn_queries, int), ( + f"Expected the num_dn_queries to have type int, but got " + f"{self.num_dn_queries}({type(self.num_dn_queries)}). " + ) + else: + assert "num_groups" in group_cfg, "num_groups should be set when using static dn groups" + self.num_groups = group_cfg["num_groups"] + assert isinstance(self.num_groups, int), ( + f"Expected the num_groups to have type int, but got " f"{self.num_groups}({type(self.num_groups)}). " + ) + + # NOTE The original repo of DINO set the num_embeddings 92 for coco, + # 91 (0~90) of which represents target classes and the 92 (91) + # indicates `Unknown` class. However, the embedding of `unknown` class + # is not used in the original DINO. + # TODO: num_classes + 1 or num_classes ? + self.label_embedding = nn.Embedding(self.num_classes, self.embed_dims) + + def __call__(self, batch_info: List[Dict[str, Any]]) -> tuple: + """Generate contrastive denoising (cdn) queries with ground truth. + + Descriptions of the Number Values in code and comments: + - num_target_total: the total target number of the input batch + samples. + - max_num_target: the max target number of the input batch samples. + - num_noisy_targets: the total targets number after adding noise, + i.e., num_target_total * num_groups * 2. + - num_denoising_queries: the length of the output batched queries, + i.e., max_num_target * num_groups * 2. + + NOTE The format of input bboxes in batch_info is unnormalized + (x, y, x, y), and the output bbox queries are embedded by normalized + (cx, cy, w, h) format bboxes going through inverse_sigmoid. + + Args: + batch_info (list[dict[str, union[tuple, tensor]]]): List of the batch + information such as image size, and gt information. + + Returns: + tuple: The outputs of the dn query generator. + + - dn_label_query (Tensor): The output content queries for denoising + part, has shape (bs, num_denoising_queries, dim), where + `num_denoising_queries = max_num_target * num_groups * 2`. + - dn_bbox_query (Tensor): The output reference bboxes as positions + of queries for denoising part, which are embedded by normalized + (cx, cy, w, h) format bboxes going through inverse_sigmoid, has + shape (bs, num_denoising_queries, 4) with the last dimension + arranged as (cx, cy, w, h). + - attn_mask (Tensor): The attention mask to prevent information + leakage from different denoising groups and matching parts, + will be used as `self_attn_mask` of the `decoder`, has shape + (num_queries_total, num_queries_total), where `num_queries_total` + is the sum of `num_denoising_queries` and `num_matching_queries`. 
+ - dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + """ + # normalize bbox and collate ground truth (gt) + gt_labels_list = [] + gt_bboxes_list = [] + for sample in batch_info: + img_h, img_w = sample["img_shape"] + bboxes = sample["bboxes"] + factor = bboxes.new_tensor([img_w, img_h, img_w, img_h]).unsqueeze(0) + bboxes_normalized = bboxes / factor + gt_bboxes_list.append(bboxes_normalized) + gt_labels_list.append(sample["labels"]) + gt_labels = torch.cat(gt_labels_list) # (num_target_total, 4) + gt_bboxes = torch.cat(gt_bboxes_list) + + num_target_list = [len(bboxes) for bboxes in gt_bboxes_list] + max_num_target = max(num_target_list) + num_groups = self.get_num_groups(max_num_target) + + dn_label_query = self.generate_dn_label_query(gt_labels, num_groups) + dn_bbox_query = self.generate_dn_bbox_query(gt_bboxes, num_groups) + + # The `batch_idx` saves the batch index of the corresponding sample + # for each target, has shape (num_target_total). + batch_idx = torch.cat([torch.full_like(t.long(), i) for i, t in enumerate(gt_labels_list)]) + dn_label_query, dn_bbox_query = self.collate_dn_queries( + dn_label_query, dn_bbox_query, batch_idx, len(batch_info), num_groups + ) + + attn_mask = self.generate_dn_mask(max_num_target, num_groups, device=dn_label_query.device) + + dn_meta = dict(num_denoising_queries=int(max_num_target * 2 * num_groups), num_denoising_groups=num_groups) + + return dn_label_query, dn_bbox_query, attn_mask, dn_meta + + def get_num_groups(self, max_num_target: int = None) -> int: + """Calculate denoising query groups number. + + Two grouping strategies, 'static dn groups' and 'dynamic dn groups', + are supported. When `self.dynamic_dn_groups` is `False`, the number + of denoising query groups will always be `self.num_groups`. When + `self.dynamic_dn_groups` is `True`, the group number will be dynamic, + ensuring the denoising queries number will not exceed + `self.num_dn_queries` to prevent large fluctuations of memory. + + NOTE The `num_group` is shared for different samples in a batch. When + the target numbers in the samples varies, the denoising queries of the + samples containing fewer targets are padded to the max length. + + Args: + max_num_target (int, optional): The max target number of the batch + samples. It will only be used when `self.dynamic_dn_groups` is + `True`. Defaults to `None`. + + Returns: + int: The denoising group number of the current batch. + """ + if self.dynamic_dn_groups: + assert max_num_target is not None, "group_queries should be provided when using " "dynamic dn groups" + if max_num_target == 0: + num_groups = 1 + else: + num_groups = self.num_dn_queries // max_num_target + else: + num_groups = self.num_groups + if num_groups < 1: + num_groups = 1 + return int(num_groups) + + def generate_dn_label_query(self, gt_labels: Tensor, num_groups: int) -> Tensor: + """Generate noisy labels and their query embeddings. + + The strategy for generating noisy labels is: Randomly choose labels of + `self.label_noise_scale * 0.5` proportion and override each of them + with a random object category label. + + NOTE Not add noise to all labels. Besides, the `self.label_noise_scale + * 0.5` arg is the ratio of the chosen positions, which is higher than + the actual proportion of noisy labels, because the labels to override + may be correct. 
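Editor's note: a small worked check of the dynamic grouping rule implemented in `get_num_groups` above. The helper name is ours, and the numbers assume the default `num_dn_queries=100`.

    def num_groups_for(max_num_target: int, num_dn_queries: int = 100) -> int:
        # Mirrors get_num_groups: shrink the group count as targets grow.
        if max_num_target == 0:
            return 1
        return max(1, num_dn_queries // max_num_target)

    assert num_groups_for(5) == 20    # 5 targets -> 20 groups of 2 * 5 queries
    assert num_groups_for(150) == 1   # never fewer than one group
    assert num_groups_for(0) == 1     # empty images still get one group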
And the gap becomes larger as the number of target + categories decreases. The users should notice this and modify the scale + arg or the corresponding logic according to specific dataset. + + Args: + gt_labels (Tensor): The concatenated gt labels of all samples + in the batch, has shape (num_target_total, ) where + `num_target_total = sum(num_target_list)`. + num_groups (int): The number of denoising query groups. + + Returns: + Tensor: The query embeddings of noisy labels, has shape + (num_noisy_targets, embed_dims), where `num_noisy_targets = + num_target_total * num_groups * 2`. + """ + assert self.label_noise_scale > 0 + gt_labels_expand = gt_labels.repeat(2 * num_groups, 1).view(-1) # Note `* 2` # noqa + p = torch.rand_like(gt_labels_expand.float()) + chosen_indice = torch.nonzero(p < (self.label_noise_scale * 0.5)).view(-1) # Note `* 0.5` + new_labels = torch.randint_like(chosen_indice, 0, self.num_classes) + noisy_labels_expand = gt_labels_expand.scatter(0, chosen_indice, new_labels) + dn_label_query = self.label_embedding(noisy_labels_expand) + return dn_label_query + + def generate_dn_bbox_query(self, gt_bboxes: Tensor, num_groups: int) -> Tensor: + """Generate noisy bboxes and their query embeddings. + + The strategy for generating noisy bboxes is as follow: + + .. code:: text + + +--------------------+ + | negative | + | +----------+ | + | | positive | | + | | +-----|----+------------+ + | | | | | | + | +----+-----+ | | + | | | | + +---------+----------+ | + | | + | gt bbox | + | | + | +---------+----------+ + | | | | + | | +----+-----+ | + | | | | | | + +-------------|--- +----+ | | + | | positive | | + | +----------+ | + | negative | + +--------------------+ + + The random noise is added to the top-left and down-right point + positions, hence, normalized (x, y, x, y) format of bboxes are + required. The noisy bboxes of positive queries have the points + both within the inner square, while those of negative queries + have the points both between the inner and outer squares. + + Besides, the length of outer square is twice as long as that of + the inner square, i.e., self.box_noise_scale * w_or_h / 2. + NOTE The noise is added to all the bboxes. Moreover, there is still + unconsidered case when one point is within the positive square and + the others is between the inner and outer squares. + + Args: + gt_bboxes (Tensor): The concatenated gt bboxes of all samples + in the batch, has shape (num_target_total, 4) with the last + dimension arranged as (cx, cy, w, h) where + `num_target_total = sum(num_target_list)`. + num_groups (int): The number of denoising query groups. + + Returns: + Tensor: The output noisy bboxes, which are embedded by normalized + (cx, cy, w, h) format bboxes going through inverse_sigmoid, has + shape (num_noisy_targets, 4) with the last dimension arranged as + (cx, cy, w, h), where + `num_noisy_targets = num_target_total * num_groups * 2`. 
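Editor's note: a hedged re-enactment of the label-noising rule implemented in `generate_dn_label_query` above, with toy labels (the real method additionally embeds the result with `self.label_embedding`).

    import torch

    num_classes, label_noise_scale, num_groups = 80, 0.5, 2
    gt_labels = torch.tensor([3, 7, 21])
    # One positive and one negative copy per group, flattened.
    expanded = gt_labels.repeat(2 * num_groups, 1).view(-1)
    p = torch.rand_like(expanded.float())
    chosen = torch.nonzero(p < label_noise_scale * 0.5).view(-1)
    noisy = expanded.scatter(0, chosen, torch.randint_like(chosen, 0, num_classes))
    assert noisy.shape == expanded.shape  # (num_target_total * 2 * num_groups,)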
+ """ + assert self.box_noise_scale > 0 + device = gt_bboxes.device + + # expand gt_bboxes as groups + gt_bboxes_expand = gt_bboxes.repeat(2 * num_groups, 1) # xyxy + + # obtain index of negative queries in gt_bboxes_expand + positive_idx = torch.arange(len(gt_bboxes), dtype=torch.long, device=device) + positive_idx = positive_idx.unsqueeze(0).repeat(num_groups, 1) + positive_idx += 2 * len(gt_bboxes) * torch.arange(num_groups, dtype=torch.long, device=device)[:, None] + positive_idx = positive_idx.flatten() + negative_idx = positive_idx + len(gt_bboxes) + + # determine the sign of each element in the random part of the added + # noise to be positive or negative randomly. + rand_sign = ( + torch.randint_like(gt_bboxes_expand, low=0, high=2, dtype=torch.float32) * 2.0 - 1.0 + ) # [low, high), 1 or -1, randomly + + # calculate the random part of the added noise + rand_part = torch.rand_like(gt_bboxes_expand) # [0, 1) + rand_part[negative_idx] += 1.0 # pos: [0, 1); neg: [1, 2) + rand_part *= rand_sign # pos: (-1, 1); neg: (-2, -1] U [1, 2) + + # add noise to the bboxes + bboxes_whwh = bbox_xyxy_to_cxcywh(gt_bboxes_expand)[:, 2:].repeat(1, 2) + noisy_bboxes_expand = gt_bboxes_expand + torch.mul(rand_part, bboxes_whwh) * self.box_noise_scale / 2 # xyxy + noisy_bboxes_expand = noisy_bboxes_expand.clamp(min=0.0, max=1.0) + noisy_bboxes_expand = bbox_xyxy_to_cxcywh(noisy_bboxes_expand) + + dn_bbox_query = inverse_sigmoid(noisy_bboxes_expand, eps=1e-3) + return dn_bbox_query + + def collate_dn_queries( + self, input_label_query: Tensor, input_bbox_query: Tensor, batch_idx: Tensor, batch_size: int, num_groups: int + ) -> Tuple[Tensor, Tensor]: + """Collate generated queries to obtain batched dn queries. + + The strategy for query collation is as follow: + + .. code:: text + + input_queries (num_target_total, query_dim) + P_A1 P_B1 P_B2 N_A1 N_B1 N_B2 P'A1 P'B1 P'B2 N'A1 N'B1 N'B2 + |________ group1 ________| |________ group2 ________| + | + V + P_A1 Pad0 N_A1 Pad0 P'A1 Pad0 N'A1 Pad0 + P_B1 P_B2 N_B1 N_B2 P'B1 P'B2 N'B1 N'B2 + |____ group1 ____| |____ group2 ____| + batched_queries (batch_size, max_num_target, query_dim) + + where query_dim is 4 for bbox and self.embed_dims for label. + Notation: _-group 1; '-group 2; + A-Sample1(has 1 target); B-sample2(has 2 targets) + + Args: + input_label_query (Tensor): The generated label queries of all + targets, has shape (num_target_total, embed_dims) where + `num_target_total = sum(num_target_list)`. + input_bbox_query (Tensor): The generated bbox queries of all + targets, has shape (num_target_total, 4) with the last + dimension arranged as (cx, cy, w, h). + batch_idx (Tensor): The batch index of the corresponding sample + for each target, has shape (num_target_total). + batch_size (int): The size of the input batch. + num_groups (int): The number of denoising query groups. + + Returns: + tuple[Tensor]: Output batched label and bbox queries. + - batched_label_query (Tensor): The output batched label queries, + has shape (batch_size, max_num_target, embed_dims). + - batched_bbox_query (Tensor): The output batched bbox queries, + has shape (batch_size, max_num_target, 4) with the last dimension + arranged as (cx, cy, w, h). 
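Editor's note: the positive/negative noise split constructed in `generate_dn_bbox_query` above can be summarized with toy boxes; this sketch follows the same sign/offset construction (values here are illustrative only).

    import torch

    num_groups, box_noise_scale = 1, 1.0
    boxes = torch.rand(3, 4)                  # 3 toy GT boxes, xyxy in [0, 1]
    expand = boxes.repeat(2 * num_groups, 1)  # positives first, then negatives
    neg = torch.arange(3) + 3                 # negative-query rows
    sign = torch.randint_like(expand, low=0, high=2, dtype=torch.float32) * 2.0 - 1.0
    part = torch.rand_like(expand)
    part[neg] += 1.0                          # pos noise in [0, 1); neg in [1, 2)
    part *= sign                              # pos (-1, 1); neg +-[1, 2)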
+ """ + device = input_label_query.device + num_target_list = [torch.sum(batch_idx == idx) for idx in range(batch_size)] + max_num_target = max(num_target_list) + num_denoising_queries = int(max_num_target * 2 * num_groups) + + map_query_index = torch.cat([torch.arange(num_target, device=device) for num_target in num_target_list]) + map_query_index = torch.cat([map_query_index + max_num_target * i for i in range(2 * num_groups)]).long() + batch_idx_expand = batch_idx.repeat(2 * num_groups, 1).view(-1) + mapper = (batch_idx_expand, map_query_index) + + batched_label_query = torch.zeros(batch_size, num_denoising_queries, self.embed_dims, device=device) + batched_bbox_query = torch.zeros(batch_size, num_denoising_queries, 4, device=device) + + batched_label_query[mapper] = input_label_query + batched_bbox_query[mapper] = input_bbox_query + return batched_label_query, batched_bbox_query + + def generate_dn_mask(self, max_num_target: int, num_groups: int, device: Union[torch.device, str]) -> Tensor: + """Generate attention mask to prevent information leakage from different denoising groups and matching parts. + + .. code:: text + + 0 0 0 0 1 1 1 1 0 0 0 0 0 + 0 0 0 0 1 1 1 1 0 0 0 0 0 + 0 0 0 0 1 1 1 1 0 0 0 0 0 + 0 0 0 0 1 1 1 1 0 0 0 0 0 + 1 1 1 1 0 0 0 0 0 0 0 0 0 + 1 1 1 1 0 0 0 0 0 0 0 0 0 + 1 1 1 1 0 0 0 0 0 0 0 0 0 + 1 1 1 1 0 0 0 0 0 0 0 0 0 + 1 1 1 1 1 1 1 1 0 0 0 0 0 + 1 1 1 1 1 1 1 1 0 0 0 0 0 + 1 1 1 1 1 1 1 1 0 0 0 0 0 + 1 1 1 1 1 1 1 1 0 0 0 0 0 + 1 1 1 1 1 1 1 1 0 0 0 0 0 + max_num_target |_| |_________| num_matching_queries + |_____________| num_denoising_queries + + 1 -> True (Masked), means 'can not see'. + 0 -> False (UnMasked), means 'can see'. + + Args: + max_num_target (int): The max target number of the input batch + samples. + num_groups (int): The number of denoising query groups. + device (obj:`device` or str): The device of generated mask. + + Returns: + Tensor: The attention mask to prevent information leakage from + different denoising groups and matching parts, will be used as + `self_attn_mask` of the `decoder`, has shape (num_queries_total, + num_queries_total), where `num_queries_total` is the sum of + `num_denoising_queries` and `num_matching_queries`. + """ + num_denoising_queries = int(max_num_target * 2 * num_groups) + num_queries_total = num_denoising_queries + self.num_matching_queries + attn_mask = torch.zeros(num_queries_total, num_queries_total, device=device, dtype=torch.bool) + # Make the matching part cannot see the denoising groups + attn_mask[num_denoising_queries:, :num_denoising_queries] = True + # Make the denoising groups cannot see each other + for i in range(num_groups): + # Mask rows of one group per step. + row_scope = slice(max_num_target * 2 * i, max_num_target * 2 * (i + 1)) + left_scope = slice(max_num_target * 2 * i) + right_scope = slice(max_num_target * 2 * (i + 1), num_denoising_queries) + attn_mask[row_scope, right_scope] = True + attn_mask[row_scope, left_scope] = True + return attn_mask + + +class MLP(BaseModule): + """Very simple multi-layer perceptron (also called FFN) with relu. Mostly used in DETR series detectors. + + Args: + input_dim (int): Feature dim of the input tensor. + hidden_dim (int): Feature dim of the hidden layer. + output_dim (int): Feature dim of the output tensor. + num_layers (int): Number of FFN layers. As the last + layer of MLP only contains FFN (Linear). 
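Editor's note: for a concrete instance of the mask diagram above, the sketch below reproduces the same block structure for 2 targets, 2 groups, and 3 matching queries (a toy re-implementation, not the registered module).

    import torch

    max_num_target, num_groups, num_matching = 2, 2, 3
    num_dn = max_num_target * 2 * num_groups          # 8 denoising queries
    total = num_dn + num_matching
    mask = torch.zeros(total, total, dtype=torch.bool)
    mask[num_dn:, :num_dn] = True                     # matching cannot see dn
    for i in range(num_groups):                       # dn groups are isolated
        rows = slice(max_num_target * 2 * i, max_num_target * 2 * (i + 1))
        mask[rows, : max_num_target * 2 * i] = True
        mask[rows, max_num_target * 2 * (i + 1) : num_dn] = True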
+    """
+
+    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int) -> None:
+        super().__init__()
+        self.num_layers = num_layers
+        h = [hidden_dim] * (num_layers - 1)
+        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Forward function of MLP.
+
+        Args:
+            x (Tensor): The input feature, has shape
+                (num_queries, bs, input_dim).
+
+        Returns:
+            Tensor: The output feature, has shape
+                (num_queries, bs, output_dim).
+        """
+        for i, layer in enumerate(self.layers):
+            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
+        return x
+
+
+def coordinate_to_encoding(
+    coord_tensor: Tensor, num_feats: int = 128, temperature: int = 10000, scale: float = 2 * math.pi
+):
+    """Convert coordinate tensor to positional encoding.
+
+    Args:
+        coord_tensor (Tensor): Coordinate tensor to be converted to
+            positional encoding. With the last dimension as 2 or 4.
+        num_feats (int, optional): The feature dimension for each position
+            along x-axis or y-axis. Note the final returned dimension
+            for each position is 2 times of this value. Defaults to 128.
+        temperature (int, optional): The temperature used for scaling
+            the position embedding. Defaults to 10000.
+        scale (float, optional): A scale factor that scales the position
+            embedding. The scale will be used only when `normalize` is True.
+            Defaults to 2*pi.
+
+    Returns:
+        Tensor: Returned encoded positional tensor.
+    """
+    dim_t = torch.arange(num_feats, dtype=torch.float32, device=coord_tensor.device)
+    dim_t = temperature ** (2 * (dim_t // 2) / num_feats)
+    x_embed = coord_tensor[..., 0] * scale
+    y_embed = coord_tensor[..., 1] * scale
+    pos_x = x_embed[..., None] / dim_t
+    pos_y = y_embed[..., None] / dim_t
+    pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), dim=-1).flatten(2)
+    pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()), dim=-1).flatten(2)
+    if coord_tensor.size(-1) == 2:
+        pos = torch.cat((pos_y, pos_x), dim=-1)
+    elif coord_tensor.size(-1) == 4:
+        w_embed = coord_tensor[..., 2] * scale
+        pos_w = w_embed[..., None] / dim_t
+        pos_w = torch.stack((pos_w[..., 0::2].sin(), pos_w[..., 1::2].cos()), dim=-1).flatten(2)
+
+        h_embed = coord_tensor[..., 3] * scale
+        pos_h = h_embed[..., None] / dim_t
+        pos_h = torch.stack((pos_h[..., 0::2].sin(), pos_h[..., 1::2].cos()), dim=-1).flatten(2)
+
+        pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=-1)
+    else:
+        raise ValueError("Unknown pos_tensor shape(-1):{}".format(coord_tensor.size(-1)))
+    return pos
diff --git a/otx/algorithms/detection/configs/detection/resnet50_dino/data_pipeline.py b/otx/algorithms/detection/configs/detection/resnet50_dino/data_pipeline.py
new file mode 100644
index 00000000000..4b577e21eb8
--- /dev/null
+++ b/otx/algorithms/detection/configs/detection/resnet50_dino/data_pipeline.py
@@ -0,0 +1,115 @@
+"""Data pipeline for DINO."""
+# dataset settings
+dataset_type = "CocoDataset"
+data_root = "data/coco/"
+img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+# train_pipeline, NOTE the img_scale and the Pad's size_divisor are different
+# from the default setting in mmdet.
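Editor's note: one dimensional consequence of `coordinate_to_encoding` from dino_layers.py above is worth spelling out before moving to the pipeline config: with the default `num_feats=128`, a 4-coordinate box yields a 512-dim embedding, which is exactly why `DINOTransformerDecoder.__init__` builds `ref_point_head = MLP(self.embed_dims * 2, ...)` for `embed_dims=256`.

    num_feats, n_box_coords, embed_dims = 128, 4, 256
    assert num_feats * n_box_coords == embed_dims * 2  # 512-dim sine embedding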
+train_pipeline = [
+    dict(type="LoadImageFromFile"),
+    dict(type="LoadAnnotations", with_bbox=True),
+    dict(type="RandomFlip", flip_ratio=0.5),
+    dict(
+        type="AutoAugment",
+        policies=[
+            [
+                dict(
+                    type="Resize",
+                    img_scale=[
+                        (480, 1333),
+                        (512, 1333),
+                        (544, 1333),
+                        (576, 1333),
+                        (608, 1333),
+                        (640, 1333),
+                        (672, 1333),
+                        (704, 1333),
+                        (736, 1333),
+                        (768, 1333),
+                        (800, 1333),
+                    ],
+                    multiscale_mode="value",
+                    keep_ratio=True,
+                )
+            ],
+            [
+                dict(
+                    type="Resize",
+                    # The ratio of all images in the train dataset is < 7,
+                    # following the original impl
+                    img_scale=[(400, 4200), (500, 4200), (600, 4200)],
+                    multiscale_mode="value",
+                    keep_ratio=True,
+                ),
+                dict(type="RandomCrop", crop_type="absolute_range", crop_size=(384, 600), allow_negative_crop=True),
+                dict(
+                    type="Resize",
+                    img_scale=[
+                        (480, 1333),
+                        (512, 1333),
+                        (544, 1333),
+                        (576, 1333),
+                        (608, 1333),
+                        (640, 1333),
+                        (672, 1333),
+                        (704, 1333),
+                        (736, 1333),
+                        (768, 1333),
+                        (800, 1333),
+                    ],
+                    multiscale_mode="value",
+                    override=True,
+                    keep_ratio=True,
+                ),
+            ],
+        ],
+    ),
+    dict(type="Normalize", **img_norm_cfg),
+    dict(type="Pad", size_divisor=1),
+    dict(type="DefaultFormatBundle"),
+    dict(type="Collect", keys=["img", "gt_bboxes", "gt_labels"]),
+]
+# test_pipeline, NOTE the Pad's size_divisor is different from the default
+# setting (size_divisor=32). There is little effect on the performance
+# whether we use the default setting or size_divisor=1.
+test_pipeline = [
+    dict(type="LoadImageFromFile"),
+    dict(
+        type="MultiScaleFlipAug",
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type="Resize", keep_ratio=True),
+            dict(type="RandomFlip"),
+            dict(type="Normalize", **img_norm_cfg),
+            dict(type="Pad", size_divisor=1),
+            dict(type="ImageToTensor", keys=["img"]),
+            dict(type="Collect", keys=["img"]),
+        ],
+    ),
+]
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        filter_empty_gt=False,
+        ann_file=data_root + "annotations/instances_train2017.json",
+        img_prefix=data_root + "train2017/",
+        pipeline=train_pipeline,
+    ),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + "annotations/instances_val2017.json",
+        img_prefix=data_root + "val2017/",
+        pipeline=test_pipeline,
+    ),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + "annotations/instances_val2017.json",
+        img_prefix=data_root + "val2017/",
+        pipeline=test_pipeline,
+    ),
+)
+evaluation = dict(interval=1, metric="bbox")
diff --git a/otx/algorithms/detection/configs/detection/resnet50_dino/deployment.py b/otx/algorithms/detection/configs/detection/resnet50_dino/deployment.py
new file mode 100644
index 00000000000..76b4a6544f5
--- /dev/null
+++ b/otx/algorithms/detection/configs/detection/resnet50_dino/deployment.py
@@ -0,0 +1,12 @@
+"""MMDeploy config of DINO model for Detection Task."""
+
+_base_ = ["../../base/deployments/base_detection_dynamic.py"]
+
+ir_config = dict(
+    output_names=["boxes", "labels"],
+    opset_version=16,
+)
+
+backend_config = dict(
+    model_inputs=[dict(opt_shapes=dict(input=[-1, 3, 800, 1333]))],
+)
diff --git a/otx/algorithms/detection/configs/detection/resnet50_dino/model.py b/otx/algorithms/detection/configs/detection/resnet50_dino/model.py
new file mode 100644
index 00000000000..a9cdf215901
--- /dev/null
+++ b/otx/algorithms/detection/configs/detection/resnet50_dino/model.py
@@ -0,0 +1,117 @@
+"""Model config for DINO."""
+model = dict(
+    type="CustomDINO",
+    backbone=dict(
+        type="ResNet",
+        depth=50,
+        num_stages=4,
+        out_indices=(1, 2,
3), + frozen_stages=1, + norm_cfg=dict(type="BN", requires_grad=False), + norm_eval=True, + style="pytorch", + init_cfg=dict(type="Pretrained", checkpoint="torchvision://resnet50"), + ), + neck=dict( + type="ChannelMapper", + in_channels=[512, 1024, 2048], + kernel_size=1, + out_channels=256, + act_cfg=None, + norm_cfg=dict(type="GN", num_groups=32), + num_outs=4, + ), + bbox_head=dict( + type="CustomDINOHead", + num_query=900, + num_classes=80, + in_channels=2048, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=True, + transformer=dict( + type="CustomDINOTransformer", + encoder=dict( + type="DetrTransformerEncoder", + num_layers=6, + transformerlayers=dict( + type="BaseTransformerLayer", + attn_cfgs=dict(type="MultiScaleDeformableAttention", embed_dims=256, dropout=0.0), + feedforward_channels=2048, + ffn_dropout=0.0, + operation_order=("self_attn", "norm", "ffn", "norm"), + ), + ), + decoder=dict( + type="DINOTransformerDecoder", + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type="DetrTransformerDecoderLayer", + attn_cfgs=[ + dict(type="MultiheadAttention", embed_dims=256, num_heads=8, dropout=0.0), + dict(type="MultiScaleDeformableAttention", embed_dims=256, dropout=0.0), + ], + feedforward_channels=2048, + ffn_dropout=0.0, + operation_order=("self_attn", "norm", "cross_attn", "norm", "ffn", "norm"), + ), + ), + ), + positional_encoding=dict( + type="SinePositionalEncoding", num_feats=128, normalize=True, offset=0.0, temperature=20 + ), + loss_cls=dict(type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), + loss_bbox=dict(type="L1Loss", loss_weight=5.0), + loss_iou=dict(type="GIoULoss", loss_weight=2.0), + dn_cfg=dict( + label_noise_scale=0.5, + box_noise_scale=1.0, # 0.4 for DN-DETR + group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=100), + ), + ), + # training and testing settings + train_cfg=dict( + assigner=dict( + type="HungarianAssigner", + cls_cost=dict(type="FocalLossCost", weight=1.0), + reg_cost=dict(type="BBoxL1Cost", weight=5.0, box_format="xywh"), + iou_cost=dict(type="IoUCost", iou_mode="giou", weight=2.0), + ) + ), + test_cfg=dict(max_per_img=300), +) +# optimizer +optimizer = dict( + type="AdamW", + lr=1e-4, + weight_decay=0.0001, + paramwise_cfg=dict( + custom_keys={ + "backbone": dict(lr_mult=0.1), + "sampling_offsets": dict(lr_mult=0.1), + "reference_points": dict(lr_mult=0.1), + } + ), +) +optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2)) +# learning policy +lr_config = dict(policy="step", step=[10]) +runner = dict(type="EpochRunnerWithCancel", max_epochs=12) +load_from = ( + "https://download.openmmlab.com/mmdetection/v3.0/dino/" + "dino-4scale_r50_8xb2-12e_coco/dino-4scale_r50_8xb2-12e_coco_20221202_182705-55b2bba2.pth" +) +resume_from = None + +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=100, + hooks=[ + dict(type="TextLoggerHook"), + ], +) +log_level = "INFO" +workflow = [("train", 1)] +task_adapt = dict(op="REPLACE", type="temp", efficient_mode=False, use_mpa_anchor=False) diff --git a/otx/algorithms/detection/configs/detection/resnet50_dino/template_experimental.yaml b/otx/algorithms/detection/configs/detection/resnet50_dino/template_experimental.yaml new file mode 100644 index 00000000000..cddcef9542f --- /dev/null +++ b/otx/algorithms/detection/configs/detection/resnet50_dino/template_experimental.yaml @@ -0,0 +1,64 @@ +# Description. 
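Editor's note: a brief worked check on the optimizer block in model.py above, before the template definition. The `paramwise_cfg` multipliers mean backbone, sampling-offset, and reference-point parameters train at a tenth of the base rate (values copied from the config above).

    base_lr, lr_mult = 1e-4, 0.1
    assert abs(base_lr * lr_mult - 1e-5) < 1e-12  # effective backbone LR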
+model_template_id: Custom_Object_Detection_Gen3_DINO +name: DINO +task_type: DETECTION +task_family: VISION +instantiation: "CLASS" +summary: Class-Incremental Object Detection for DINO +application: ~ + +# Algo backend. +framework: OTXDetection v2.9.1 + +# Task implementations. +entrypoints: + base: otx.algorithms.detection.adapters.mmdet.task.MMDetectionTask + openvino: otx.algorithms.detection.adapters.openvino.task.OpenVINODetectionTask + nncf: otx.algorithms.detection.adapters.mmdet.nncf.task.DetectionNNCFTask + +# Capabilities. +capabilities: + - compute_representations + +# Hyperparameters. +hyper_parameters: + base_path: ../configuration.yaml + parameter_overrides: + learning_parameters: + batch_size: + default_value: 2 + auto_hpo_state: POSSIBLE + learning_rate: + default_value: 0.0001 + auto_hpo_state: POSSIBLE + learning_rate_warmup_iters: + default_value: 3 + num_iters: + default_value: 12 + nncf_optimization: + enable_quantization: + default_value: true + enable_pruning: + default_value: false + pruning_supported: + default_value: true + maximal_accuracy_degradation: + default_value: 1.0 + algo_backend: + train_type: + default_value: Incremental + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Stats. +gigaflops: ??? +size: ??? +# # Inference options. Defined by OpenVINO capabilities, not Algo Backend or Platform. +# inference_targets: +# - CPU +# - GPU +# - VPU From a00c25b5b3ace8ebbe3f0713c202e46473c7d9ad Mon Sep 17 00:00:00 2001 From: jaegukhyun Date: Fri, 23 Jun 2023 13:47:24 +0900 Subject: [PATCH 02/11] Modify docstrings --- .../models/detectors/custom_dino_detector.py | 2 +- .../mmdet/models/heads/custom_dino_head.py | 120 ++++++++++++++---- .../adapters/mmdet/models/heads/detr_head.py | 29 +++-- .../adapters/mmdet/models/layers/dino.py | 52 +++++--- .../mmdet/models/layers/dino_layers.py | 9 +- 5 files changed, 155 insertions(+), 57 deletions(-) diff --git a/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_dino_detector.py b/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_dino_detector.py index 3bfd97bfa05..e84bb25eeec 100644 --- a/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_dino_detector.py +++ b/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_dino_detector.py @@ -70,7 +70,7 @@ def load_state_dict_pre_hook(model, ckpt_dict, *args, **kwargs): "otx.algorithms.detection.adapters.mmdet.models.detectors.custom_dino_detector.CustomDINO.simple_test" ) def custom_dino__simple_test(ctx, self, img, img_metas, **kwargs): - """Function for custom_mask_rcnn__simple_test.""" + """Function for custom_dino__simple_test.""" height = int(img_metas[0]["img_shape"][0]) width = int(img_metas[0]["img_shape"][1]) img_metas[0]["batch_input_shape"] = (height, width) diff --git a/otx/algorithms/detection/adapters/mmdet/models/heads/custom_dino_head.py b/otx/algorithms/detection/adapters/mmdet/models/heads/custom_dino_head.py index e17ec30bf55..eae551f7ebd 100644 --- a/otx/algorithms/detection/adapters/mmdet/models/heads/custom_dino_head.py +++ b/otx/algorithms/detection/adapters/mmdet/models/heads/custom_dino_head.py @@ -20,7 +20,13 @@ @HEADS.register_module() class CustomDINOHead(DeformableDETRHead, DETRHeadExtension): - """Head of DINO.""" + """Head of DINO. + + Based on detr_head.py and deformable_detr.py in mmdet2.x, some functions from dino_head.py in mmdet3.x are added. 
+ Forward structure: + - Training: self.forward_train -> self.forward_transformer -> self.forward -> self.loss + - Inference: self.simple_test_bboxes -> self.forward_transformer -> self.forward -> self.get_bboxes + """ def __init__(self, *args, dn_cfg: Optional[Config] = None, **kwargs): super().__init__(*args, **kwargs) @@ -45,6 +51,10 @@ def _init_layers(self): def forward_train(self, x, img_metas, gt_bboxes, gt_labels=None, gt_bboxes_ignore=None, proposal_cfg=None): """Forward function for training mode. + Origin impelmentation: forward_train function of detr_head.py in mmdet2.x + What's changed: Divided self.forward into self.forward_transformer + self.forward. + This kind of structure is from mmdet3.x. + Args: x (list[Tensor]): Features from backbone. img_metas (list[dict]): Meta information of each image, e.g., @@ -72,7 +82,11 @@ def forward_train(self, x, img_metas, gt_bboxes, gt_labels=None, gt_bboxes_ignor return losses def forward_transformer(self, mlvl_feats, gt_bboxes, gt_labels, img_metas): - """Forward function. + """Transformers's forward function. + + Origin implementation: forward function of deformable_detr_head.py in mmdet2.x + What's changed: Original implementation has post-processing process after getting outputs from + self.transformer. However, this function directly return outputs from self.transformer Args: mlvl_feats (tuple[Tensor]): Features from the upstream @@ -99,6 +113,10 @@ def forward_transformer(self, mlvl_feats, gt_bboxes, gt_labels, img_metas): encode feature map, has shape (N, h*w, 4). Only when \ as_two_stage is True it would be returned, otherwise \ `None` would be returned. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. """ batch_size = mlvl_feats[0].size(0) @@ -140,10 +158,14 @@ def loss( enc_outputs_class: Tensor, enc_outputs_coord: Tensor, dn_meta: Dict[str, int], - batch_data_samples, + batch_data_samples: List[Config], ) -> dict: """Perform forward propagation and loss calculation. + Original implementation: loss function of dino_head.py in mmdet3.x + What's changed: Change the name of function of loss_by_feat to loss_by_feat_two_stage since + there are changes in function input from parent's implementation. + Args: hidden_states (Tensor): Hidden states output from each decoder layer, has shape (num_decoder_layers, bs, num_queries_total, @@ -162,13 +184,12 @@ def loss( enc_outputs_coord (Tensor): The proposal generate from the encode feature map, has shape (bs, num_feat_points, 4) with the last dimension arranged as (cx, cy, w, h). - batch_data_samples (list[:obj:`DetDataSample`]): The Data - Samples. It usually includes information such as - `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. dn_meta (Dict[str, int]): The dictionary saves information about group collation, including 'num_denoising_queries' and 'num_denoising_groups'. It will be used for split outputs of denoising and matching parts and loss calculation. + batch_data_samples (List[Config]): This is same with batch_data_samples in mmdet3.x + It contains meta_info(==img_metas) and gt_instances(==(gt_bboxes, gt_labels)) Returns: dict: A dictionary of loss components. @@ -187,8 +208,35 @@ def loss( def forward(self, hidden_states, references): """Forward function. - T.B.D. 
+ Original implementation: forward function of deformable_detr_head.py in mmdet3.x + What's changed: None + + Args: + hidden_states (Tensor): Hidden states output from each decoder + layer, has shape (num_decoder_layers, bs, num_queries, dim). + references (list[Tensor]): List of the reference from the decoder. + The first reference is the `init_reference` (initial) and the + other num_decoder_layers(6) references are `inter_references` + (intermediate). The `init_reference` has shape (bs, + num_queries, 4) when `as_two_stage` of the detector is `True`, + otherwise (bs, num_queries, 2). Each `inter_reference` has + shape (bs, num_queries, 4) when `with_box_refine` of the + detector is `True`, otherwise (bs, num_queries, 2). The + coordinates are arranged as (cx, cy) when the last dimension is + 2, and (cx, cy, w, h) when it is 4. + + Returns: + tuple[Tensor]: results of head containing the following tensor. + + - all_layers_outputs_classes (Tensor): Outputs from the + classification head, has shape (num_decoder_layers, bs, + num_queries, cls_out_channels). + - all_layers_outputs_coords (Tensor): Sigmoid outputs from the + regression head with normalized coordinate format (cx, cy, w, + h), has shape (num_decoder_layers, bs, num_queries, 4) with the + last dimension arranged as (cx, cy, w, h). """ + all_layers_outputs_classes = [] all_layers_outputs_coords = [] @@ -224,13 +272,16 @@ def loss_by_feat_two_stage( all_layers_bbox_preds: Tensor, enc_cls_scores: Tensor, enc_bbox_preds: Tensor, - batch_gt_instances, + batch_gt_instances: List[Config], batch_img_metas: List[dict], dn_meta: Dict[str, int], batch_gt_instances_ignore=None, ) -> Dict[str, Tensor]: """Loss function. + Original implementation: loss_by_feat function of dino_head.py in mmdet3.x + What's changed: Name of function is changed. Parent's loss_by_feat function has different inputs. + Args: all_layers_cls_scores (Tensor): Classification scores of all decoder layers, has shape (num_decoder_layers, bs, @@ -246,9 +297,8 @@ def loss_by_feat_two_stage( enc_bbox_preds (Tensor): The proposal generate from the encode feature map, has shape (bs, num_feat_points, 4) with the last dimension arranged as (cx, cy, w, h). - batch_gt_instances (list[:obj:`InstanceData`]): Batch of - gt_instance. It usually includes ``bboxes`` and ``labels`` - attributes. + batch_gt_instances (List[Config]): Batch of gt_instance. + It usually includes ``bboxes`` and ``labels`` attributes. batch_img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. dn_meta (Dict[str, int]): The dictionary saves information about @@ -315,12 +365,15 @@ def loss_dn( self, all_layers_denoising_cls_scores: Tensor, all_layers_denoising_bbox_preds: Tensor, - batch_gt_instances, + batch_gt_instances: List[Config], batch_img_metas: List[dict], dn_meta: Dict[str, int], ) -> Tuple[List[Tensor], ...]: """Calculate denoising loss. + Original implementation: loss_dn function of dino_head.py in mmdet3.x + What's changed: None + Args: all_layers_denoising_cls_scores (Tensor): Classification scores of all decoder layers in denoising part, has shape ( @@ -330,9 +383,8 @@ def loss_dn( decoder layers in denoising part. Each is a 4D-tensor with normalized coordinate format (cx, cy, w, h) and has shape (num_decoder_layers, bs, num_denoising_queries, 4). - batch_gt_instances (list[:obj:`InstanceData`]): Batch of - gt_instance. It usually includes ``bboxes`` and ``labels`` - attributes. + batch_gt_instances (List[Config]): Batch of gt_instance. 
+ It usually includes ``bboxes`` and ``labels`` attributes. batch_img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. dn_meta (Dict[str, int]): The dictionary saves information about @@ -357,12 +409,15 @@ def _loss_dn_single( self, dn_cls_scores: Tensor, dn_bbox_preds: Tensor, - batch_gt_instances, + batch_gt_instances: List[Config], batch_img_metas: List[dict], dn_meta: Dict[str, int], ) -> Tuple[Tensor, ...]: """Denoising loss for outputs from a single decoder layer. + Original implementation: _loss_dn_single function of dino_head.py in mmdet3.x + What's changed: None + Args: dn_cls_scores (Tensor): Classification scores of a single decoder layer in denoising part, has shape (bs, num_denoising_queries, @@ -371,9 +426,8 @@ def _loss_dn_single( layer in denoising part. Each is a 4D-tensor with normalized coordinate format (cx, cy, w, h) and has shape (bs, num_denoising_queries, 4). - batch_gt_instances (list[:obj:`InstanceData`]): Batch of - gt_instance. It usually includes ``bboxes`` and ``labels`` - attributes. + batch_gt_instances (List[Config]): Batch of gt_instance. + It usually includes ``bboxes`` and ``labels`` attributes. batch_img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. dn_meta (Dict[str, int]): The dictionary saves information about @@ -439,13 +493,17 @@ def _loss_dn_single( loss_bbox = self.loss_bbox(bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos) return loss_cls, loss_bbox, loss_iou - def get_dn_targets(self, batch_gt_instances, batch_img_metas: List[Dict], dn_meta: Dict[str, int]) -> tuple: + def get_dn_targets( + self, batch_gt_instances: List[Config], batch_img_metas: List[Dict], dn_meta: Dict[str, int] + ) -> tuple: """Get targets in denoising part for a batch of images. + Original implementation: get_dn_targets function of dino_head.py in mmdet3.x + What's changed: None + Args: - batch_gt_instances (list[:obj:`InstanceData`]): Batch of - gt_instance. It usually includes ``bboxes`` and ``labels`` - attributes. + batch_gt_instances (List[Config]): Batch of gt_instance. + It usually includes ``bboxes`` and ``labels`` attributes. batch_img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. dn_meta (Dict[str, int]): The dictionary saves information about @@ -475,13 +533,14 @@ def get_dn_targets(self, batch_gt_instances, batch_img_metas: List[Dict], dn_met num_total_neg = sum((inds.numel() for inds in neg_inds_list)) return (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, num_total_pos, num_total_neg) - def _get_dn_targets_single(self, gt_instances, img_meta: dict, dn_meta: Dict[str, int]) -> tuple: + def _get_dn_targets_single(self, gt_instances: Config, img_meta: dict, dn_meta: Dict[str, int]) -> tuple: """Get targets in denoising part for one image. + Original implementation: _get_dn_targets_single function of dino_head.py in mmdet3.x + What's changed: None + Args: - gt_instances (:obj:`InstanceData`): Ground truth of instance - annotations. It should includes ``bboxes`` and ``labels`` - attributes. + gt_instances (Config): A gt_instance which usually includes ``bboxes`` and ``labels`` attributes. img_meta (dict): Meta information for one image. dn_meta (Dict[str, int]): The dictionary saves information about group collation, including 'num_denoising_queries' and @@ -544,6 +603,9 @@ def split_outputs( ) -> Tuple[Tensor, ...]: """Split outputs of the denoising part and the matching part. 
+ Original implementation: split_outputs function of dino_head.py in mmdet3.x + What's changed: None + For the total outputs of `num_queries_total` length, the former `num_denoising_queries` outputs are from denoising queries, and the rest `num_matching_queries` ones are from matching queries, @@ -602,6 +664,10 @@ def split_outputs( def simple_test_bboxes(self, feats, img_metas, rescale=False): """Test det bboxes without test-time augmentation. + Original implementation: simple_test_bboxes funciton of detr_head.py in mmdet2.x + What's changed: self.forward function is divided into self.forward_transformer and self.forward function. + This changes is from mmdet3.x + Args: feats (tuple[torch.Tensor]): Multi-level features from the upstream network, each is a 4D-tensor. diff --git a/otx/algorithms/detection/adapters/mmdet/models/heads/detr_head.py b/otx/algorithms/detection/adapters/mmdet/models/heads/detr_head.py index 86841e12022..29ab16e22e2 100644 --- a/otx/algorithms/detection/adapters/mmdet/models/heads/detr_head.py +++ b/otx/algorithms/detection/adapters/mmdet/models/heads/detr_head.py @@ -7,18 +7,24 @@ import torch from mmcv.runner import BaseModule +from mmcv.utils import Config from mmdet.core import bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh, multi_apply, reduce_mean from torch import Tensor class DETRHeadExtension(BaseModule): - """Head of DETR. DETR:End-to-End Object Detection with Transformers.""" + """Head of DETR. DETR:End-to-End Object Detection with Transformers. + + Origin implementation: DETRHead of detr_head.py in mmdet3.x + What's changed: Change data type of batch_gt_instances from InstanceList to List[Config]. + Since InstanceList is a new data type from mmdet3.x, List[Config] will replace it. + """ def loss_by_feat( self, all_layers_cls_scores: Tensor, all_layers_bbox_preds: Tensor, - batch_gt_instances, + batch_gt_instances: List[Config], batch_img_metas: List[dict], batch_gt_instances_ignore=None, ) -> Dict[str, Tensor]: @@ -35,9 +41,8 @@ def loss_by_feat( outputs of each decoder layers. Each is a 4D-tensor with normalized coordinate format (cx, cy, w, h) and shape (num_decoder_layers, bs, num_queries, 4). - batch_gt_instances (list[:obj:`InstanceData`]): Batch of - gt_instance. It usually includes ``bboxes`` and ``labels`` - attributes. + batch_gt_instances (List[Config]): Batch of gt_instance. + It usually includes ``bboxes`` and ``labels`` attributes. batch_img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): @@ -75,7 +80,7 @@ def loss_by_feat( return loss_dict def loss_by_feat_single( - self, cls_scores: Tensor, bbox_preds: Tensor, batch_gt_instances, batch_img_metas: List[dict] + self, cls_scores: Tensor, bbox_preds: Tensor, batch_gt_instances: List[Config], batch_img_metas: List[dict] ) -> Tuple[Tensor, Tensor, Tensor]: """Loss function for outputs from a single decoder layer of a single feature level. @@ -85,9 +90,8 @@ def loss_by_feat_single( bbox_preds (Tensor): Sigmoid outputs from a single decoder layer for all images, with normalized coordinate (cx, cy, w, h) and shape (bs, num_queries, 4). - batch_gt_instances (list[:obj:`InstanceData`]): Batch of - gt_instance. It usually includes ``bboxes`` and ``labels`` - attributes. + batch_gt_instances (List[Config]): Batch of gt_instance. + It usually includes ``bboxes`` and ``labels`` attributes. batch_img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. 
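Editor's note: to illustrate the denoising/matching split described in `split_outputs` above, here is a shape-only sketch with toy sizes; the real code reads `num_denoising_queries` from `dn_meta`.

    import torch

    num_denoising_queries = 8                  # from dn_meta in the real code
    scores = torch.randn(6, 2, 11, 80)         # (layers, bs, queries, classes)
    dn_part = scores[:, :, :num_denoising_queries, :]
    matching_part = scores[:, :, num_denoising_queries:, :]
    assert dn_part.shape[2] + matching_part.shape[2] == scores.shape[2]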
@@ -155,7 +159,7 @@ def _get_targets(
         self,
         cls_scores_list: List[Tensor],
         bbox_preds_list: List[Tensor],
-        batch_gt_instances,
+        batch_gt_instances: List[Config],
         batch_img_metas: List[dict],
     ) -> tuple:
         """Compute regression and classification targets for a batch image.
@@ -169,9 +173,8 @@ def _get_targets(
             bbox_preds_list (list[Tensor]): Sigmoid outputs from a single
                 decoder layer for each image, with normalized coordinate
                 (cx, cy, w, h) and shape [num_queries, 4].
-            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
-                gt_instance. It usually includes ``bboxes`` and ``labels``
-                attributes.
+            batch_gt_instances (List[Config]): Batch of gt_instance.
+                It usually includes ``bboxes`` and ``labels`` attributes.
             batch_img_metas (list[dict]): Meta information of each image,
                 e.g., image size, scaling factor, etc.

diff --git a/otx/algorithms/detection/adapters/mmdet/models/layers/dino.py b/otx/algorithms/detection/adapters/mmdet/models/layers/dino.py
index f942b5b1717..cab3f23183d 100644
--- a/otx/algorithms/detection/adapters/mmdet/models/layers/dino.py
+++ b/otx/algorithms/detection/adapters/mmdet/models/layers/dino.py
@@ -3,17 +3,28 @@
 # SPDX-License-Identifier: Apache-2.0
 #

+from typing import Dict, List, Optional, Tuple, Union
+
 import torch
 from mmdet.models.utils.builder import TRANSFORMER
 from mmdet.models.utils.transformer import DeformableDetrTransformer
+from torch import Tensor, nn


 @TRANSFORMER.register_module()
 class CustomDINOTransformer(DeformableDetrTransformer):
-    """Custom DINO transformer."""
+    """Custom DINO transformer.
+
+    Original implementation: mmdet.models.utils.transformer.DeformableDetrTransformer in mmdet2.x
+    What's changed: The forward function is modified.
+    The modified implementation comes from mmdet.models.detectors.dino.DINO in mmdet3.x
+    """

     def init_layers(self):
-        """Initialize layers of the DINO."""
+        """Initialize layers of the DINO.
+
+        Unlike Deformable DETR, DINO does not need pos_trans, pos_trans_norm.
+        """
         self.level_embeds = torch.nn.Parameter(torch.Tensor(self.num_feature_levels, self.embed_dims))

         self.enc_output = torch.nn.Linear(self.embed_dims, self.embed_dims)
@@ -21,17 +32,26 @@ def init_layers(self):

     def forward(
         self,
-        batch_info,
-        mlvl_feats,
-        mlvl_masks,
-        query_embed,
-        mlvl_pos_embeds,
-        reg_branches=None,
-        cls_branches=None,
+        batch_info: List[Dict[str, Union[Tuple, Tensor]]],
+        mlvl_feats: List[Tensor],
+        mlvl_masks: List[Tensor],
+        query_embed: Tensor,
+        mlvl_pos_embeds: List[Tensor],
+        reg_branches: Optional[nn.ModuleList] = None,
+        cls_branches: Optional[nn.ModuleList] = None,
         **kwargs
     ):
         """Forward function for `Transformer`.

+        What's changed:
+        In mmdet3.x, the forward pass of the transformer is divided into
+        pre_transformer() -> forward_encoder() -> pre_decoder() -> forward_decoder().
+        In comparison, the mmdet2.x forward function takes charge of all of the above.
+        The differences between Deformable DETR and DINO occur in pre_decoder() and forward_decoder(),
+        so this function modifies those parts. The modified implementation comes from
+        pre_decoder() and forward_decoder() of mmdet.models.detectors.dino.DINO in mmdet3.x.
+
         Args:
             batch_info(list(dict(str, union(tuple, tensor)))):
                 Information about batch such as image shaep,
@@ -65,8 +85,6 @@
                 return_intermediate_dec is True output has shape \
                 (num_dec_layers, bs, num_query, embed_dims), else has \
                 shape (1, bs, num_query, embed_dims).
-            - init_reference_out: The initial value of reference \
-                points, has shape (bs, num_queries, 4).
- inter_references_out: The internal value of reference \
                points in decoder, has shape \
                (num_dec_layers, bs,num_query, embed_dims)
@@ -81,11 +99,15 @@ def forward(
                 (batch, h*w, 4). Only would \
                 be returned when `as_two_stage` is True, \
                 otherwise None.
+            - dn_meta (Dict[str, int]): The dictionary saves information about
+                group collation, including 'num_denoising_queries' and
+                'num_denoising_groups'. It will be used for split outputs of
+                denoising and matching parts and loss calculation.
         """
-        feat_flatten = []
-        mask_flatten = []
-        lvl_pos_embed_flatten = []
-        spatial_shapes = []
+        feat_flatten: Union[Tensor, List[Tensor]] = []
+        mask_flatten: Union[Tensor, List[Tensor]] = []
+        lvl_pos_embed_flatten: Union[Tensor, List[Tensor]] = []
+        spatial_shapes: Union[Tensor, List[Tensor]] = []
         for lvl, (feat, mask, pos_embed) in enumerate(zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)):
             bs, c, h, w = feat.shape
             spatial_shape = (h, w)
diff --git a/otx/algorithms/detection/adapters/mmdet/models/layers/dino_layers.py b/otx/algorithms/detection/adapters/mmdet/models/layers/dino_layers.py
index fb82a1febb1..4dda964e3d8 100644
--- a/otx/algorithms/detection/adapters/mmdet/models/layers/dino_layers.py
+++ b/otx/algorithms/detection/adapters/mmdet/models/layers/dino_layers.py
@@ -39,7 +39,11 @@ def forward(
         reg_branches: nn.ModuleList,
         **kwargs,
     ) -> Tensor:
-        """Forward function of Transformer encoder.
+        """Forward function of Transformer decoder.
+
+        Original implementation: forward function of DinoTransformerDecoder in mmdet3.x.
+        What's changed: Since the implementation of the base transformer layer differs between mmdet2.x
+        and mmdet3.x, the input shape of each layer and some of its input parameters are modified.

         Args:
             query (Tensor): The input query, has shape (num_queries, bs, dim).
@@ -126,6 +130,9 @@ class CdnQueryGenerator(BaseModule):
     Code is modified from the `official github repo
     `_.

+    Original implementation: mmdet.models.layers.transformer.dino_layers.CdnQueryGenerator
+    What's changed: None
+
     Args:
         num_classes (int): Number of object classes.
         embed_dims (int): The embedding dimensions of the generated queries.
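Editor's note: as context for the flattening loop shown in the hunk above, a minimal sketch of how multi-level feature maps become the transformer's flat token sequence (toy shapes only).

    import torch

    feats = [torch.randn(2, 256, 23, 24), torch.randn(2, 256, 12, 12)]
    spatial_shapes = torch.tensor([[f.shape[2], f.shape[3]] for f in feats])
    flat = torch.cat([f.flatten(2).transpose(1, 2) for f in feats], dim=1)
    assert flat.shape == (2, 23 * 24 + 12 * 12, 256)  # (bs, sum(h*w), c)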
From fc4bc6ac443e66488b9c94940dbca1dea92e5681 Mon Sep 17 00:00:00 2001 From: jaegukhyun Date: Fri, 23 Jun 2023 14:06:04 +0900 Subject: [PATCH 03/11] Add mmengine to detection requirements --- requirements/detection.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/detection.txt b/requirements/detection.txt index 4c3802395d4..a0e94ea71bc 100644 --- a/requirements/detection.txt +++ b/requirements/detection.txt @@ -6,4 +6,5 @@ pytorchcv mmcls==0.25.0 timm==0.6.12 mmdeploy==0.14.0 +mmengine==0.7.4 scikit-image From a6712be97fac6a89eef7ca3eaf8a6671a2a429ad Mon Sep 17 00:00:00 2001 From: jaegukhyun Date: Fri, 23 Jun 2023 16:00:59 +0900 Subject: [PATCH 04/11] Add unit tests --- .../mmdet/models/heads/custom_dino_head.py | 30 ++- .../mmdet/models/detectors/conftest.py | 92 +++++++- .../detectors/test_custom_dino_detector.py | 52 +++++ .../adapters/mmdet/models/heads/__init__.py | 3 + .../models/heads/test_custom_dino_head.py | 213 ++++++++++++++++++ 5 files changed, 381 insertions(+), 9 deletions(-) create mode 100644 tests/unit/algorithms/detection/adapters/mmdet/models/detectors/test_custom_dino_detector.py create mode 100644 tests/unit/algorithms/detection/adapters/mmdet/models/heads/__init__.py create mode 100644 tests/unit/algorithms/detection/adapters/mmdet/models/heads/test_custom_dino_head.py diff --git a/otx/algorithms/detection/adapters/mmdet/models/heads/custom_dino_head.py b/otx/algorithms/detection/adapters/mmdet/models/heads/custom_dino_head.py index eae551f7ebd..7da7d4fa8a8 100644 --- a/otx/algorithms/detection/adapters/mmdet/models/heads/custom_dino_head.py +++ b/otx/algorithms/detection/adapters/mmdet/models/heads/custom_dino_head.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 # -from typing import Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import torch import torch.nn.functional as F @@ -48,7 +48,15 @@ def _init_layers(self): super()._init_layers() self.query_embedding = torch.nn.Embedding(self.num_query, self.embed_dims) - def forward_train(self, x, img_metas, gt_bboxes, gt_labels=None, gt_bboxes_ignore=None, proposal_cfg=None): + def forward_train( + self, + x: Tuple[Tensor], + img_metas: List[Dict[str, Any]], + gt_bboxes: List[Tensor], + gt_labels: Optional[List[Tensor]] = None, + gt_bboxes_ignore: Optional[List[Tensor]] = None, + proposal_cfg: Optional[Config] = None, + ): """Forward function for training mode. Origin impelmentation: forward_train function of detr_head.py in mmdet2.x @@ -59,11 +67,11 @@ def forward_train(self, x, img_metas, gt_bboxes, gt_labels=None, gt_bboxes_ignor x (list[Tensor]): Features from backbone. img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. - gt_bboxes (Tensor): Ground truth bboxes of the image, + gt_bboxes (List[Tensor]): Ground truth bboxes of the image, shape (num_gts, 4). - gt_labels (Tensor): Ground truth labels of each box, + gt_labels (List[Tensor]): Ground truth labels of each box, shape (num_gts,). - gt_bboxes_ignore (Tensor): Ground truth bboxes to be + gt_bboxes_ignore (List[Tensor]): Ground truth bboxes to be ignored, shape (num_ignored_gts, 4). proposal_cfg (mmcv.Config): Test / postprocessing configuration, if None, test_cfg would be used. 
@@ -81,7 +89,13 @@ def forward_train(self, x, img_metas, gt_bboxes, gt_labels=None, gt_bboxes_ignor losses = self.loss(*loss_inputs) return losses - def forward_transformer(self, mlvl_feats, gt_bboxes, gt_labels, img_metas): + def forward_transformer( + self, + mlvl_feats: Tuple[Tensor], + gt_bboxes: Optional[List[Tensor]], + gt_labels: Optional[List[Tensor]], + img_metas: List[Dict[str, Any]], + ): """Transformers's forward function. Origin implementation: forward function of deformable_detr_head.py in mmdet2.x @@ -205,7 +219,7 @@ def loss( losses = self.loss_by_feat_two_stage(*loss_inputs) return losses - def forward(self, hidden_states, references): + def forward(self, hidden_states: Tensor, references: List[Tensor]): """Forward function. Original implementation: forward function of deformable_detr_head.py in mmdet3.x @@ -661,7 +675,7 @@ def split_outputs( all_layers_denoising_bbox_preds, ) - def simple_test_bboxes(self, feats, img_metas, rescale=False): + def simple_test_bboxes(self, feats: Tuple[Tensor], img_metas: List[Dict[str, Any]], rescale=False): """Test det bboxes without test-time augmentation. Original implementation: simple_test_bboxes funciton of detr_head.py in mmdet2.x diff --git a/tests/unit/algorithms/detection/adapters/mmdet/models/detectors/conftest.py b/tests/unit/algorithms/detection/adapters/mmdet/models/detectors/conftest.py index 52b50f2722d..4ac44156cba 100644 --- a/tests/unit/algorithms/detection/adapters/mmdet/models/detectors/conftest.py +++ b/tests/unit/algorithms/detection/adapters/mmdet/models/detectors/conftest.py @@ -341,7 +341,7 @@ def fxt_cfg_custom_deformable_detr(num_classes: int = 3): bbox_head=dict( type="DeformableDETRHead", num_query=300, - num_classes=80, + num_classes=num_classes, in_channels=2048, sync_cls_avg_factor=True, with_box_refine=True, @@ -395,3 +395,93 @@ def fxt_cfg_custom_deformable_detr(num_classes: int = 3): dst_classes=["tree", "car", "person"], ), ) + + +@pytest.fixture +def fxt_cfg_custom_dino(num_classes: int = 3): + return ConfigDict( + type="CustomDINO", + backbone=dict( + type="ResNet", + depth=50, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type="BN", requires_grad=False), + norm_eval=True, + style="pytorch", + init_cfg=dict(type="Pretrained", checkpoint="torchvision://resnet50"), + ), + neck=dict( + type="ChannelMapper", + in_channels=[512, 1024, 2048], + kernel_size=1, + out_channels=256, + act_cfg=None, + norm_cfg=dict(type="GN", num_groups=32), + num_outs=4, + ), + bbox_head=dict( + type="CustomDINOHead", + num_query=900, + num_classes=num_classes, + in_channels=2048, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=True, + transformer=dict( + type="CustomDINOTransformer", + encoder=dict( + type="DetrTransformerEncoder", + num_layers=6, + transformerlayers=dict( + type="BaseTransformerLayer", + attn_cfgs=dict(type="MultiScaleDeformableAttention", embed_dims=256, dropout=0.0), + feedforward_channels=2048, + ffn_dropout=0.0, + operation_order=("self_attn", "norm", "ffn", "norm"), + ), + ), + decoder=dict( + type="DINOTransformerDecoder", + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type="DetrTransformerDecoderLayer", + attn_cfgs=[ + dict(type="MultiheadAttention", embed_dims=256, num_heads=8, dropout=0.0), + dict(type="MultiScaleDeformableAttention", embed_dims=256, dropout=0.0), + ], + feedforward_channels=2048, + ffn_dropout=0.0, + operation_order=("self_attn", "norm", "cross_attn", "norm", "ffn", "norm"), + ), + ), + ), + 
positional_encoding=dict(
+            type="SinePositionalEncoding", num_feats=128, normalize=True, offset=0.0, temperature=20
+        ),
+        loss_cls=dict(type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0),
+        loss_bbox=dict(type="L1Loss", loss_weight=5.0),
+        loss_iou=dict(type="GIoULoss", loss_weight=2.0),
+        dn_cfg=dict(
+            label_noise_scale=0.5,
+            box_noise_scale=1.0,  # 0.4 for DN-DETR
+            group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=100),
+        ),
+    ),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type="HungarianAssigner",
+            cls_cost=dict(type="FocalLossCost", weight=1.0),
+            reg_cost=dict(type="BBoxL1Cost", weight=5.0, box_format="xywh"),
+            iou_cost=dict(type="IoUCost", iou_mode="giou", weight=2.0),
+        )
+    ),
+    test_cfg=dict(max_per_img=300),
+    task_adapt=dict(
+        src_classes=["person", "car"],
+        dst_classes=["tree", "car", "person"],
+    ),
+)
diff --git a/tests/unit/algorithms/detection/adapters/mmdet/models/detectors/test_custom_dino_detector.py b/tests/unit/algorithms/detection/adapters/mmdet/models/detectors/test_custom_dino_detector.py
new file mode 100644
index 00000000000..22b89435f09
--- /dev/null
+++ b/tests/unit/algorithms/detection/adapters/mmdet/models/detectors/test_custom_dino_detector.py
@@ -0,0 +1,52 @@
+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+from typing import Dict
+import torch
+from mmdet.models.builder import build_detector
+
+from otx.algorithms.detection.adapters.mmdet.models.detectors.custom_dino_detector import (
+    CustomDINO,
+)
+from tests.test_suite.e2e_test_system import e2e_pytest_unit
+
+
+class TestCustomDINO:
+    @e2e_pytest_unit
+    def test_custom_dino_build(self, fxt_cfg_custom_dino: Dict):
+        model = build_detector(fxt_cfg_custom_dino)
+        assert isinstance(model, CustomDINO)
+
+    @e2e_pytest_unit
+    def test_custom_dino_load_state_pre_hook(self, fxt_cfg_custom_dino: Dict):
+        model = build_detector(fxt_cfg_custom_dino)
+        ckpt_dict = {
+            "level_embed": "level_embed",
+            "encoder.self_attn": "encoder.self_attn",
+            "encoder.cross_attn": "encoder.cross_attn",
+            "encoder.ffn": "encoder.ffn",
+            "decoder.self_attn": "decoder.self_attn",
+            "decoder.cross_attn": "decoder.cross_attn",
+            "decoder.ffn": "decoder.ffn",
+            "query_embedding.weight": "query_embedding.weight",
+            "dn_query_generator.label_embedding.weight": "dn_query_generator.label_embedding.weight",
+            "memory_trans_fc": "memory_trans_fc",
+            "memory_trans_norm": "memory_trans_norm",
+        }
+        model.load_state_dict_pre_hook(model, ckpt_dict)
+
+        assert ckpt_dict["bbox_head.transformer.level_embeds"] == "level_embed"
+        assert ckpt_dict["bbox_head.transformer.encoder.attentions.0"] == "encoder.self_attn"
+        assert ckpt_dict["bbox_head.transformer.encoder.attentions.1"] == "encoder.cross_attn"
+        assert ckpt_dict["bbox_head.transformer.encoder.ffns.0"] == "encoder.ffn"
+        assert ckpt_dict["bbox_head.transformer.decoder.attentions.0"] == "decoder.self_attn"
+        assert ckpt_dict["bbox_head.transformer.decoder.attentions.1"] == "decoder.cross_attn"
+        assert ckpt_dict["bbox_head.transformer.decoder.ffns.0"] == "decoder.ffn"
+        assert ckpt_dict["bbox_head.query_embedding.weight"] == "query_embedding.weight"
+        assert (
+            ckpt_dict["bbox_head.transformer.dn_query_generator.label_embedding.weight"]
+            == "dn_query_generator.label_embedding.weight"
+        )
+        assert ckpt_dict["bbox_head.transformer.enc_output"] == "memory_trans_fc"
+        assert ckpt_dict["bbox_head.transformer.enc_output_norm"] == "memory_trans_norm"
diff --git
a/tests/unit/algorithms/detection/adapters/mmdet/models/heads/__init__.py b/tests/unit/algorithms/detection/adapters/mmdet/models/heads/__init__.py new file mode 100644 index 00000000000..9c68be83ef0 --- /dev/null +++ b/tests/unit/algorithms/detection/adapters/mmdet/models/heads/__init__.py @@ -0,0 +1,3 @@ +# Copyright (C) 2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# diff --git a/tests/unit/algorithms/detection/adapters/mmdet/models/heads/test_custom_dino_head.py b/tests/unit/algorithms/detection/adapters/mmdet/models/heads/test_custom_dino_head.py new file mode 100644 index 00000000000..fec8abac448 --- /dev/null +++ b/tests/unit/algorithms/detection/adapters/mmdet/models/heads/test_custom_dino_head.py @@ -0,0 +1,213 @@ +# Copyright (C) 2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +import numpy as np +import pytest +import torch +from mmcv.utils import ConfigDict +from mmdet.core import build_assigner +from mmdet.models.builder import build_detector + +from otx.algorithms.detection.adapters.mmdet.models.heads.custom_dino_head import ( + CustomDINOHead, +) +from tests.test_suite.e2e_test_system import e2e_pytest_unit + + +class TestCustomDINOHead: + @pytest.fixture(autouse=True) + def setup(self): + cfg = ConfigDict( + dict( + type="CustomDINOHead", + num_query=900, + num_classes=80, + in_channels=2048, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=True, + transformer=dict( + type="CustomDINOTransformer", + encoder=dict( + type="DetrTransformerEncoder", + num_layers=6, + transformerlayers=dict( + type="BaseTransformerLayer", + attn_cfgs=dict(type="MultiScaleDeformableAttention", embed_dims=256, dropout=0.0), + feedforward_channels=2048, + ffn_dropout=0.0, + operation_order=("self_attn", "norm", "ffn", "norm"), + ), + ), + decoder=dict( + type="DINOTransformerDecoder", + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type="DetrTransformerDecoderLayer", + attn_cfgs=[ + dict(type="MultiheadAttention", embed_dims=256, num_heads=8, dropout=0.0), + dict(type="MultiScaleDeformableAttention", embed_dims=256, dropout=0.0), + ], + feedforward_channels=2048, + ffn_dropout=0.0, + operation_order=("self_attn", "norm", "cross_attn", "norm", "ffn", "norm"), + ), + ), + ), + positional_encoding=dict( + type="SinePositionalEncoding", num_feats=128, normalize=True, offset=0.0, temperature=20 + ), + loss_cls=dict(type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), + loss_bbox=dict(type="L1Loss", loss_weight=5.0), + loss_iou=dict(type="GIoULoss", loss_weight=2.0), + dn_cfg=dict( + label_noise_scale=0.5, + box_noise_scale=1.0, # 0.4 for DN-DETR + group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=100), + ), + ), + ) + self.bbox_head = build_detector(cfg) + + assigner_cfg = ConfigDict( + type="HungarianAssigner", + cls_cost=dict(type="FocalLossCost", weight=1.0), + reg_cost=dict(type="BBoxL1Cost", weight=5.0, box_format="xywh"), + iou_cost=dict(type="IoUCost", iou_mode="giou", weight=2.0), + ) + self.bbox_head.assigner = build_assigner(assigner_cfg) + + test_cfg = dict(max_per_img=300) + self.bbox_head.test_cfg = test_cfg + + @e2e_pytest_unit + def test_forward_train(self): + inputs = [ + torch.randn([2, 256, 92, 95]), + torch.randn([2, 256, 46, 48]), + torch.randn([2, 256, 23, 24]), + torch.randn([2, 256, 12, 12]), + ] + gt_bboxes = [ + torch.Tensor( + [ + [432.2500, 514.2661, 632.6323, 638.8889], + [361.2484, 294.9931, 558.4751, 466.9410], + [616.8542, 201.9204, 752.5462, 328.1207], + 
[591.6091, 386.4883, 733.6124, 571.0562], + [728.8790, 255.5556, 760.0000, 408.5734], + [713.1008, 397.5309, 760.0000, 541.0837], + [246.0680, 354.9383, 427.5165, 498.4911], + [113.5316, 361.2483, 309.1805, 517.4211], + [457.4950, 654.6639, 646.8326, 736.0000], + [132.4654, 631.0014, 187.6889, 684.6365], + [217.6673, 694.1015, 298.1358, 736.0000], + [0.0000, 583.6763, 56.7303, 672.0164], + [86.7088, 675.1714, 168.7551, 736.0000], + [173.4885, 93.0727, 253.9570, 151.4403], + [738.3458, 119.8903, 760.0000, 164.0603], + [683.1224, 522.1536, 760.0000, 736.0000], + ] + ), + torch.Tensor( + [ + [442.0, 279.0, 544.0, 377.0], + [386.0, 1.0, 497.0, 108.0], + [288.0, 1.0, 399.0, 84.0], + [154.0, 1.0, 268.0, 77.0], + [530.0, 163.0, 625.0, 248.0], + [179.0, 298.0, 278.0, 398.0], + [275.0, 320.0, 374.0, 420.0], + [525.0, 394.0, 613.0, 480.0], + [332.0, 160.0, 463.0, 286.0], + [210.0, 395.0, 308.0, 480.0], + [141.0, 395.0, 239.0, 480.0], + [106.0, 225.0, 204.0, 310.0], + [12.0, 1.0, 148.0, 70.0], + [165.0, 79.0, 396.0, 247.0], + [483.0, 13.0, 518.0, 52.0], + ], + ), + ] + gt_labels = [ + torch.Tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 2]).long(), + torch.Tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0]).long(), + ] + img_metas = [ + { + "flip_direction": "horizontal", + "img_shape": (736, 760, 3), + "ori_shape": (480, 640, 3), + "img_norm_cfg": { + "mean": np.array([123.675, 116.28, 103.53], dtype=np.float32), + "std": np.array([58.395, 57.12, 57.375], dtype=np.float32), + "to_rgb": False, + }, + "scale_factor": np.array([1.5139443, 1.5144033, 1.5139443, 1.5144033], dtype=np.float32), + "flip": True, + "pad_shape": (736, 760, 3), + "batch_input_shape": (736, 760), + }, + { + "flip_direction": "horizontal", + "img_shape": (480, 640, 3), + "ori_shape": (480, 640, 3), + "img_norm_cfg": { + "mean": np.array([123.675, 116.28, 103.53], dtype=np.float32), + "std": np.array([58.395, 57.12, 57.375], dtype=np.float32), + "to_rgb": False, + }, + "scale_factor": np.array([1.0, 1.0, 1.0, 1.0], dtype=np.float32), + "flip": True, + "pad_shape": (480, 640, 3), + "batch_input_shape": (736, 760), + }, + ] + losses = self.bbox_head.forward_train(inputs, img_metas, gt_bboxes, gt_labels) + assert len(losses) == 39 + + @e2e_pytest_unit + def test_simple_test_bboxes(self): + feats = [ + torch.randn([2, 256, 100, 134]), + torch.randn([2, 256, 50, 67]), + torch.randn([2, 256, 25, 34]), + torch.randn([2, 256, 13, 17]), + ] + img_metas = [ + { + "ori_shape": (480, 640, 3), + "img_shape": (800, 1067, 3), + "pad_shape": (800, 1067, 3), + "scale_factor": np.array([1.6671875, 1.6666666, 1.6671875, 1.6666666], dtype=np.float32), + "flip": False, + "flip_direction": None, + "img_norm_cfg": { + "mean": np.array([123.675, 116.28, 103.53], dtype=np.float32), + "std": np.array([58.395, 57.12, 57.375], dtype=np.float32), + "to_rgb": False, + }, + "batch_input_shape": (800, 1067), + }, + { + "ori_shape": (480, 640, 3), + "img_shape": (800, 1067, 3), + "pad_shape": (800, 1067, 3), + "scale_factor": np.array([1.6671875, 1.6666666, 1.6671875, 1.6666666], dtype=np.float32), + "flip": False, + "flip_direction": None, + "img_norm_cfg": { + "mean": np.array([123.675, 116.28, 103.53], dtype=np.float32), + "std": np.array([58.395, 57.12, 57.375], dtype=np.float32), + "to_rgb": False, + }, + "batch_input_shape": (800, 1067), + }, + ] + self.bbox_head.eval() + results = self.bbox_head.simple_test_bboxes(feats, img_metas) + assert len(results) == 2 + assert results[0][0].shape == torch.Size([300, 5]) + assert results[0][1].shape == 
torch.Size([300]) From e04378314259561878b4f51af7253ff4829ee944 Mon Sep 17 00:00:00 2001 From: jaegukhyun Date: Fri, 23 Jun 2023 16:30:49 +0900 Subject: [PATCH 05/11] Add intg test --- tests/integration/cli/detection/test_detection.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tests/integration/cli/detection/test_detection.py b/tests/integration/cli/detection/test_detection.py index 18a2d65f230..d36948ba13a 100644 --- a/tests/integration/cli/detection/test_detection.py +++ b/tests/integration/cli/detection/test_detection.py @@ -68,13 +68,16 @@ templates = Registry("otx/algorithms/detection").filter(task_type="DETECTION").templates templates_ids = [template.model_template_id for template in templates] -experimental_template = parse_model_template( - "otx/algorithms/detection/configs/detection/resnet50_deformable-detr/template_experimental.yaml" -) -experimental_template_id = experimental_template.model_template_id +experimental_templates = [ + parse_model_template( + "otx/algorithms/detection/configs/detection/resnet50_deformable-detr/template_experimental.yaml" + ), + parse_model_template("otx/algorithms/detection/configs/detection/resnet50_dino/template_experimental.yaml"), +] +experimental_template_ids = [template.model_template_id for template in experimental_templates] -templates_w_experimental = templates + [experimental_template] -templates_ids_w_experimental = templates_ids + [experimental_template_id] +templates_w_experimental = templates + experimental_templates +templates_ids_w_experimental = templates_ids + experimental_template_ids class TestDetectionCLI: From 406a485ba2951c41fd749e08cdfb4f1e4f725df9 Mon Sep 17 00:00:00 2001 From: jaegukhyun Date: Fri, 23 Jun 2023 16:37:48 +0900 Subject: [PATCH 06/11] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d730cb9c7c7..0d71e175803 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ All notable changes to this project will be documented in this file. 
- Add custom max iou assigner to prevent CPU OOM when large annotations are used ()
 - Auto train type detection for Semi-SL, Self-SL and Incremental: "--train-type" is now optional (https://github.com/openvinotoolkit/training_extensions/pull/2195)
 - Add new object detector Deformable DETR ()
+- Add new object detector DINO ()
 
 ### Enhancements
 
From 36ccfb1ee9d4b9d016ca81421293186391074538 Mon Sep 17 00:00:00 2001
From: jaegukhyun
Date: Fri, 23 Jun 2023 19:12:46 +0900
Subject: [PATCH 07/11] Change description of config files for DINO

---
 .../detection/configs/detection/resnet50_dino/data_pipeline.py | 2 +-
 .../detection/configs/detection/resnet50_dino/deployment.py    | 2 +-
 .../detection/configs/detection/resnet50_dino/model.py         | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/otx/algorithms/detection/configs/detection/resnet50_dino/data_pipeline.py b/otx/algorithms/detection/configs/detection/resnet50_dino/data_pipeline.py
index 4b577e21eb8..9610b3fd514 100644
--- a/otx/algorithms/detection/configs/detection/resnet50_dino/data_pipeline.py
+++ b/otx/algorithms/detection/configs/detection/resnet50_dino/data_pipeline.py
@@ -1,4 +1,4 @@
-"""Data pipeline for Deformable DETR."""
+"""Data pipeline for DINO."""
 # dataset settings
 dataset_type = "CocoDataset"
 data_root = "data/coco/"
diff --git a/otx/algorithms/detection/configs/detection/resnet50_dino/deployment.py b/otx/algorithms/detection/configs/detection/resnet50_dino/deployment.py
index 76b4a6544f5..6e7d1fba3ed 100644
--- a/otx/algorithms/detection/configs/detection/resnet50_dino/deployment.py
+++ b/otx/algorithms/detection/configs/detection/resnet50_dino/deployment.py
@@ -1,4 +1,4 @@
-"""MMDeploy config of Deformable DETR model for Detection Task."""
+"""MMDeploy config of DINO model for Detection Task."""
 
 _base_ = ["../../base/deployments/base_detection_dynamic.py"]
 
diff --git a/otx/algorithms/detection/configs/detection/resnet50_dino/model.py b/otx/algorithms/detection/configs/detection/resnet50_dino/model.py
index a9cdf215901..a929fb5dc3d 100644
--- a/otx/algorithms/detection/configs/detection/resnet50_dino/model.py
+++ b/otx/algorithms/detection/configs/detection/resnet50_dino/model.py
@@ -1,4 +1,4 @@
-"""Model config for Deformable DETR."""
+"""Model config for DINO."""
 model = dict(
     type="CustomDINO",
     backbone=dict(
From 909727ef51759eb1b3b5c608ea853a033aedccf0 Mon Sep 17 00:00:00 2001
From: jaegukhyun
Date: Mon, 26 Jun 2023 12:34:20 +0900
Subject: [PATCH 08/11] Modify unit tests

---
 .../mmdet/models/heads/test_custom_dino_head.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/unit/algorithms/detection/adapters/mmdet/models/heads/test_custom_dino_head.py b/tests/unit/algorithms/detection/adapters/mmdet/models/heads/test_custom_dino_head.py
index fec8abac448..4b54c6ea05d 100644
--- a/tests/unit/algorithms/detection/adapters/mmdet/models/heads/test_custom_dino_head.py
+++ b/tests/unit/algorithms/detection/adapters/mmdet/models/heads/test_custom_dino_head.py
@@ -85,10 +85,10 @@ def setup(self):
     @e2e_pytest_unit
     def test_forward_train(self):
         inputs = [
-            torch.randn([2, 256, 92, 95]),
-            torch.randn([2, 256, 46, 48]),
-            torch.randn([2, 256, 23, 24]),
-            torch.randn([2, 256, 12, 12]),
+            torch.zeros([2, 256, 92, 95]),
+            torch.zeros([2, 256, 46, 48]),
+            torch.zeros([2, 256, 23, 24]),
+            torch.zeros([2, 256, 12, 12]),
         ]
         gt_bboxes = [
             torch.Tensor(
@@ -171,10 +171,10 @@ def test_forward_train(self):
     @e2e_pytest_unit
     def test_simple_test_bboxes(self):
         feats = [
-            torch.randn([2, 256, 100, 134]),
-            torch.randn([2, 256, 50, 67]),
-            torch.randn([2, 256, 25, 34]),
-            torch.randn([2, 256, 13, 17]),
+            torch.zeros([2, 256, 100, 134]),
+            torch.zeros([2, 256, 50, 67]),
+            torch.zeros([2, 256, 25, 34]),
+            torch.zeros([2, 256, 13, 17]),
         ]
         img_metas = [
             {
From 42d7e15f0afbd107247261ccf8615dbc3e119c6f Mon Sep 17 00:00:00 2001
From: jaegukhyun
Date: Tue, 27 Jun 2023 14:43:36 +0900
Subject: [PATCH 09/11] Reflect reviews

---
 .../mmdet/models/detectors/custom_dino_detector.py   | 11 +++--------
 .../detection/adapters/mmdet/models/layers/dino.py   |  6 +++---
 .../configs/detection/resnet50_dino/data_pipeline.py |  1 -
 3 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_dino_detector.py b/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_dino_detector.py
index e84bb25eeec..eeed11d99d1 100644
--- a/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_dino_detector.py
+++ b/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_dino_detector.py
@@ -4,8 +4,6 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 
-import functools
-
 from mmdet.models.builder import DETECTORS
 
 from otx.algorithms.common.adapters.mmcv.hooks.recording_forward_hook import (
@@ -26,19 +24,16 @@ class CustomDINO(CustomDeformableDETR):
     def __init__(self, *args, task_adapt=None, **kwargs):
         super().__init__(*args, task_adapt=task_adapt, **kwargs)
         self._register_load_state_dict_pre_hook(
-            functools.partial(
-                self.load_state_dict_pre_hook,
-                self,
-            )
+            self.load_state_dict_pre_hook,
         )
 
     @staticmethod
-    def load_state_dict_pre_hook(model, ckpt_dict, *args, **kwargs):
+    def load_state_dict_pre_hook(ckpt_dict, *args, **kwargs):
         """Modify mmdet3.x version's weights before weight loading."""
 
         if list(ckpt_dict.keys())[0] == "level_embed":
             logger.info("----------------- CustomDINO.load_state_dict_pre_hook() called")
-            # This ckpt_dict is come from mmdet3.x
+            # This ckpt_dict comes from mmdet3.x
             ckpt_dict["bbox_head.transformer.level_embeds"] = ckpt_dict.pop("level_embed")
             replaced_params = {}
             for param in ckpt_dict:
diff --git a/otx/algorithms/detection/adapters/mmdet/models/layers/dino.py b/otx/algorithms/detection/adapters/mmdet/models/layers/dino.py
index cab3f23183d..573417cfabf 100644
--- a/otx/algorithms/detection/adapters/mmdet/models/layers/dino.py
+++ b/otx/algorithms/detection/adapters/mmdet/models/layers/dino.py
@@ -17,7 +17,7 @@ class CustomDINOTransformer(DeformableDetrTransformer):
 
     Original implementation: mmdet.models.utils.transformer.DeformableDETR in mmdet2.x
     What's changed: The forward function is modified.
-    Modified implementations are come from mmdet.models.detectors.dino.DINO in mmdet3.x
+    Modified implementations come from mmdet.models.detectors.dino.DINO in mmdet3.x
     """
 
     def init_layers(self):
@@ -48,13 +48,13 @@ def forward(
         pre_transformer() -> forward_encoder() -> pre_decoder() -> forward_decoder().
         In comparison, mmdet2.x forward function takes charge of all functions above.
         The differences between Deformable DETR and DINO occur in pre_decoder() and forward_decoder().
-        Therefore this function modified those parts. Modified implementations are come from
+        Therefore this function modifies those parts. Modified implementations come from
         pre_decoder(), and forward_decoder() of mmdet.models.detectors.dino.DINO in mmdet3.x.
 
         Args:
             batch_info(list(dict(str, union(tuple, tensor)))):
-                Information about batch such as image shaep,
+                Information about batch such as image shape,
                 gt information. 
mlvl_feats (list(Tensor)): Input queries from
                 different levels. Each element has shape
diff --git a/otx/algorithms/detection/configs/detection/resnet50_dino/data_pipeline.py b/otx/algorithms/detection/configs/detection/resnet50_dino/data_pipeline.py
index 9610b3fd514..9f7b3f1d404 100644
--- a/otx/algorithms/detection/configs/detection/resnet50_dino/data_pipeline.py
+++ b/otx/algorithms/detection/configs/detection/resnet50_dino/data_pipeline.py
@@ -112,4 +112,3 @@
         pipeline=test_pipeline,
     ),
 )
-evaluation = dict(interval=1, metric="bbox")
From 53359e939e48e3d4eac248cbefcd51abf2b5ef37 Mon Sep 17 00:00:00 2001
From: jaegukhyun
Date: Tue, 27 Jun 2023 14:48:39 +0900
Subject: [PATCH 10/11] Reflect reviews

---
 .../detection/resnet50_dino/data_pipeline.py | 41 +++++++------------
 1 file changed, 15 insertions(+), 26 deletions(-)

diff --git a/otx/algorithms/detection/configs/detection/resnet50_dino/data_pipeline.py b/otx/algorithms/detection/configs/detection/resnet50_dino/data_pipeline.py
index 9f7b3f1d404..19365de72a7 100644
--- a/otx/algorithms/detection/configs/detection/resnet50_dino/data_pipeline.py
+++ b/otx/algorithms/detection/configs/detection/resnet50_dino/data_pipeline.py
@@ -3,6 +3,19 @@
 dataset_type = "CocoDataset"
 data_root = "data/coco/"
 img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+__img_scale = [
+    (480, 1333),
+    (512, 1333),
+    (544, 1333),
+    (576, 1333),
+    (608, 1333),
+    (640, 1333),
+    (672, 1333),
+    (704, 1333),
+    (736, 1333),
+    (768, 1333),
+    (800, 1333),
+]
 
 # train_pipeline, NOTE the img_scale and the Pad's size_divisor are different
 # from the default setting in mmdet.
@@ -16,19 +29,7 @@
         [
             dict(
                 type="Resize",
-                img_scale=[
-                    (480, 1333),
-                    (512, 1333),
-                    (544, 1333),
-                    (576, 1333),
-                    (608, 1333),
-                    (640, 1333),
-                    (672, 1333),
-                    (704, 1333),
-                    (736, 1333),
-                    (768, 1333),
-                    (800, 1333),
-                ],
+                img_scale=__img_scale,
                 multiscale_mode="value",
                 keep_ratio=True,
             )
@@ -45,19 +46,7 @@
             dict(type="RandomCrop", crop_type="absolute_range", crop_size=(384, 600), allow_negative_crop=True),
             dict(
                 type="Resize",
-                img_scale=[
-                    (480, 1333),
-                    (512, 1333),
-                    (544, 1333),
-                    (576, 1333),
-                    (608, 1333),
-                    (640, 1333),
-                    (672, 1333),
-                    (704, 1333),
-                    (736, 1333),
-                    (768, 1333),
-                    (800, 1333),
-                ],
+                img_scale=__img_scale,
                 multiscale_mode="value",
                 override=True,
                 keep_ratio=True,
From d893e1a25a825815b2683a6a4cb89e0fc8da88db Mon Sep 17 00:00:00 2001
From: jaegukhyun
Date: Tue, 27 Jun 2023 15:34:52 +0900
Subject: [PATCH 11/11] Update unit tests

---
 .../mmdet/models/detectors/test_custom_dino_detector.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/algorithms/detection/adapters/mmdet/models/detectors/test_custom_dino_detector.py b/tests/unit/algorithms/detection/adapters/mmdet/models/detectors/test_custom_dino_detector.py
index 22b89435f09..2bd02b25505 100644
--- a/tests/unit/algorithms/detection/adapters/mmdet/models/detectors/test_custom_dino_detector.py
+++ b/tests/unit/algorithms/detection/adapters/mmdet/models/detectors/test_custom_dino_detector.py
@@ -34,7 +34,7 @@ def test_custom_dino_load_state_pre_hook(self, fxt_cfg_custom_dino: Dict):
             "memory_trans_fc": "memory_trans_fc",
             "memory_trans_norm": "memory_trans_norm",
         }
-        model.load_state_dict_pre_hook(model, ckpt_dict)
+        model.load_state_dict_pre_hook(ckpt_dict)
 
         assert ckpt_dict["bbox_head.transformer.level_embeds"] == "level_embed"
         assert ckpt_dict["bbox_head.transformer.encoder.attentions.0"] == "encoder.self_attn"
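
Usage note for the series: the key translation above means an mmdet 3.x DINO checkpoint can be restored directly into CustomDINO, with the pre-hook rewriting parameter names inside load_state_dict. Below is a minimal, illustrative sketch, not part of the patches; the checkpoint file name is hypothetical, and strict=False is used because 3.x auxiliary keys may have no counterpart after renaming.

import torch
from mmcv.utils import Config
from mmdet.models.builder import build_detector

# Build CustomDINO from the model config added in this patch series.
cfg = Config.fromfile("otx/algorithms/detection/configs/detection/resnet50_dino/model.py")
model = build_detector(cfg.model)

# Hypothetical mmdet 3.x checkpoint. The pre-hook keys on its first
# state-dict entry being "level_embed" before rewriting names such as
# "encoder...self_attn..." into "bbox_head.transformer.encoder...attentions.0...".
ckpt = torch.load("dino_r50_mmdet3x.pth", map_location="cpu")
state_dict = ckpt.get("state_dict", ckpt)

# The pre-hook registered in CustomDINO.__init__ runs inside load_state_dict,
# so no manual key translation is needed here.
model.load_state_dict(state_dict, strict=False)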