From d7739dbe46caafb10d2519bd9c0f6a544bfb5d07 Mon Sep 17 00:00:00 2001 From: jaegukhyun Date: Thu, 22 Jun 2023 17:52:47 +0900 Subject: [PATCH 01/11] Add DINO --- .../adapters/mmdet/models/__init__.py | 4 +- .../mmdet/models/detectors/__init__.py | 2 + .../models/detectors/custom_dino_detector.py | 94 +++ .../adapters/mmdet/models/heads/__init__.py | 4 + .../mmdet/models/heads/custom_dino_head.py | 629 ++++++++++++++++++ .../adapters/mmdet/models/heads/detr_head.py | 263 ++++++++ .../adapters/mmdet/models/layers/__init__.py | 9 + .../adapters/mmdet/models/layers/dino.py | 169 +++++ .../mmdet/models/layers/dino_layers.py | 609 +++++++++++++++++ .../detection/resnet50_dino/data_pipeline.py | 115 ++++ .../detection/resnet50_dino/deployment.py | 12 + .../configs/detection/resnet50_dino/model.py | 117 ++++ .../resnet50_dino/template_experimental.yaml | 64 ++ 13 files changed, 2089 insertions(+), 2 deletions(-) create mode 100644 otx/algorithms/detection/adapters/mmdet/models/detectors/custom_dino_detector.py create mode 100644 otx/algorithms/detection/adapters/mmdet/models/heads/custom_dino_head.py create mode 100644 otx/algorithms/detection/adapters/mmdet/models/heads/detr_head.py create mode 100644 otx/algorithms/detection/adapters/mmdet/models/layers/__init__.py create mode 100644 otx/algorithms/detection/adapters/mmdet/models/layers/dino.py create mode 100644 otx/algorithms/detection/adapters/mmdet/models/layers/dino_layers.py create mode 100644 otx/algorithms/detection/configs/detection/resnet50_dino/data_pipeline.py create mode 100644 otx/algorithms/detection/configs/detection/resnet50_dino/deployment.py create mode 100644 otx/algorithms/detection/configs/detection/resnet50_dino/model.py create mode 100644 otx/algorithms/detection/configs/detection/resnet50_dino/template_experimental.yaml diff --git a/otx/algorithms/detection/adapters/mmdet/models/__init__.py b/otx/algorithms/detection/adapters/mmdet/models/__init__.py index c73e3d4247e..c59b3e97b84 100644 --- a/otx/algorithms/detection/adapters/mmdet/models/__init__.py +++ b/otx/algorithms/detection/adapters/mmdet/models/__init__.py @@ -3,6 +3,6 @@ # SPDX-License-Identifier: Apache-2.0 # -from . import assigners, backbones, dense_heads, detectors, heads, losses, necks, roi_heads +from . 
import assigners, backbones, dense_heads, detectors, heads, layers, losses, necks, roi_heads
-__all__ = ["assigners", "backbones", "dense_heads", "detectors", "heads", "losses", "necks", "roi_heads"]
+__all__ = ["assigners", "backbones", "dense_heads", "detectors", "heads", "layers", "losses", "necks", "roi_heads"]
diff --git a/otx/algorithms/detection/adapters/mmdet/models/detectors/__init__.py b/otx/algorithms/detection/adapters/mmdet/models/detectors/__init__.py
index 962407bb091..0dc0e8e4079 100644
--- a/otx/algorithms/detection/adapters/mmdet/models/detectors/__init__.py
+++ b/otx/algorithms/detection/adapters/mmdet/models/detectors/__init__.py
@@ -5,6 +5,7 @@
 from .custom_atss_detector import CustomATSS
 from .custom_deformable_detr_detector import CustomDeformableDETR
+from .custom_dino_detector import CustomDINO
 from .custom_maskrcnn_detector import CustomMaskRCNN
 from .custom_maskrcnn_tile_optimized import CustomMaskRCNNTileOptimized
 from .custom_single_stage_detector import CustomSingleStageDetector
@@ -18,6 +19,7 @@ __all__ = [
     "CustomATSS",
     "CustomDeformableDETR",
+    "CustomDINO",
     "CustomMaskRCNN",
     "CustomSingleStageDetector",
     "CustomTwoStageDetector",
diff --git a/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_dino_detector.py b/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_dino_detector.py
new file mode 100644
index 00000000000..3bfd97bfa05
--- /dev/null
+++ b/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_dino_detector.py
@@ -0,0 +1,94 @@
+"""OTX DINO Class for mmdetection detectors."""
+
+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+import functools
+
+from mmdet.models.builder import DETECTORS
+
+from otx.algorithms.common.adapters.mmcv.hooks.recording_forward_hook import (
+    ActivationMapHook,
+    FeatureVectorHook,
+)
+from otx.algorithms.common.adapters.mmdeploy.utils import is_mmdeploy_enabled
+from otx.algorithms.common.utils.logger import get_logger
+from otx.algorithms.detection.adapters.mmdet.models.detectors import CustomDeformableDETR
+
+logger = get_logger()
+
+
+@DETECTORS.register_module()
+class CustomDINO(CustomDeformableDETR):
+    """Custom DINO detector."""
+
+    def __init__(self, *args, task_adapt=None, **kwargs):
+        super().__init__(*args, task_adapt=task_adapt, **kwargs)
+        self._register_load_state_dict_pre_hook(
+            functools.partial(
+                self.load_state_dict_pre_hook,
+                self,
+            )
+        )
+
+    @staticmethod
+    def load_state_dict_pre_hook(model, ckpt_dict, *args, **kwargs):
+        """Modify mmdet 3.x version's weights before weight loading."""
+
+        if list(ckpt_dict.keys())[0] == "level_embed":
+            logger.info("----------------- CustomDINO.load_state_dict_pre_hook() called")
+            # This ckpt_dict comes from mmdet 3.x
+            ckpt_dict["bbox_head.transformer.level_embeds"] = ckpt_dict.pop("level_embed")
+            replaced_params = {}
+            for param in ckpt_dict:
+                new_param = None
+                if "encoder" in param or "decoder" in param:
+                    new_param = "bbox_head.transformer." + param
+                    new_param = new_param.replace("self_attn", "attentions.0")
+                    new_param = new_param.replace("cross_attn", "attentions.1")
+                    new_param = new_param.replace("ffn", "ffns.0")
+                elif param == "query_embedding.weight":
+                    new_param = "bbox_head." + param
+                elif param == "dn_query_generator.label_embedding.weight":
+                    new_param = "bbox_head.transformer." + param
+                elif "memory_trans" in param:
+                    new_param = "bbox_head.transformer." + param
+                    new_param = new_param.replace("memory_trans_fc", "enc_output")
+                    new_param = new_param.replace("memory_trans_norm", "enc_output_norm")
+                if new_param is not None:
+                    replaced_params[param] = new_param
+
+            for origin, new in replaced_params.items():
+                ckpt_dict[new] = ckpt_dict.pop(origin)
+
+
+if is_mmdeploy_enabled():
+    from mmdeploy.core import FUNCTION_REWRITER
+
+    @FUNCTION_REWRITER.register_rewriter(
+        "otx.algorithms.detection.adapters.mmdet.models.detectors.custom_dino_detector.CustomDINO.simple_test"
+    )
+    def custom_dino__simple_test(ctx, self, img, img_metas, **kwargs):
+        """Function for custom_dino__simple_test."""
+        height = int(img_metas[0]["img_shape"][0])
+        width = int(img_metas[0]["img_shape"][1])
+        img_metas[0]["batch_input_shape"] = (height, width)
+        img_metas[0]["img_shape"] = (height, width, 3)
+        feats = self.extract_feat(img)
+        gt_bboxes = [None] * len(feats)
+        gt_labels = [None] * len(feats)
+        hidden_states, references, enc_output_class, enc_output_coord, _ = self.bbox_head.forward_transformer(
+            feats, gt_bboxes, gt_labels, img_metas
+        )
+        cls_scores, bbox_preds = self.bbox_head(hidden_states, references)
+        bbox_results = self.bbox_head.get_bboxes(
+            cls_scores, bbox_preds, enc_output_class, enc_output_coord, img_metas=img_metas, **kwargs
+        )
+
+        if ctx.cfg["dump_features"]:
+            feature_vector = FeatureVectorHook.func(feats)
+            saliency_map = ActivationMapHook.func(cls_scores)
+            return (*bbox_results, feature_vector, saliency_map)
+
+        return bbox_results
diff --git a/otx/algorithms/detection/adapters/mmdet/models/heads/__init__.py b/otx/algorithms/detection/adapters/mmdet/models/heads/__init__.py
index 521dd14e83e..28da39d0a1b 100644
--- a/otx/algorithms/detection/adapters/mmdet/models/heads/__init__.py
+++ b/otx/algorithms/detection/adapters/mmdet/models/heads/__init__.py
@@ -6,23 +6,27 @@
 from .cross_dataset_detector_head import CrossDatasetDetectorHead
 from .custom_anchor_generator import SSDAnchorGeneratorClustered
 from .custom_atss_head import CustomATSSHead, CustomATSSHeadTrackingLossDynamics
+from .custom_dino_head import CustomDINOHead
 from .custom_fcn_mask_head import CustomFCNMaskHead
 from .custom_retina_head import CustomRetinaHead
 from .custom_roi_head import CustomRoIHead
 from .custom_ssd_head import CustomSSDHead
 from .custom_vfnet_head import CustomVFNetHead
 from .custom_yolox_head import CustomYOLOXHead
+from .detr_head import DETRHeadExtension
 
 __all__ = [
     "CrossDatasetDetectorHead",
     "SSDAnchorGeneratorClustered",
     "CustomATSSHead",
+    "CustomDINOHead",
     "CustomFCNMaskHead",
     "CustomRetinaHead",
     "CustomSSDHead",
     "CustomRoIHead",
     "CustomVFNetHead",
     "CustomYOLOXHead",
+    "DETRHeadExtension",
     # Loss dynamics tracking
     "CustomATSSHeadTrackingLossDynamics",
 ]
diff --git a/otx/algorithms/detection/adapters/mmdet/models/heads/custom_dino_head.py b/otx/algorithms/detection/adapters/mmdet/models/heads/custom_dino_head.py
new file mode 100644
index 00000000000..e17ec30bf55
--- /dev/null
+++ b/otx/algorithms/detection/adapters/mmdet/models/heads/custom_dino_head.py
@@ -0,0 +1,629 @@
+"""Custom DINO head for OTX template."""
+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+from typing import Dict, List, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from mmcv.utils import Config
+from mmdet.core import bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh, multi_apply, reduce_mean
+from mmdet.models.builder import HEADS
+from mmdet.models.dense_heads import DeformableDETRHead
+from mmdet.models.utils.transformer import inverse_sigmoid
+from torch import Tensor
+
+from otx.algorithms.detection.adapters.mmdet.models.heads.detr_head import DETRHeadExtension
+from otx.algorithms.detection.adapters.mmdet.models.layers import CdnQueryGenerator
+
+
+@HEADS.register_module()
+class CustomDINOHead(DeformableDETRHead, DETRHeadExtension):
+    """Head of DINO."""
+
+    def __init__(self, *args, dn_cfg: Optional[Config] = None, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        if dn_cfg is not None:
+            assert "num_classes" not in dn_cfg and "embed_dims" not in dn_cfg and "num_matching_queries" not in dn_cfg, (
+                "The three keyword args `num_classes`, `embed_dims`, and "
+                "`num_matching_queries` are set in `detector.__init__()`, "
+                "users should not set them in `dn_cfg` config."
+            )
+            dn_cfg["num_classes"] = self.num_classes
+            dn_cfg["embed_dims"] = self.embed_dims
+            dn_cfg["num_matching_queries"] = self.num_query
+        self.transformer.dn_query_generator = CdnQueryGenerator(**dn_cfg)
+        self.transformer.two_stage_num_proposals = self.num_query
+
+    def _init_layers(self):
+        """Initialize classification branch and regression branch of head."""
+        super()._init_layers()
+        self.query_embedding = torch.nn.Embedding(self.num_query, self.embed_dims)
+
+    def forward_train(self, x, img_metas, gt_bboxes, gt_labels=None, gt_bboxes_ignore=None, proposal_cfg=None):
+        """Forward function for training mode.
+
+        Args:
+            x (list[Tensor]): Features from backbone.
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes (Tensor): Ground truth bboxes of the image,
+                shape (num_gts, 4).
+            gt_labels (Tensor): Ground truth labels of each box,
+                shape (num_gts,).
+            gt_bboxes_ignore (Tensor): Ground truth bboxes to be
+                ignored, shape (num_ignored_gts, 4).
+            proposal_cfg (mmcv.Config): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        assert proposal_cfg is None, '"proposal_cfg" must be None'
+        outs = self.forward_transformer(x, gt_bboxes, gt_labels, img_metas)
+        batch_data_samples = []
+        for img_meta, gt_bbox, gt_label in zip(img_metas, gt_bboxes, gt_labels):
+            info = Config({"metainfo": img_meta, "gt_instances": {"bboxes": gt_bbox, "labels": gt_label}})
+            batch_data_samples.append(info)
+        loss_inputs = outs + (batch_data_samples,)
+        losses = self.loss(*loss_inputs)
+        return losses
+
+    def forward_transformer(self, mlvl_feats, gt_bboxes, gt_labels, img_metas):
+        """Forward function.
+
+        Args:
+            mlvl_feats (tuple[Tensor]): Features from the upstream
+                network, each is a 4D-tensor with shape
+                (N, C, H, W).
+            gt_bboxes (List[Tensor | None]): List of ground truth bboxes.
+                When model is evaluated, it will be list of None.
+            gt_labels (List[Tensor | None]): List of ground truth labels.
+                When model is evaluated, it will be list of None.
+            img_metas (list[dict]): List of image information.
+
+        Returns:
+            all_cls_scores (Tensor): Outputs from the classification head, \
+                shape [nb_dec, bs, num_query, cls_out_channels]. Note \
+                cls_out_channels should include background.
+            all_bbox_preds (Tensor): Sigmoid outputs from the regression \
+                head with normalized coordinate format (cx, cy, w, h). \
+                Shape [nb_dec, bs, num_query, 4].
+            enc_outputs_class (Tensor): The score of each point on encode \
+                feature map, has shape (N, h*w, num_class). Only when \
+                as_two_stage is True it would be returned, otherwise \
+                `None` would be returned.
+            enc_outputs_coord (Tensor): The proposal generate from the \
+                encode feature map, has shape (N, h*w, 4). Only when \
+                as_two_stage is True it would be returned, otherwise \
+                `None` would be returned.
+        """
+
+        batch_size = mlvl_feats[0].size(0)
+        input_img_h, input_img_w = img_metas[0]["batch_input_shape"]
+        img_masks = mlvl_feats[0].new_ones((batch_size, input_img_h, input_img_w))
+        for img_id in range(batch_size):
+            img_h, img_w, _ = img_metas[img_id]["img_shape"]
+            img_masks[img_id, :img_h, :img_w] = 0
+
+        mlvl_masks = []
+        mlvl_positional_encodings = []
+        for feat in mlvl_feats:
+            mlvl_masks.append(F.interpolate(img_masks[None], size=feat.shape[-2:]).to(torch.bool).squeeze(0))
+            mlvl_positional_encodings.append(self.positional_encoding(mlvl_masks[-1]))
+
+        query_embeds = self.query_embedding.weight
+        batch_info = []
+        for img_meta, gt_bbox, gt_label in zip(img_metas, gt_bboxes, gt_labels):
+            info = {
+                "img_shape": img_meta["img_shape"][:2],
+                "bboxes": gt_bbox,
+                "labels": gt_label,
+            }
+            batch_info.append(info)
+        return self.transformer(
+            batch_info,
+            mlvl_feats,
+            mlvl_masks,
+            query_embeds,
+            mlvl_positional_encodings,
+            reg_branches=self.reg_branches,
+            cls_branches=self.cls_branches,
+        )
+
+    def loss(
+        self,
+        hidden_states: Tensor,
+        references: List[Tensor],
+        enc_outputs_class: Tensor,
+        enc_outputs_coord: Tensor,
+        dn_meta: Dict[str, int],
+        batch_data_samples,
+    ) -> dict:
+        """Perform forward propagation and loss calculation.
+
+        Args:
+            hidden_states (Tensor): Hidden states output from each decoder
+                layer, has shape (num_decoder_layers, bs, num_queries_total,
+                dim), where `num_queries_total` is the sum of
+                `num_denoising_queries` and `num_matching_queries` when
+                `self.training` is `True`, else `num_matching_queries`.
+            references (list[Tensor]): List of the reference from the decoder.
+                The first reference is the `init_reference` (initial) and the
+                other num_decoder_layers(6) references are `inter_references`
+                (intermediate). The `init_reference` has shape (bs,
+                num_queries_total, 4) and each `inter_reference` has shape
+                (bs, num_queries, 4) with the last dimension arranged as
+                (cx, cy, w, h).
+            enc_outputs_class (Tensor): The score of each point on encode
+                feature map, has shape (bs, num_feat_points, cls_out_channels).
+            enc_outputs_coord (Tensor): The proposal generate from the
+                encode feature map, has shape (bs, num_feat_points, 4) with the
+                last dimension arranged as (cx, cy, w, h).
+            batch_data_samples (list[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            dn_meta (Dict[str, int]): The dictionary saves information about
+                group collation, including 'num_denoising_queries' and
+                'num_denoising_groups'. It will be used for split outputs of
+                denoising and matching parts and loss calculation.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        batch_gt_instances = []
+        batch_img_metas = []
+        for data_sample in batch_data_samples:
+            batch_img_metas.append(data_sample.metainfo)
+            batch_gt_instances.append(data_sample.gt_instances)
+
+        outs = self(hidden_states, references)
+        loss_inputs = outs + (enc_outputs_class, enc_outputs_coord, batch_gt_instances, batch_img_metas, dn_meta)
+        losses = self.loss_by_feat_two_stage(*loss_inputs)
+        return losses
+
+    def forward(self, hidden_states, references):
+        """Forward function.
+
+        Args:
+            hidden_states (Tensor): Hidden states output from each decoder
+                layer, has shape (num_decoder_layers, bs, num_queries, dim).
+            references (list[Tensor]): List of the reference coordinates from
+                the decoder, each has shape (bs, num_queries, 4) with the last
+                dimension arranged as (cx, cy, w, h), or (bs, num_queries, 2)
+                when only center points are refined.
+
+        Returns:
+            tuple[Tensor]: Stacked classification scores and bbox predictions
+                of all decoder layers, with shapes (num_decoder_layers, bs,
+                num_queries, cls_out_channels) and (num_decoder_layers, bs,
+                num_queries, 4), respectively.
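+
+        Example (illustrative shapes only, assuming a 6-layer decoder, batch
+        size 2, 900 queries, 256 dims and 80 classes on a hypothetical
+        ``head`` instance):
+
+            >>> cls_scores, bbox_preds = head(hidden_states, references)
+            >>> cls_scores.shape, bbox_preds.shape
+            (torch.Size([6, 2, 900, 80]), torch.Size([6, 2, 900, 4]))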
+ """ + all_layers_outputs_classes = [] + all_layers_outputs_coords = [] + + for layer_id in range(hidden_states.shape[0]): + reference = inverse_sigmoid(references[layer_id]) + # NOTE The last reference will not be used. + hidden_state = hidden_states[layer_id] + outputs_class = self.cls_branches[layer_id](hidden_state) + tmp_reg_preds = self.reg_branches[layer_id](hidden_state) + if reference.shape[-1] == 4: + # When `layer` is 0 and `as_two_stage` of the detector + # is `True`, or when `layer` is greater than 0 and + # `with_box_refine` of the detector is `True`. + tmp_reg_preds += reference + else: + # When `layer` is 0 and `as_two_stage` of the detector + # is `False`, or when `layer` is greater than 0 and + # `with_box_refine` of the detector is `False`. + assert reference.shape[-1] == 2 + tmp_reg_preds[..., :2] += reference + outputs_coord = tmp_reg_preds.sigmoid() + all_layers_outputs_classes.append(outputs_class) + all_layers_outputs_coords.append(outputs_coord) + + all_layers_outputs_classes = torch.stack(all_layers_outputs_classes) + all_layers_outputs_coords = torch.stack(all_layers_outputs_coords) + + return all_layers_outputs_classes, all_layers_outputs_coords + + def loss_by_feat_two_stage( + self, + all_layers_cls_scores: Tensor, + all_layers_bbox_preds: Tensor, + enc_cls_scores: Tensor, + enc_bbox_preds: Tensor, + batch_gt_instances, + batch_img_metas: List[dict], + dn_meta: Dict[str, int], + batch_gt_instances_ignore=None, + ) -> Dict[str, Tensor]: + """Loss function. + + Args: + all_layers_cls_scores (Tensor): Classification scores of all + decoder layers, has shape (num_decoder_layers, bs, + num_queries_total, cls_out_channels), where + `num_queries_total` is the sum of `num_denoising_queries` + and `num_matching_queries`. + all_layers_bbox_preds (Tensor): Regression outputs of all decoder + layers. Each is a 4D-tensor with normalized coordinate format + (cx, cy, w, h) and has shape (num_decoder_layers, bs, + num_queries_total, 4). + enc_cls_scores (Tensor): The score of each point on encode + feature map, has shape (bs, num_feat_points, cls_out_channels). + enc_bbox_preds (Tensor): The proposal generate from the encode + feature map, has shape (bs, num_feat_points, 4) with the last + dimension arranged as (cx, cy, w, h). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
+ """ + # extract denoising and matching part of outputs + ( + all_layers_matching_cls_scores, + all_layers_matching_bbox_preds, + all_layers_denoising_cls_scores, + all_layers_denoising_bbox_preds, + ) = self.split_outputs(all_layers_cls_scores, all_layers_bbox_preds, dn_meta) + + loss_dict = super(DeformableDETRHead, self).loss_by_feat( + all_layers_matching_cls_scores, + all_layers_matching_bbox_preds, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore, + ) + + # loss of proposal generated from encode feature map. + if enc_cls_scores is not None: + # NOTE The enc_loss calculation of the DINO is + # different from that of Deformable DETR. + enc_loss_cls, enc_losses_bbox, enc_losses_iou = self.loss_by_feat_single( + enc_cls_scores, enc_bbox_preds, batch_gt_instances=batch_gt_instances, batch_img_metas=batch_img_metas + ) + loss_dict["enc_loss_cls"] = enc_loss_cls + loss_dict["enc_loss_bbox"] = enc_losses_bbox + loss_dict["enc_loss_iou"] = enc_losses_iou + + if all_layers_denoising_cls_scores is not None: + # calculate denoising loss from all decoder layers + dn_losses_cls, dn_losses_bbox, dn_losses_iou = self.loss_dn( + all_layers_denoising_cls_scores, + all_layers_denoising_bbox_preds, + batch_gt_instances=batch_gt_instances, + batch_img_metas=batch_img_metas, + dn_meta=dn_meta, + ) + # collate denoising loss + loss_dict["dn_loss_cls"] = dn_losses_cls[-1] + loss_dict["dn_loss_bbox"] = dn_losses_bbox[-1] + loss_dict["dn_loss_iou"] = dn_losses_iou[-1] + for num_dec_layer, (loss_cls_i, loss_bbox_i, loss_iou_i) in enumerate( + zip(dn_losses_cls[:-1], dn_losses_bbox[:-1], dn_losses_iou[:-1]) + ): + loss_dict[f"d{num_dec_layer}.dn_loss_cls"] = loss_cls_i + loss_dict[f"d{num_dec_layer}.dn_loss_bbox"] = loss_bbox_i + loss_dict[f"d{num_dec_layer}.dn_loss_iou"] = loss_iou_i + return loss_dict + + def loss_dn( + self, + all_layers_denoising_cls_scores: Tensor, + all_layers_denoising_bbox_preds: Tensor, + batch_gt_instances, + batch_img_metas: List[dict], + dn_meta: Dict[str, int], + ) -> Tuple[List[Tensor], ...]: + """Calculate denoising loss. + + Args: + all_layers_denoising_cls_scores (Tensor): Classification scores of + all decoder layers in denoising part, has shape ( + num_decoder_layers, bs, num_denoising_queries, + cls_out_channels). + all_layers_denoising_bbox_preds (Tensor): Regression outputs of all + decoder layers in denoising part. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and has shape + (num_decoder_layers, bs, num_denoising_queries, 4). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + + Returns: + Tuple[List[Tensor]]: The loss_dn_cls, loss_dn_bbox, and loss_dn_iou + of each decoder layers. 
+ """ + return multi_apply( + self._loss_dn_single, + all_layers_denoising_cls_scores, + all_layers_denoising_bbox_preds, + batch_gt_instances=batch_gt_instances, + batch_img_metas=batch_img_metas, + dn_meta=dn_meta, + ) + + def _loss_dn_single( + self, + dn_cls_scores: Tensor, + dn_bbox_preds: Tensor, + batch_gt_instances, + batch_img_metas: List[dict], + dn_meta: Dict[str, int], + ) -> Tuple[Tensor, ...]: + """Denoising loss for outputs from a single decoder layer. + + Args: + dn_cls_scores (Tensor): Classification scores of a single decoder + layer in denoising part, has shape (bs, num_denoising_queries, + cls_out_channels). + dn_bbox_preds (Tensor): Regression outputs of a single decoder + layer in denoising part. Each is a 4D-tensor with normalized + coordinate format (cx, cy, w, h) and has shape + (bs, num_denoising_queries, 4). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + + Returns: + Tuple[Tensor]: A tuple including `loss_cls`, `loss_box` and + `loss_iou`. + """ + cls_reg_targets = self.get_dn_targets(batch_gt_instances, batch_img_metas, dn_meta) + ( + labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + num_total_pos, + num_total_neg, + ) = cls_reg_targets + labels = torch.cat(labels_list, 0) + label_weights = torch.cat(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + + # classification loss + cls_scores = dn_cls_scores.reshape(-1, self.cls_out_channels) + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = num_total_pos * 1.0 + num_total_neg * self.bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean(cls_scores.new_tensor([cls_avg_factor])) + cls_avg_factor = max(cls_avg_factor, 1) + + if len(cls_scores) > 0: + loss_cls = self.loss_cls(cls_scores, labels, label_weights, avg_factor=cls_avg_factor) + else: + loss_cls = torch.zeros(1, dtype=cls_scores.dtype, device=cls_scores.device) + + # Compute the average number of gt boxes across all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # construct factors used for rescale bboxes + factors = [] + for img_meta, bbox_pred in zip(batch_img_metas, dn_bbox_preds): + img_h, img_w = img_meta["img_shape"][:2] + factor = bbox_pred.new_tensor([img_w, img_h, img_w, img_h]).unsqueeze(0).repeat(bbox_pred.size(0), 1) + factors.append(factor) + factors = torch.cat(factors) + + # DETR regress the relative position of boxes (cxcywh) in the image, + # thus the learning target is normalized by the image size. 
So here + # we need to re-scale them for calculating IoU loss + bbox_preds = dn_bbox_preds.reshape(-1, 4) + bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors + bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors + + # regression IoU loss, defaultly GIoU loss + loss_iou = self.loss_iou(bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos) + + # regression L1 loss + loss_bbox = self.loss_bbox(bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos) + return loss_cls, loss_bbox, loss_iou + + def get_dn_targets(self, batch_gt_instances, batch_img_metas: List[Dict], dn_meta: Dict[str, int]) -> tuple: + """Get targets in denoising part for a batch of images. + + Args: + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + + Returns: + tuple: a tuple containing the following targets. + + - labels_list (list[Tensor]): Labels for all images. + - label_weights_list (list[Tensor]): Label weights for all images. + - bbox_targets_list (list[Tensor]): BBox targets for all images. + - bbox_weights_list (list[Tensor]): BBox weights for all images. + - num_total_pos (int): Number of positive samples in all images. + - num_total_neg (int): Number of negative samples in all images. + """ + ( + labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + pos_inds_list, + neg_inds_list, + ) = multi_apply(self._get_dn_targets_single, batch_gt_instances, batch_img_metas, dn_meta=dn_meta) + num_total_pos = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg = sum((inds.numel() for inds in neg_inds_list)) + return (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, num_total_pos, num_total_neg) + + def _get_dn_targets_single(self, gt_instances, img_meta: dict, dn_meta: Dict[str, int]) -> tuple: + """Get targets in denoising part for one image. + + Args: + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for one image. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + + Returns: + tuple[Tensor]: a tuple containing the following for one image. + + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. 
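+
+        Example (illustrative indices, assuming 2 GT boxes and
+        ``num_denoising_groups=2``, i.e. 8 denoising queries per image):
+
+            >>> pos_inds
+            tensor([0, 1, 4, 5])
+            >>> neg_inds  # pos_inds shifted by half a group
+            tensor([2, 3, 6, 7])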
+ """ + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + num_groups = dn_meta["num_denoising_groups"] + num_denoising_queries = dn_meta["num_denoising_queries"] + num_queries_each_group = int(num_denoising_queries / num_groups) + device = gt_bboxes.device + + if len(gt_labels) > 0: + t = torch.arange(len(gt_labels), dtype=torch.long, device=device) + t = t.unsqueeze(0).repeat(num_groups, 1) + pos_assigned_gt_inds = t.flatten() + pos_inds = torch.arange(num_groups, dtype=torch.long, device=device) + pos_inds = pos_inds.unsqueeze(1) * num_queries_each_group + t + pos_inds = pos_inds.flatten() + else: + pos_inds = pos_assigned_gt_inds = gt_bboxes.new_tensor([], dtype=torch.long) + + neg_inds = pos_inds + num_queries_each_group // 2 + + # label targets + labels = gt_bboxes.new_full((num_denoising_queries,), self.num_classes, dtype=torch.long) + labels[pos_inds] = gt_labels[pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_denoising_queries) + + # bbox targets + bbox_targets = torch.zeros(num_denoising_queries, 4, device=device) + bbox_weights = torch.zeros(num_denoising_queries, 4, device=device) + bbox_weights[pos_inds] = 1.0 + img_h, img_w = img_meta["img_shape"][:2] + + # DETR regress the relative position of boxes (cxcywh) in the image. + # Thus the learning target should be normalized by the image size, also + # the box format should be converted from defaultly x1y1x2y2 to cxcywh. + factor = gt_bboxes.new_tensor([img_w, img_h, img_w, img_h]).unsqueeze(0) + gt_bboxes_normalized = gt_bboxes / factor + gt_bboxes_targets = bbox_xyxy_to_cxcywh(gt_bboxes_normalized) + bbox_targets[pos_inds] = gt_bboxes_targets.repeat([num_groups, 1]) + + return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, neg_inds) + + @staticmethod + def split_outputs( + all_layers_cls_scores: Tensor, all_layers_bbox_preds: Tensor, dn_meta: Dict[str, int] + ) -> Tuple[Tensor, ...]: + """Split outputs of the denoising part and the matching part. + + For the total outputs of `num_queries_total` length, the former + `num_denoising_queries` outputs are from denoising queries, and + the rest `num_matching_queries` ones are from matching queries, + where `num_queries_total` is the sum of `num_denoising_queries` and + `num_matching_queries`. + + Args: + all_layers_cls_scores (Tensor): Classification scores of all + decoder layers, has shape (num_decoder_layers, bs, + num_queries_total, cls_out_channels). + all_layers_bbox_preds (Tensor): Regression outputs of all decoder + layers. Each is a 4D-tensor with normalized coordinate format + (cx, cy, w, h) and has shape (num_decoder_layers, bs, + num_queries_total, 4). + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. + + Returns: + Tuple[Tensor]: a tuple containing the following outputs. + + - all_layers_matching_cls_scores (Tensor): Classification scores + of all decoder layers in matching part, has shape + (num_decoder_layers, bs, num_matching_queries, cls_out_channels). + - all_layers_matching_bbox_preds (Tensor): Regression outputs of + all decoder layers in matching part. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and has shape + (num_decoder_layers, bs, num_matching_queries, 4). + - all_layers_denoising_cls_scores (Tensor): Classification scores + of all decoder layers in denoising part, has shape + (num_decoder_layers, bs, num_denoising_queries, + cls_out_channels). 
+            - all_layers_denoising_bbox_preds (Tensor): Regression outputs of
+              all decoder layers in denoising part. Each is a 4D-tensor with
+              normalized coordinate format (cx, cy, w, h) and has shape
+              (num_decoder_layers, bs, num_denoising_queries, 4).
+        """
+        if dn_meta is not None:
+            num_denoising_queries = dn_meta["num_denoising_queries"]
+            all_layers_denoising_cls_scores = all_layers_cls_scores[:, :, :num_denoising_queries, :]
+            all_layers_denoising_bbox_preds = all_layers_bbox_preds[:, :, :num_denoising_queries, :]
+            all_layers_matching_cls_scores = all_layers_cls_scores[:, :, num_denoising_queries:, :]
+            all_layers_matching_bbox_preds = all_layers_bbox_preds[:, :, num_denoising_queries:, :]
+        else:
+            all_layers_denoising_cls_scores = None
+            all_layers_denoising_bbox_preds = None
+            all_layers_matching_cls_scores = all_layers_cls_scores
+            all_layers_matching_bbox_preds = all_layers_bbox_preds
+        return (
+            all_layers_matching_cls_scores,
+            all_layers_matching_bbox_preds,
+            all_layers_denoising_cls_scores,
+            all_layers_denoising_bbox_preds,
+        )
+
+    def simple_test_bboxes(self, feats, img_metas, rescale=False):
+        """Test det bboxes without test-time augmentation.
+
+        Args:
+            feats (tuple[torch.Tensor]): Multi-level features from the
+                upstream network, each is a 4D-tensor.
+            img_metas (list[dict]): List of image information.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to False.
+
+        Returns:
+            list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple.
+                The first item is ``bboxes`` with shape (n, 5),
+                where 5 represent (tl_x, tl_y, br_x, br_y, score).
+                The shape of the second tensor in the tuple is ``labels``
+                with shape (n,)
+        """
+        # forward of this head requires img_metas
+        gt_bboxes = [None] * len(feats)
+        gt_labels = [None] * len(feats)
+        hidden_states, references, enc_output_class, enc_output_coord, _ = self.forward_transformer(
+            feats, gt_bboxes, gt_labels, img_metas
+        )
+        cls_scores, bbox_preds = self(hidden_states, references)
+        results_list = self.get_bboxes(
+            cls_scores, bbox_preds, enc_output_class, enc_output_coord, img_metas, rescale=rescale
+        )
+        return results_list
diff --git a/otx/algorithms/detection/adapters/mmdet/models/heads/detr_head.py b/otx/algorithms/detection/adapters/mmdet/models/heads/detr_head.py
new file mode 100644
index 00000000000..86841e12022
--- /dev/null
+++ b/otx/algorithms/detection/adapters/mmdet/models/heads/detr_head.py
@@ -0,0 +1,263 @@
+"""DETR Head extension for OTX DINO."""
+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+from typing import Dict, List, Tuple
+
+import torch
+from mmcv.runner import BaseModule
+from mmdet.core import bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh, multi_apply, reduce_mean
+from torch import Tensor
+
+
+class DETRHeadExtension(BaseModule):
+    """Head of DETR. DETR: End-to-End Object Detection with Transformers."""
+
+    def loss_by_feat(
+        self,
+        all_layers_cls_scores: Tensor,
+        all_layers_bbox_preds: Tensor,
+        batch_gt_instances,
+        batch_img_metas: List[dict],
+        batch_gt_instances_ignore=None,
+    ) -> Dict[str, Tensor]:
+        """Loss function.
+
+        Only outputs from the last feature level are used for computing
+        losses by default.
+
+        Args:
+            all_layers_cls_scores (Tensor): Classification outputs
+                of each decoder layers. Each is a 4D-tensor, has shape
+                (num_decoder_layers, bs, num_queries, cls_out_channels).
+            all_layers_bbox_preds (Tensor): Sigmoid regression
+                outputs of each decoder layers.
Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and shape + (num_decoder_layers, bs, num_queries, 4). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert batch_gt_instances_ignore is None, ( + f"{self.__class__.__name__} only supports " "for batch_gt_instances_ignore setting to None." + ) + + losses_cls, losses_bbox, losses_iou = multi_apply( + self.loss_by_feat_single, + all_layers_cls_scores, + all_layers_bbox_preds, + batch_gt_instances=batch_gt_instances, + batch_img_metas=batch_img_metas, + ) + + loss_dict = dict() + # loss from the last decoder layer + loss_dict["loss_cls"] = losses_cls[-1] + loss_dict["loss_bbox"] = losses_bbox[-1] + loss_dict["loss_iou"] = losses_iou[-1] + # loss from other decoder layers + num_dec_layer = 0 + for loss_cls_i, loss_bbox_i, loss_iou_i in zip(losses_cls[:-1], losses_bbox[:-1], losses_iou[:-1]): + loss_dict[f"d{num_dec_layer}.loss_cls"] = loss_cls_i + loss_dict[f"d{num_dec_layer}.loss_bbox"] = loss_bbox_i + loss_dict[f"d{num_dec_layer}.loss_iou"] = loss_iou_i + num_dec_layer += 1 + return loss_dict + + def loss_by_feat_single( + self, cls_scores: Tensor, bbox_preds: Tensor, batch_gt_instances, batch_img_metas: List[dict] + ) -> Tuple[Tensor, Tensor, Tensor]: + """Loss function for outputs from a single decoder layer of a single feature level. + + Args: + cls_scores (Tensor): Box score logits from a single decoder layer + for all images, has shape (bs, num_queries, cls_out_channels). + bbox_preds (Tensor): Sigmoid outputs from a single decoder layer + for all images, with normalized coordinate (cx, cy, w, h) and + shape (bs, num_queries, 4). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + + Returns: + Tuple[Tensor]: A tuple including `loss_cls`, `loss_box` and + `loss_iou`. 
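+
+        Example (illustrative rescaling of one normalized (cx, cy, w, h)
+        prediction on an assumed 100x100 image, mirroring the IoU-loss
+        preparation in the method body):
+
+            >>> pred = torch.tensor([[0.5, 0.5, 0.2, 0.4]])
+            >>> bbox_cxcywh_to_xyxy(pred) * pred.new_tensor([100, 100, 100, 100])
+            tensor([[40., 30., 60., 70.]])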
+ """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] + cls_reg_targets = self._get_targets(cls_scores_list, bbox_preds_list, batch_gt_instances, batch_img_metas) + ( + labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + num_total_pos, + num_total_neg, + ) = cls_reg_targets + labels = torch.cat(labels_list, 0) + label_weights = torch.cat(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + + # classification loss + cls_scores = cls_scores.reshape(-1, self.cls_out_channels) + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = num_total_pos * 1.0 + num_total_neg * self.bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean(cls_scores.new_tensor([cls_avg_factor])) + cls_avg_factor = max(cls_avg_factor, 1) + + loss_cls = self.loss_cls(cls_scores, labels, label_weights, avg_factor=cls_avg_factor) + + # Compute the average number of gt boxes across all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # construct factors used for rescale bboxes + factors = [] + for img_meta, bbox_pred in zip(batch_img_metas, bbox_preds): + (img_h, img_w,) = img_meta[ + "img_shape" + ][:2] + factor = bbox_pred.new_tensor([img_w, img_h, img_w, img_h]).unsqueeze(0).repeat(bbox_pred.size(0), 1) + factors.append(factor) + factors = torch.cat(factors, 0) + + # DETR regress the relative position of boxes (cxcywh) in the image, + # thus the learning target is normalized by the image size. So here + # we need to re-scale them for calculating IoU loss + bbox_preds = bbox_preds.reshape(-1, 4) + bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors + bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors + + # regression IoU loss, defaultly GIoU loss + loss_iou = self.loss_iou(bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos) + + # regression L1 loss + loss_bbox = self.loss_bbox(bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos) + return loss_cls, loss_bbox, loss_iou + + def _get_targets( + self, + cls_scores_list: List[Tensor], + bbox_preds_list: List[Tensor], + batch_gt_instances, + batch_img_metas: List[dict], + ) -> tuple: + """Compute regression and classification targets for a batch image. + + Outputs from a single decoder layer of a single feature level are used. + + Args: + cls_scores_list (list[Tensor]): Box score logits from a single + decoder layer for each image, has shape [num_queries, + cls_out_channels]. + bbox_preds_list (list[Tensor]): Sigmoid outputs from a single + decoder layer for each image, with normalized coordinate + (cx, cy, w, h) and shape [num_queries, 4]. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + + Returns: + tuple: a tuple containing the following targets. + + - labels_list (list[Tensor]): Labels for all images. + - label_weights_list (list[Tensor]): Label weights for all images. + - bbox_targets_list (list[Tensor]): BBox targets for all images. + - bbox_weights_list (list[Tensor]): BBox weights for all images. + - num_total_pos (int): Number of positive samples in all images. 
+ - num_total_neg (int): Number of negative samples in all images. + """ + ( + labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + pos_inds_list, + neg_inds_list, + ) = multi_apply( + self.__get_targets_single, cls_scores_list, bbox_preds_list, batch_gt_instances, batch_img_metas + ) + num_total_pos = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg = sum((inds.numel() for inds in neg_inds_list)) + return (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, num_total_pos, num_total_neg) + + def __get_targets_single(self, cls_score: Tensor, bbox_pred: Tensor, gt_instances, img_meta: dict) -> tuple: + """Compute regression and classification targets for one image. + + Outputs from a single decoder layer of a single feature level are used. + + Args: + cls_score (Tensor): Box score logits from a single decoder layer + for one image. Shape [num_queries, cls_out_channels]. + bbox_pred (Tensor): Sigmoid outputs from a single decoder layer + for one image, with normalized coordinate (cx, cy, w, h) and + shape [num_queries, 4]. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for one image. + + Returns: + tuple[Tensor]: a tuple containing the following for one image. + + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. + """ + img_h, img_w = img_meta["img_shape"][:2] + factor = bbox_pred.new_tensor([img_w, img_h, img_w, img_h]).unsqueeze(0) + num_bboxes = bbox_pred.size(0) + # # convert bbox_pred from xywh, normalized to xyxy, unnormalized + # bbox_pred = bbox_cxcywh_to_xyxy(bbox_pred) + # bbox_pred = bbox_pred * factor + + # assigner and sampler + assign_result = self.assigner.assign( + bbox_pred, cls_score, gt_instances.bboxes, gt_instances.labels, img_meta=img_meta + ) + + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + pos_inds = torch.nonzero(assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique() + neg_inds = torch.nonzero(assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique() + pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 + pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds.long(), :] + + # label targets + labels = gt_bboxes.new_full((num_bboxes,), self.num_classes, dtype=torch.long) + labels[pos_inds] = gt_labels[pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_bboxes) + + # bbox targets + bbox_targets = torch.zeros_like(bbox_pred) + bbox_weights = torch.zeros_like(bbox_pred) + bbox_weights[pos_inds] = 1.0 + + # DETR regress the relative position of boxes (cxcywh) in the image. + # Thus the learning target should be normalized by the image size, also + # the box format should be converted from defaultly x1y1x2y2 to cxcywh. 
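+        # Illustrative example (assumed 100x100 image): an x1y1x2y2 box
+        # [10, 20, 30, 60] is normalized to [0.1, 0.2, 0.3, 0.6] and then
+        # converted to cxcywh [0.2, 0.4, 0.2, 0.4].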
+        pos_gt_bboxes_normalized = pos_gt_bboxes / factor
+        pos_gt_bboxes_targets = bbox_xyxy_to_cxcywh(pos_gt_bboxes_normalized)
+        bbox_targets[pos_inds] = pos_gt_bboxes_targets
+        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, neg_inds)
diff --git a/otx/algorithms/detection/adapters/mmdet/models/layers/__init__.py b/otx/algorithms/detection/adapters/mmdet/models/layers/__init__.py
new file mode 100644
index 00000000000..4ded67b4b79
--- /dev/null
+++ b/otx/algorithms/detection/adapters/mmdet/models/layers/__init__.py
@@ -0,0 +1,9 @@
+"""Initial file for mmdetection layers for models."""
+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+from .dino import CustomDINOTransformer
+from .dino_layers import CdnQueryGenerator, DINOTransformerDecoder
+
+__all__ = ["CustomDINOTransformer", "DINOTransformerDecoder", "CdnQueryGenerator"]
diff --git a/otx/algorithms/detection/adapters/mmdet/models/layers/dino.py b/otx/algorithms/detection/adapters/mmdet/models/layers/dino.py
new file mode 100644
index 00000000000..f942b5b1717
--- /dev/null
+++ b/otx/algorithms/detection/adapters/mmdet/models/layers/dino.py
@@ -0,0 +1,169 @@
+"""Custom DINO transformer for OTX template."""
+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+import torch
+from mmdet.models.utils.builder import TRANSFORMER
+from mmdet.models.utils.transformer import DeformableDetrTransformer
+
+
+@TRANSFORMER.register_module()
+class CustomDINOTransformer(DeformableDetrTransformer):
+    """Custom DINO transformer."""
+
+    def init_layers(self):
+        """Initialize layers of the DINO."""
+        self.level_embeds = torch.nn.Parameter(torch.Tensor(self.num_feature_levels, self.embed_dims))
+
+        self.enc_output = torch.nn.Linear(self.embed_dims, self.embed_dims)
+        self.enc_output_norm = torch.nn.LayerNorm(self.embed_dims)
+
+    def forward(
+        self,
+        batch_info,
+        mlvl_feats,
+        mlvl_masks,
+        query_embed,
+        mlvl_pos_embeds,
+        reg_branches=None,
+        cls_branches=None,
+        **kwargs
+    ):
+        """Forward function for `Transformer`.
+
+        Args:
+            batch_info(list(dict(str, union(tuple, tensor)))):
+                Information about batch such as image shape and
+                gt information.
+            mlvl_feats (list(Tensor)): Input queries from
+                different level. Each element has shape
+                [bs, embed_dims, h, w].
+            mlvl_masks (list(Tensor)): The key_padding_mask from
+                different level used for encoder and decoder,
+                each element has shape [bs, h, w].
+            query_embed (Tensor): The query embedding for decoder,
+                with shape [num_query, c].
+            mlvl_pos_embeds (list(Tensor)): The positional encoding
+                of feats from different level, has the shape
+                [bs, embed_dims, h, w].
+            reg_branches (obj:`nn.ModuleList`): Regression heads for
+                feature maps from each decoder layer. Only would
+                be passed when
+                `with_box_refine` is True. Default to None.
+            cls_branches (obj:`nn.ModuleList`): Classification heads
+                for feature maps from each decoder layer. Only would
+                be passed when `as_two_stage`
+                is True. Default to None.
+            kwargs: Additional argument for forward_transformer function.
+
+
+        Returns:
+            tuple[Tensor]: results of decoder containing the following tensor.
+
+            - inter_states: Outputs from decoder. If
+                return_intermediate_dec is True output has shape \
+                (num_dec_layers, bs, num_query, embed_dims), else has \
+                shape (1, bs, num_query, embed_dims).
+            - init_reference_out: The initial value of reference \
+                points, has shape (bs, num_queries, 4).
+ - inter_references_out: The internal value of reference \ + points in decoder, has shape \ + (num_dec_layers, bs,num_query, embed_dims) + - enc_outputs_class: The classification score of \ + proposals generated from \ + encoder's feature maps, has shape \ + (batch, h*w, num_classes). \ + Only would be returned when `as_two_stage` is True, \ + otherwise None. + - enc_outputs_coord_unact: The regression results \ + generated from encoder's feature maps., has shape \ + (batch, h*w, 4). Only would \ + be returned when `as_two_stage` is True, \ + otherwise None. + """ + feat_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes = [] + for lvl, (feat, mask, pos_embed) in enumerate(zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)): + bs, c, h, w = feat.shape + spatial_shape = (h, w) + spatial_shapes.append(spatial_shape) + feat = feat.flatten(2).transpose(1, 2) + mask = mask.flatten(1) + pos_embed = pos_embed.flatten(2).transpose(1, 2) + lvl_pos_embed = pos_embed + self.level_embeds[lvl].view(1, 1, -1) + lvl_pos_embed_flatten.append(lvl_pos_embed) + feat_flatten.append(feat) + mask_flatten.append(mask) + feat_flatten = torch.cat(feat_flatten, 1) + mask_flatten = torch.cat(mask_flatten, 1) + lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) + spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=feat_flatten.device) + level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + valid_ratios = torch.stack([self.get_valid_ratio(m) for m in mlvl_masks], 1) + + reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=feat.device) + + feat_flatten = feat_flatten.permute(1, 0, 2) # (H*W, bs, embed_dims) + lvl_pos_embed_flatten = lvl_pos_embed_flatten.permute(1, 0, 2) # (H*W, bs, embed_dims) + memory = self.encoder( + query=feat_flatten, + key=None, + value=None, + query_pos=lvl_pos_embed_flatten, + query_key_padding_mask=mask_flatten, + spatial_shapes=spatial_shapes, + reference_points=reference_points, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + **kwargs + ) + + # pre_decoder part at mmdet 3.x version + memory = memory.permute(1, 0, 2) + bs, _, c = memory.shape + cls_out_features = cls_branches[self.decoder.num_layers].out_features + output_memory, output_proposals = self.gen_encoder_output_proposals(memory, mask_flatten, spatial_shapes) + enc_outputs_class = cls_branches[self.decoder.num_layers](output_memory) + enc_outputs_coord_unact = reg_branches[self.decoder.num_layers](output_memory) + output_proposals + + topk_indices = torch.topk(enc_outputs_class.max(-1)[0], k=self.two_stage_num_proposals, dim=1)[1] + topk_scores = torch.gather(enc_outputs_class, 1, topk_indices.unsqueeze(-1).repeat(1, 1, cls_out_features)) + topk_coords_unact = torch.gather(enc_outputs_coord_unact, 1, topk_indices.unsqueeze(-1).repeat(1, 1, 4)) + topk_coords = topk_coords_unact.sigmoid() + topk_coords_unact = topk_coords_unact.detach() + + query = query_embed[:, None, :] + query = query.repeat(1, bs, 1).transpose(0, 1) + if self.training: + dn_label_query, dn_bbox_query, dn_mask, dn_meta = self.dn_query_generator(batch_info) + query = torch.cat([dn_label_query, query], dim=1) + reference_points = torch.cat([dn_bbox_query, topk_coords_unact], dim=1) + else: + reference_points = topk_coords_unact + dn_mask, dn_meta = None, None + reference_points = reference_points.sigmoid() + + # forward_decoder part in mmdet 3.x + inter_states, references = self.decoder( + query=query, + 
value=memory,
+            key_padding_mask=mask_flatten,
+            self_attn_mask=dn_mask,
+            reference_points=reference_points,
+            spatial_shapes=spatial_shapes,
+            level_start_index=level_start_index,
+            valid_ratios=valid_ratios,
+            reg_branches=reg_branches,
+        )
+
+        if query.size(1) == self.two_stage_num_proposals:
+            # NOTE: This is to make sure label_embeding can be involved to
+            # produce loss even if there is no denoising query (no ground truth
+            # target in this GPU), otherwise, this will raise runtime error in
+            # distributed training.
+            inter_states[0] += self.dn_query_generator.label_embedding.weight[0, 0] * 0.0
+
+        return inter_states, list(references), topk_scores, topk_coords, dn_meta
diff --git a/otx/algorithms/detection/adapters/mmdet/models/layers/dino_layers.py b/otx/algorithms/detection/adapters/mmdet/models/layers/dino_layers.py
new file mode 100644
index 00000000000..fb82a1febb1
--- /dev/null
+++ b/otx/algorithms/detection/adapters/mmdet/models/layers/dino_layers.py
@@ -0,0 +1,609 @@
+"""DINO transformer layers for mmdetection models."""
+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+import math
+import warnings
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from mmcv.cnn.bricks.registry import TRANSFORMER_LAYER_SEQUENCE
+from mmcv.runner import BaseModule
+from mmcv.utils import Config
+from mmdet.core import bbox_xyxy_to_cxcywh
+from mmdet.models.utils.transformer import DeformableDetrTransformerDecoder, inverse_sigmoid
+from torch import Tensor, nn
+
+
+@TRANSFORMER_LAYER_SEQUENCE.register_module()
+class DINOTransformerDecoder(DeformableDetrTransformerDecoder):
+    """Transformer decoder of DINO."""
+
+    def __init__(self, *args, return_intermediate=False, **kwargs):
+        super().__init__(*args, return_intermediate=return_intermediate, **kwargs)
+        self.ref_point_head = MLP(self.embed_dims * 2, self.embed_dims, self.embed_dims, 2)
+        self.norm = nn.LayerNorm(self.embed_dims)
+
+    def forward(
+        self,
+        query: Tensor,
+        value: Tensor,
+        key_padding_mask: Tensor,
+        self_attn_mask: Tensor,
+        reference_points: Tensor,
+        spatial_shapes: Tensor,
+        level_start_index: Tensor,
+        valid_ratios: Tensor,
+        reg_branches: nn.ModuleList,
+        **kwargs,
+    ) -> Tensor:
+        """Forward function of Transformer decoder.
+
+        Args:
+            query (Tensor): The input query, has shape (num_queries, bs, dim).
+            value (Tensor): The input values, has shape (num_value, bs, dim).
+            key_padding_mask (Tensor): The `key_padding_mask` of `self_attn`
+                input. ByteTensor, has shape (num_queries, bs).
+            self_attn_mask (Tensor): The attention mask to prevent information
+                leakage from different denoising groups and matching parts, has
+                shape (num_queries_total, num_queries_total). It is `None` when
+                `self.training` is `False`.
+            reference_points (Tensor): The initial reference, has shape
+                (bs, num_queries, 4) with the last dimension arranged as
+                (cx, cy, w, h).
+            spatial_shapes (Tensor): Spatial shapes of features in all levels,
+                has shape (num_levels, 2), last dimension represents (h, w).
+            level_start_index (Tensor): The start index of each level.
+                A tensor has shape (num_levels, ) and can be represented
+                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
+            valid_ratios (Tensor): The ratios of the valid width and the valid
+                height relative to the width and the height of features in all
+                levels, has shape (bs, num_levels, 2).
+            reg_branches: (obj:`nn.ModuleList`): Used for refining the
+                regression results.
+            kwargs: Additional argument for attention layers.
+
+        Returns:
+            tuple[Tensor]: Output queries of the Transformer decoder and the
+            corresponding reference points. When `return_intermediate` is
+            `True`, the queries are stacked over all decoder layers with shape
+            (num_dec_layers, bs, num_queries, dim) and the reference points
+            with shape (num_dec_layers + 1, bs, num_queries, 4).
+        """
+        intermediate = []
+        intermediate_reference_points = [reference_points]
+        for lid, layer in enumerate(self.layers):
+            if reference_points.shape[-1] == 4:
+                reference_points_input = (
+                    reference_points[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None]
+                )
+            else:
+                assert reference_points.shape[-1] == 2
+                reference_points_input = reference_points[:, :, None] * valid_ratios[:, None]
+
+            query_sine_embed = coordinate_to_encoding(reference_points_input[:, :, 0, :])
+            query_pos = self.ref_point_head(query_sine_embed)
+
+            query = layer(
+                query.permute(1, 0, 2),
+                query_pos=query_pos.permute(1, 0, 2),
+                value=value.permute(1, 0, 2),
+                key_padding_mask=key_padding_mask,
+                attn_masks=[self_attn_mask, None],
+                spatial_shapes=spatial_shapes,
+                level_start_index=level_start_index,
+                valid_ratios=valid_ratios,
+                reference_points=reference_points_input,
+                **kwargs,
+            )
+
+            query = query.permute(1, 0, 2)
+            if reg_branches is not None:
+                tmp = reg_branches[lid](query)
+                assert reference_points.shape[-1] == 4
+                new_reference_points = tmp + inverse_sigmoid(reference_points, eps=1e-3)
+                new_reference_points = new_reference_points.sigmoid()
+                reference_points = new_reference_points.detach()
+
+            if self.return_intermediate:
+                intermediate.append(self.norm(query))
+                intermediate_reference_points.append(new_reference_points)
+                # NOTE this is for the "Look Forward Twice" module,
+                # in the DeformDETR, reference_points was appended.
+
+        if self.return_intermediate:
+            return torch.stack(intermediate), torch.stack(intermediate_reference_points)
+
+        return query, reference_points
+
+
+class CdnQueryGenerator(BaseModule):
+    """Implement query generator of the Contrastive denoising (CDN).
+
+    Proposed in `DINO: DETR with Improved DeNoising Anchor Boxes for End-to-End Object
+    Detection <https://arxiv.org/abs/2203.03605>`_.
+
+    Code is modified from the `official github repo
+    <https://github.com/IDEA-Research/DINO>`_.
+
+    Args:
+        num_classes (int): Number of object classes.
+        embed_dims (int): The embedding dimensions of the generated queries.
+        num_matching_queries (int): The queries number of the matching part.
+            Used for generating dn_mask.
+        label_noise_scale (float): The scale of label noise, defaults to 0.5.
+        box_noise_scale (float): The scale of box noise, defaults to 1.0.
+        group_cfg (:obj:`ConfigDict` or dict, optional): The config of the
+            denoising queries grouping, includes `dynamic`, `num_dn_queries`,
+            and `num_groups`. Two grouping strategies, 'static dn groups' and
+            'dynamic dn groups', are supported. When `dynamic` is `False`,
+            the `num_groups` should be set, and the number of denoising query
+            groups will always be `num_groups`. When `dynamic` is `True`, the
+            `num_dn_queries` should be set, and the group number will be
+            dynamic to ensure that the denoising queries number will not exceed
+            `num_dn_queries` to prevent large fluctuations of memory. Defaults
+            to `None`.
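+
+    Example (illustrative ``group_cfg`` values for the two strategies):
+
+        >>> dynamic_cfg = dict(dynamic=True, num_dn_queries=100)
+        >>> static_cfg = dict(dynamic=False, num_groups=5)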
+ """ + + def __init__( + self, + num_classes: int, + embed_dims: int, + num_matching_queries: int, + label_noise_scale: float = 0.5, + box_noise_scale: float = 1.0, + group_cfg: Optional[Config] = None, + ) -> None: + super().__init__() + self.num_classes = num_classes + self.embed_dims = embed_dims + self.num_matching_queries = num_matching_queries + self.label_noise_scale = label_noise_scale + self.box_noise_scale = box_noise_scale + + # prepare grouping strategy + group_cfg = {} if group_cfg is None else group_cfg + self.dynamic_dn_groups = group_cfg.get("dynamic", True) + if self.dynamic_dn_groups: + if "num_dn_queries" not in group_cfg: + warnings.warn("'num_dn_queries' should be set when using " "dynamic dn groups, use 100 as default.") + self.num_dn_queries = group_cfg.get("num_dn_queries", 100) + assert isinstance(self.num_dn_queries, int), ( + f"Expected the num_dn_queries to have type int, but got " + f"{self.num_dn_queries}({type(self.num_dn_queries)}). " + ) + else: + assert "num_groups" in group_cfg, "num_groups should be set when using static dn groups" + self.num_groups = group_cfg["num_groups"] + assert isinstance(self.num_groups, int), ( + f"Expected the num_groups to have type int, but got " f"{self.num_groups}({type(self.num_groups)}). " + ) + + # NOTE The original repo of DINO set the num_embeddings 92 for coco, + # 91 (0~90) of which represents target classes and the 92 (91) + # indicates `Unknown` class. However, the embedding of `unknown` class + # is not used in the original DINO. + # TODO: num_classes + 1 or num_classes ? + self.label_embedding = nn.Embedding(self.num_classes, self.embed_dims) + + def __call__(self, batch_info: List[Dict[str, Any]]) -> tuple: + """Generate contrastive denoising (cdn) queries with ground truth. + + Descriptions of the Number Values in code and comments: + - num_target_total: the total target number of the input batch + samples. + - max_num_target: the max target number of the input batch samples. + - num_noisy_targets: the total targets number after adding noise, + i.e., num_target_total * num_groups * 2. + - num_denoising_queries: the length of the output batched queries, + i.e., max_num_target * num_groups * 2. + + NOTE The format of input bboxes in batch_info is unnormalized + (x, y, x, y), and the output bbox queries are embedded by normalized + (cx, cy, w, h) format bboxes going through inverse_sigmoid. + + Args: + batch_info (list[dict[str, union[tuple, tensor]]]): List of the batch + information such as image size, and gt information. + + Returns: + tuple: The outputs of the dn query generator. + + - dn_label_query (Tensor): The output content queries for denoising + part, has shape (bs, num_denoising_queries, dim), where + `num_denoising_queries = max_num_target * num_groups * 2`. + - dn_bbox_query (Tensor): The output reference bboxes as positions + of queries for denoising part, which are embedded by normalized + (cx, cy, w, h) format bboxes going through inverse_sigmoid, has + shape (bs, num_denoising_queries, 4) with the last dimension + arranged as (cx, cy, w, h). + - attn_mask (Tensor): The attention mask to prevent information + leakage from different denoising groups and matching parts, + will be used as `self_attn_mask` of the `decoder`, has shape + (num_queries_total, num_queries_total), where `num_queries_total` + is the sum of `num_denoising_queries` and `num_matching_queries`. 
+ - dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + """ + # normalize bbox and collate ground truth (gt) + gt_labels_list = [] + gt_bboxes_list = [] + for sample in batch_info: + img_h, img_w = sample["img_shape"] + bboxes = sample["bboxes"] + factor = bboxes.new_tensor([img_w, img_h, img_w, img_h]).unsqueeze(0) + bboxes_normalized = bboxes / factor + gt_bboxes_list.append(bboxes_normalized) + gt_labels_list.append(sample["labels"]) + gt_labels = torch.cat(gt_labels_list) # (num_target_total, 4) + gt_bboxes = torch.cat(gt_bboxes_list) + + num_target_list = [len(bboxes) for bboxes in gt_bboxes_list] + max_num_target = max(num_target_list) + num_groups = self.get_num_groups(max_num_target) + + dn_label_query = self.generate_dn_label_query(gt_labels, num_groups) + dn_bbox_query = self.generate_dn_bbox_query(gt_bboxes, num_groups) + + # The `batch_idx` saves the batch index of the corresponding sample + # for each target, has shape (num_target_total). + batch_idx = torch.cat([torch.full_like(t.long(), i) for i, t in enumerate(gt_labels_list)]) + dn_label_query, dn_bbox_query = self.collate_dn_queries( + dn_label_query, dn_bbox_query, batch_idx, len(batch_info), num_groups + ) + + attn_mask = self.generate_dn_mask(max_num_target, num_groups, device=dn_label_query.device) + + dn_meta = dict(num_denoising_queries=int(max_num_target * 2 * num_groups), num_denoising_groups=num_groups) + + return dn_label_query, dn_bbox_query, attn_mask, dn_meta + + def get_num_groups(self, max_num_target: int = None) -> int: + """Calculate denoising query groups number. + + Two grouping strategies, 'static dn groups' and 'dynamic dn groups', + are supported. When `self.dynamic_dn_groups` is `False`, the number + of denoising query groups will always be `self.num_groups`. When + `self.dynamic_dn_groups` is `True`, the group number will be dynamic, + ensuring the denoising queries number will not exceed + `self.num_dn_queries` to prevent large fluctuations of memory. + + NOTE The `num_group` is shared for different samples in a batch. When + the target numbers in the samples varies, the denoising queries of the + samples containing fewer targets are padded to the max length. + + Args: + max_num_target (int, optional): The max target number of the batch + samples. It will only be used when `self.dynamic_dn_groups` is + `True`. Defaults to `None`. + + Returns: + int: The denoising group number of the current batch. + """ + if self.dynamic_dn_groups: + assert max_num_target is not None, "group_queries should be provided when using " "dynamic dn groups" + if max_num_target == 0: + num_groups = 1 + else: + num_groups = self.num_dn_queries // max_num_target + else: + num_groups = self.num_groups + if num_groups < 1: + num_groups = 1 + return int(num_groups) + + def generate_dn_label_query(self, gt_labels: Tensor, num_groups: int) -> Tensor: + """Generate noisy labels and their query embeddings. + + The strategy for generating noisy labels is: Randomly choose labels of + `self.label_noise_scale * 0.5` proportion and override each of them + with a random object category label. + + NOTE Not add noise to all labels. Besides, the `self.label_noise_scale + * 0.5` arg is the ratio of the chosen positions, which is higher than + the actual proportion of noisy labels, because the labels to override + may be correct. 
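Editor's note: a small worked check of the dynamic grouping rule implemented in `get_num_groups` above. The helper name is ours, and the numbers assume the default `num_dn_queries=100`.

    def num_groups_for(max_num_target: int, num_dn_queries: int = 100) -> int:
        # Mirrors get_num_groups: shrink the group count as targets grow.
        if max_num_target == 0:
            return 1
        return max(1, num_dn_queries // max_num_target)

    assert num_groups_for(5) == 20    # 5 targets -> 20 groups of 2 * 5 queries
    assert num_groups_for(150) == 1   # never fewer than one group
    assert num_groups_for(0) == 1     # empty images still get one group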
And the gap becomes larger as the number of target + categories decreases. The users should notice this and modify the scale + arg or the corresponding logic according to specific dataset. + + Args: + gt_labels (Tensor): The concatenated gt labels of all samples + in the batch, has shape (num_target_total, ) where + `num_target_total = sum(num_target_list)`. + num_groups (int): The number of denoising query groups. + + Returns: + Tensor: The query embeddings of noisy labels, has shape + (num_noisy_targets, embed_dims), where `num_noisy_targets = + num_target_total * num_groups * 2`. + """ + assert self.label_noise_scale > 0 + gt_labels_expand = gt_labels.repeat(2 * num_groups, 1).view(-1) # Note `* 2` # noqa + p = torch.rand_like(gt_labels_expand.float()) + chosen_indice = torch.nonzero(p < (self.label_noise_scale * 0.5)).view(-1) # Note `* 0.5` + new_labels = torch.randint_like(chosen_indice, 0, self.num_classes) + noisy_labels_expand = gt_labels_expand.scatter(0, chosen_indice, new_labels) + dn_label_query = self.label_embedding(noisy_labels_expand) + return dn_label_query + + def generate_dn_bbox_query(self, gt_bboxes: Tensor, num_groups: int) -> Tensor: + """Generate noisy bboxes and their query embeddings. + + The strategy for generating noisy bboxes is as follow: + + .. code:: text + + +--------------------+ + | negative | + | +----------+ | + | | positive | | + | | +-----|----+------------+ + | | | | | | + | +----+-----+ | | + | | | | + +---------+----------+ | + | | + | gt bbox | + | | + | +---------+----------+ + | | | | + | | +----+-----+ | + | | | | | | + +-------------|--- +----+ | | + | | positive | | + | +----------+ | + | negative | + +--------------------+ + + The random noise is added to the top-left and down-right point + positions, hence, normalized (x, y, x, y) format of bboxes are + required. The noisy bboxes of positive queries have the points + both within the inner square, while those of negative queries + have the points both between the inner and outer squares. + + Besides, the length of outer square is twice as long as that of + the inner square, i.e., self.box_noise_scale * w_or_h / 2. + NOTE The noise is added to all the bboxes. Moreover, there is still + unconsidered case when one point is within the positive square and + the others is between the inner and outer squares. + + Args: + gt_bboxes (Tensor): The concatenated gt bboxes of all samples + in the batch, has shape (num_target_total, 4) with the last + dimension arranged as (cx, cy, w, h) where + `num_target_total = sum(num_target_list)`. + num_groups (int): The number of denoising query groups. + + Returns: + Tensor: The output noisy bboxes, which are embedded by normalized + (cx, cy, w, h) format bboxes going through inverse_sigmoid, has + shape (num_noisy_targets, 4) with the last dimension arranged as + (cx, cy, w, h), where + `num_noisy_targets = num_target_total * num_groups * 2`. 
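Editor's note: a hedged re-enactment of the label-noising rule implemented in `generate_dn_label_query` above, with toy labels (the real method additionally embeds the result with `self.label_embedding`).

    import torch

    num_classes, label_noise_scale, num_groups = 80, 0.5, 2
    gt_labels = torch.tensor([3, 7, 21])
    # One positive and one negative copy per group, flattened.
    expanded = gt_labels.repeat(2 * num_groups, 1).view(-1)
    p = torch.rand_like(expanded.float())
    chosen = torch.nonzero(p < label_noise_scale * 0.5).view(-1)
    noisy = expanded.scatter(0, chosen, torch.randint_like(chosen, 0, num_classes))
    assert noisy.shape == expanded.shape  # (num_target_total * 2 * num_groups,)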
+ """ + assert self.box_noise_scale > 0 + device = gt_bboxes.device + + # expand gt_bboxes as groups + gt_bboxes_expand = gt_bboxes.repeat(2 * num_groups, 1) # xyxy + + # obtain index of negative queries in gt_bboxes_expand + positive_idx = torch.arange(len(gt_bboxes), dtype=torch.long, device=device) + positive_idx = positive_idx.unsqueeze(0).repeat(num_groups, 1) + positive_idx += 2 * len(gt_bboxes) * torch.arange(num_groups, dtype=torch.long, device=device)[:, None] + positive_idx = positive_idx.flatten() + negative_idx = positive_idx + len(gt_bboxes) + + # determine the sign of each element in the random part of the added + # noise to be positive or negative randomly. + rand_sign = ( + torch.randint_like(gt_bboxes_expand, low=0, high=2, dtype=torch.float32) * 2.0 - 1.0 + ) # [low, high), 1 or -1, randomly + + # calculate the random part of the added noise + rand_part = torch.rand_like(gt_bboxes_expand) # [0, 1) + rand_part[negative_idx] += 1.0 # pos: [0, 1); neg: [1, 2) + rand_part *= rand_sign # pos: (-1, 1); neg: (-2, -1] U [1, 2) + + # add noise to the bboxes + bboxes_whwh = bbox_xyxy_to_cxcywh(gt_bboxes_expand)[:, 2:].repeat(1, 2) + noisy_bboxes_expand = gt_bboxes_expand + torch.mul(rand_part, bboxes_whwh) * self.box_noise_scale / 2 # xyxy + noisy_bboxes_expand = noisy_bboxes_expand.clamp(min=0.0, max=1.0) + noisy_bboxes_expand = bbox_xyxy_to_cxcywh(noisy_bboxes_expand) + + dn_bbox_query = inverse_sigmoid(noisy_bboxes_expand, eps=1e-3) + return dn_bbox_query + + def collate_dn_queries( + self, input_label_query: Tensor, input_bbox_query: Tensor, batch_idx: Tensor, batch_size: int, num_groups: int + ) -> Tuple[Tensor, Tensor]: + """Collate generated queries to obtain batched dn queries. + + The strategy for query collation is as follow: + + .. code:: text + + input_queries (num_target_total, query_dim) + P_A1 P_B1 P_B2 N_A1 N_B1 N_B2 P'A1 P'B1 P'B2 N'A1 N'B1 N'B2 + |________ group1 ________| |________ group2 ________| + | + V + P_A1 Pad0 N_A1 Pad0 P'A1 Pad0 N'A1 Pad0 + P_B1 P_B2 N_B1 N_B2 P'B1 P'B2 N'B1 N'B2 + |____ group1 ____| |____ group2 ____| + batched_queries (batch_size, max_num_target, query_dim) + + where query_dim is 4 for bbox and self.embed_dims for label. + Notation: _-group 1; '-group 2; + A-Sample1(has 1 target); B-sample2(has 2 targets) + + Args: + input_label_query (Tensor): The generated label queries of all + targets, has shape (num_target_total, embed_dims) where + `num_target_total = sum(num_target_list)`. + input_bbox_query (Tensor): The generated bbox queries of all + targets, has shape (num_target_total, 4) with the last + dimension arranged as (cx, cy, w, h). + batch_idx (Tensor): The batch index of the corresponding sample + for each target, has shape (num_target_total). + batch_size (int): The size of the input batch. + num_groups (int): The number of denoising query groups. + + Returns: + tuple[Tensor]: Output batched label and bbox queries. + - batched_label_query (Tensor): The output batched label queries, + has shape (batch_size, max_num_target, embed_dims). + - batched_bbox_query (Tensor): The output batched bbox queries, + has shape (batch_size, max_num_target, 4) with the last dimension + arranged as (cx, cy, w, h). 
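Editor's note: the positive/negative noise split constructed in `generate_dn_bbox_query` above can be summarized with toy boxes; this sketch follows the same sign/offset construction (values here are illustrative only).

    import torch

    num_groups, box_noise_scale = 1, 1.0
    boxes = torch.rand(3, 4)                  # 3 toy GT boxes, xyxy in [0, 1]
    expand = boxes.repeat(2 * num_groups, 1)  # positives first, then negatives
    neg = torch.arange(3) + 3                 # negative-query rows
    sign = torch.randint_like(expand, low=0, high=2, dtype=torch.float32) * 2.0 - 1.0
    part = torch.rand_like(expand)
    part[neg] += 1.0                          # pos noise in [0, 1); neg in [1, 2)
    part *= sign                              # pos (-1, 1); neg +-[1, 2)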
+ """ + device = input_label_query.device + num_target_list = [torch.sum(batch_idx == idx) for idx in range(batch_size)] + max_num_target = max(num_target_list) + num_denoising_queries = int(max_num_target * 2 * num_groups) + + map_query_index = torch.cat([torch.arange(num_target, device=device) for num_target in num_target_list]) + map_query_index = torch.cat([map_query_index + max_num_target * i for i in range(2 * num_groups)]).long() + batch_idx_expand = batch_idx.repeat(2 * num_groups, 1).view(-1) + mapper = (batch_idx_expand, map_query_index) + + batched_label_query = torch.zeros(batch_size, num_denoising_queries, self.embed_dims, device=device) + batched_bbox_query = torch.zeros(batch_size, num_denoising_queries, 4, device=device) + + batched_label_query[mapper] = input_label_query + batched_bbox_query[mapper] = input_bbox_query + return batched_label_query, batched_bbox_query + + def generate_dn_mask(self, max_num_target: int, num_groups: int, device: Union[torch.device, str]) -> Tensor: + """Generate attention mask to prevent information leakage from different denoising groups and matching parts. + + .. code:: text + + 0 0 0 0 1 1 1 1 0 0 0 0 0 + 0 0 0 0 1 1 1 1 0 0 0 0 0 + 0 0 0 0 1 1 1 1 0 0 0 0 0 + 0 0 0 0 1 1 1 1 0 0 0 0 0 + 1 1 1 1 0 0 0 0 0 0 0 0 0 + 1 1 1 1 0 0 0 0 0 0 0 0 0 + 1 1 1 1 0 0 0 0 0 0 0 0 0 + 1 1 1 1 0 0 0 0 0 0 0 0 0 + 1 1 1 1 1 1 1 1 0 0 0 0 0 + 1 1 1 1 1 1 1 1 0 0 0 0 0 + 1 1 1 1 1 1 1 1 0 0 0 0 0 + 1 1 1 1 1 1 1 1 0 0 0 0 0 + 1 1 1 1 1 1 1 1 0 0 0 0 0 + max_num_target |_| |_________| num_matching_queries + |_____________| num_denoising_queries + + 1 -> True (Masked), means 'can not see'. + 0 -> False (UnMasked), means 'can see'. + + Args: + max_num_target (int): The max target number of the input batch + samples. + num_groups (int): The number of denoising query groups. + device (obj:`device` or str): The device of generated mask. + + Returns: + Tensor: The attention mask to prevent information leakage from + different denoising groups and matching parts, will be used as + `self_attn_mask` of the `decoder`, has shape (num_queries_total, + num_queries_total), where `num_queries_total` is the sum of + `num_denoising_queries` and `num_matching_queries`. + """ + num_denoising_queries = int(max_num_target * 2 * num_groups) + num_queries_total = num_denoising_queries + self.num_matching_queries + attn_mask = torch.zeros(num_queries_total, num_queries_total, device=device, dtype=torch.bool) + # Make the matching part cannot see the denoising groups + attn_mask[num_denoising_queries:, :num_denoising_queries] = True + # Make the denoising groups cannot see each other + for i in range(num_groups): + # Mask rows of one group per step. + row_scope = slice(max_num_target * 2 * i, max_num_target * 2 * (i + 1)) + left_scope = slice(max_num_target * 2 * i) + right_scope = slice(max_num_target * 2 * (i + 1), num_denoising_queries) + attn_mask[row_scope, right_scope] = True + attn_mask[row_scope, left_scope] = True + return attn_mask + + +class MLP(BaseModule): + """Very simple multi-layer perceptron (also called FFN) with relu. Mostly used in DETR series detectors. + + Args: + input_dim (int): Feature dim of the input tensor. + hidden_dim (int): Feature dim of the hidden layer. + output_dim (int): Feature dim of the output tensor. + num_layers (int): Number of FFN layers. As the last + layer of MLP only contains FFN (Linear). 
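Editor's note: for a concrete instance of the mask diagram above, the sketch below reproduces the same block structure for 2 targets, 2 groups, and 3 matching queries (a toy re-implementation, not the registered module).

    import torch

    max_num_target, num_groups, num_matching = 2, 2, 3
    num_dn = max_num_target * 2 * num_groups          # 8 denoising queries
    total = num_dn + num_matching
    mask = torch.zeros(total, total, dtype=torch.bool)
    mask[num_dn:, :num_dn] = True                     # matching cannot see dn
    for i in range(num_groups):                       # dn groups are isolated
        rows = slice(max_num_target * 2 * i, max_num_target * 2 * (i + 1))
        mask[rows, : max_num_target * 2 * i] = True
        mask[rows, max_num_target * 2 * (i + 1) : num_dn] = True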
+    """
+
+    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int) -> None:
+        super().__init__()
+        self.num_layers = num_layers
+        h = [hidden_dim] * (num_layers - 1)
+        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Forward function of MLP.
+
+        Args:
+            x (Tensor): The input feature, has shape
+                (num_queries, bs, input_dim).
+
+        Returns:
+            Tensor: The output feature, has shape
+                (num_queries, bs, output_dim).
+        """
+        for i, layer in enumerate(self.layers):
+            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
+        return x
+
+
+def coordinate_to_encoding(
+    coord_tensor: Tensor, num_feats: int = 128, temperature: int = 10000, scale: float = 2 * math.pi
+):
+    """Convert coordinate tensor to positional encoding.
+
+    Args:
+        coord_tensor (Tensor): Coordinate tensor to be converted to
+            positional encoding. With the last dimension as 2 or 4.
+        num_feats (int, optional): The feature dimension for each position
+            along x-axis or y-axis. Note the final returned dimension
+            for each position is 2 times of this value. Defaults to 128.
+        temperature (int, optional): The temperature used for scaling
+            the position embedding. Defaults to 10000.
+        scale (float, optional): A scale factor that scales the position
+            embedding. The scale will be used only when `normalize` is True.
+            Defaults to 2*pi.
+
+    Returns:
+        Tensor: Returned encoded positional tensor.
+    """
+    dim_t = torch.arange(num_feats, dtype=torch.float32, device=coord_tensor.device)
+    dim_t = temperature ** (2 * (dim_t // 2) / num_feats)
+    x_embed = coord_tensor[..., 0] * scale
+    y_embed = coord_tensor[..., 1] * scale
+    pos_x = x_embed[..., None] / dim_t
+    pos_y = y_embed[..., None] / dim_t
+    pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), dim=-1).flatten(2)
+    pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()), dim=-1).flatten(2)
+    if coord_tensor.size(-1) == 2:
+        pos = torch.cat((pos_y, pos_x), dim=-1)
+    elif coord_tensor.size(-1) == 4:
+        w_embed = coord_tensor[..., 2] * scale
+        pos_w = w_embed[..., None] / dim_t
+        pos_w = torch.stack((pos_w[..., 0::2].sin(), pos_w[..., 1::2].cos()), dim=-1).flatten(2)
+
+        h_embed = coord_tensor[..., 3] * scale
+        pos_h = h_embed[..., None] / dim_t
+        pos_h = torch.stack((pos_h[..., 0::2].sin(), pos_h[..., 1::2].cos()), dim=-1).flatten(2)
+
+        pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=-1)
+    else:
+        raise ValueError("Unknown pos_tensor shape(-1):{}".format(coord_tensor.size(-1)))
+    return pos
diff --git a/otx/algorithms/detection/configs/detection/resnet50_dino/data_pipeline.py b/otx/algorithms/detection/configs/detection/resnet50_dino/data_pipeline.py
new file mode 100644
index 00000000000..4b577e21eb8
--- /dev/null
+++ b/otx/algorithms/detection/configs/detection/resnet50_dino/data_pipeline.py
@@ -0,0 +1,115 @@
+"""Data pipeline for DINO."""
+# dataset settings
+dataset_type = "CocoDataset"
+data_root = "data/coco/"
+img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+# train_pipeline, NOTE the img_scale and the Pad's size_divisor are different
+# from the default setting in mmdet.
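Editor's note: one dimensional consequence of `coordinate_to_encoding` from dino_layers.py above is worth spelling out before moving to the pipeline config: with the default `num_feats=128`, a 4-coordinate box yields a 512-dim embedding, which is exactly why `DINOTransformerDecoder.__init__` builds `ref_point_head = MLP(self.embed_dims * 2, ...)` for `embed_dims=256`.

    num_feats, n_box_coords, embed_dims = 128, 4, 256
    assert num_feats * n_box_coords == embed_dims * 2  # 512-dim sine embedding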
+train_pipeline = [
+    dict(type="LoadImageFromFile"),
+    dict(type="LoadAnnotations", with_bbox=True),
+    dict(type="RandomFlip", flip_ratio=0.5),
+    dict(
+        type="AutoAugment",
+        policies=[
+            [
+                dict(
+                    type="Resize",
+                    img_scale=[
+                        (480, 1333),
+                        (512, 1333),
+                        (544, 1333),
+                        (576, 1333),
+                        (608, 1333),
+                        (640, 1333),
+                        (672, 1333),
+                        (704, 1333),
+                        (736, 1333),
+                        (768, 1333),
+                        (800, 1333),
+                    ],
+                    multiscale_mode="value",
+                    keep_ratio=True,
+                )
+            ],
+            [
+                dict(
+                    type="Resize",
+                    # The ratio of all images in the train dataset is < 7,
+                    # following the original impl
+                    img_scale=[(400, 4200), (500, 4200), (600, 4200)],
+                    multiscale_mode="value",
+                    keep_ratio=True,
+                ),
+                dict(type="RandomCrop", crop_type="absolute_range", crop_size=(384, 600), allow_negative_crop=True),
+                dict(
+                    type="Resize",
+                    img_scale=[
+                        (480, 1333),
+                        (512, 1333),
+                        (544, 1333),
+                        (576, 1333),
+                        (608, 1333),
+                        (640, 1333),
+                        (672, 1333),
+                        (704, 1333),
+                        (736, 1333),
+                        (768, 1333),
+                        (800, 1333),
+                    ],
+                    multiscale_mode="value",
+                    override=True,
+                    keep_ratio=True,
+                ),
+            ],
+        ],
+    ),
+    dict(type="Normalize", **img_norm_cfg),
+    dict(type="Pad", size_divisor=1),
+    dict(type="DefaultFormatBundle"),
+    dict(type="Collect", keys=["img", "gt_bboxes", "gt_labels"]),
+]
+# test_pipeline, NOTE the Pad's size_divisor is different from the default
+# setting (size_divisor=32). There is little effect on the performance
+# whether we use the default setting or size_divisor=1.
+test_pipeline = [
+    dict(type="LoadImageFromFile"),
+    dict(
+        type="MultiScaleFlipAug",
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type="Resize", keep_ratio=True),
+            dict(type="RandomFlip"),
+            dict(type="Normalize", **img_norm_cfg),
+            dict(type="Pad", size_divisor=1),
+            dict(type="ImageToTensor", keys=["img"]),
+            dict(type="Collect", keys=["img"]),
+        ],
+    ),
+]
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        filter_empty_gt=False,
+        ann_file=data_root + "annotations/instances_train2017.json",
+        img_prefix=data_root + "train2017/",
+        pipeline=train_pipeline,
+    ),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + "annotations/instances_val2017.json",
+        img_prefix=data_root + "val2017/",
+        pipeline=test_pipeline,
+    ),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + "annotations/instances_val2017.json",
+        img_prefix=data_root + "val2017/",
+        pipeline=test_pipeline,
+    ),
+)
+evaluation = dict(interval=1, metric="bbox")
diff --git a/otx/algorithms/detection/configs/detection/resnet50_dino/deployment.py b/otx/algorithms/detection/configs/detection/resnet50_dino/deployment.py
new file mode 100644
index 00000000000..76b4a6544f5
--- /dev/null
+++ b/otx/algorithms/detection/configs/detection/resnet50_dino/deployment.py
@@ -0,0 +1,12 @@
+"""MMDeploy config of DINO model for Detection Task."""
+
+_base_ = ["../../base/deployments/base_detection_dynamic.py"]
+
+ir_config = dict(
+    output_names=["boxes", "labels"],
+    opset_version=16,
+)
+
+backend_config = dict(
+    model_inputs=[dict(opt_shapes=dict(input=[-1, 3, 800, 1333]))],
+)
diff --git a/otx/algorithms/detection/configs/detection/resnet50_dino/model.py b/otx/algorithms/detection/configs/detection/resnet50_dino/model.py
new file mode 100644
index 00000000000..a9cdf215901
--- /dev/null
+++ b/otx/algorithms/detection/configs/detection/resnet50_dino/model.py
@@ -0,0 +1,117 @@
+"""Model config for DINO."""
+model = dict(
+    type="CustomDINO",
+    backbone=dict(
+        type="ResNet",
+        depth=50,
+        num_stages=4,
+        out_indices=(1, 2,
3), + frozen_stages=1, + norm_cfg=dict(type="BN", requires_grad=False), + norm_eval=True, + style="pytorch", + init_cfg=dict(type="Pretrained", checkpoint="torchvision://resnet50"), + ), + neck=dict( + type="ChannelMapper", + in_channels=[512, 1024, 2048], + kernel_size=1, + out_channels=256, + act_cfg=None, + norm_cfg=dict(type="GN", num_groups=32), + num_outs=4, + ), + bbox_head=dict( + type="CustomDINOHead", + num_query=900, + num_classes=80, + in_channels=2048, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=True, + transformer=dict( + type="CustomDINOTransformer", + encoder=dict( + type="DetrTransformerEncoder", + num_layers=6, + transformerlayers=dict( + type="BaseTransformerLayer", + attn_cfgs=dict(type="MultiScaleDeformableAttention", embed_dims=256, dropout=0.0), + feedforward_channels=2048, + ffn_dropout=0.0, + operation_order=("self_attn", "norm", "ffn", "norm"), + ), + ), + decoder=dict( + type="DINOTransformerDecoder", + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type="DetrTransformerDecoderLayer", + attn_cfgs=[ + dict(type="MultiheadAttention", embed_dims=256, num_heads=8, dropout=0.0), + dict(type="MultiScaleDeformableAttention", embed_dims=256, dropout=0.0), + ], + feedforward_channels=2048, + ffn_dropout=0.0, + operation_order=("self_attn", "norm", "cross_attn", "norm", "ffn", "norm"), + ), + ), + ), + positional_encoding=dict( + type="SinePositionalEncoding", num_feats=128, normalize=True, offset=0.0, temperature=20 + ), + loss_cls=dict(type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), + loss_bbox=dict(type="L1Loss", loss_weight=5.0), + loss_iou=dict(type="GIoULoss", loss_weight=2.0), + dn_cfg=dict( + label_noise_scale=0.5, + box_noise_scale=1.0, # 0.4 for DN-DETR + group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=100), + ), + ), + # training and testing settings + train_cfg=dict( + assigner=dict( + type="HungarianAssigner", + cls_cost=dict(type="FocalLossCost", weight=1.0), + reg_cost=dict(type="BBoxL1Cost", weight=5.0, box_format="xywh"), + iou_cost=dict(type="IoUCost", iou_mode="giou", weight=2.0), + ) + ), + test_cfg=dict(max_per_img=300), +) +# optimizer +optimizer = dict( + type="AdamW", + lr=1e-4, + weight_decay=0.0001, + paramwise_cfg=dict( + custom_keys={ + "backbone": dict(lr_mult=0.1), + "sampling_offsets": dict(lr_mult=0.1), + "reference_points": dict(lr_mult=0.1), + } + ), +) +optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2)) +# learning policy +lr_config = dict(policy="step", step=[10]) +runner = dict(type="EpochRunnerWithCancel", max_epochs=12) +load_from = ( + "https://download.openmmlab.com/mmdetection/v3.0/dino/" + "dino-4scale_r50_8xb2-12e_coco/dino-4scale_r50_8xb2-12e_coco_20221202_182705-55b2bba2.pth" +) +resume_from = None + +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=100, + hooks=[ + dict(type="TextLoggerHook"), + ], +) +log_level = "INFO" +workflow = [("train", 1)] +task_adapt = dict(op="REPLACE", type="temp", efficient_mode=False, use_mpa_anchor=False) diff --git a/otx/algorithms/detection/configs/detection/resnet50_dino/template_experimental.yaml b/otx/algorithms/detection/configs/detection/resnet50_dino/template_experimental.yaml new file mode 100644 index 00000000000..cddcef9542f --- /dev/null +++ b/otx/algorithms/detection/configs/detection/resnet50_dino/template_experimental.yaml @@ -0,0 +1,64 @@ +# Description. 
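Editor's note: a brief worked check on the optimizer block in model.py above, before the template definition. The `paramwise_cfg` multipliers mean backbone, sampling-offset, and reference-point parameters train at a tenth of the base rate (values copied from the config above).

    base_lr, lr_mult = 1e-4, 0.1
    assert abs(base_lr * lr_mult - 1e-5) < 1e-12  # effective backbone LR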
+model_template_id: Custom_Object_Detection_Gen3_DINO +name: DINO +task_type: DETECTION +task_family: VISION +instantiation: "CLASS" +summary: Class-Incremental Object Detection for DINO +application: ~ + +# Algo backend. +framework: OTXDetection v2.9.1 + +# Task implementations. +entrypoints: + base: otx.algorithms.detection.adapters.mmdet.task.MMDetectionTask + openvino: otx.algorithms.detection.adapters.openvino.task.OpenVINODetectionTask + nncf: otx.algorithms.detection.adapters.mmdet.nncf.task.DetectionNNCFTask + +# Capabilities. +capabilities: + - compute_representations + +# Hyperparameters. +hyper_parameters: + base_path: ../configuration.yaml + parameter_overrides: + learning_parameters: + batch_size: + default_value: 2 + auto_hpo_state: POSSIBLE + learning_rate: + default_value: 0.0001 + auto_hpo_state: POSSIBLE + learning_rate_warmup_iters: + default_value: 3 + num_iters: + default_value: 12 + nncf_optimization: + enable_quantization: + default_value: true + enable_pruning: + default_value: false + pruning_supported: + default_value: true + maximal_accuracy_degradation: + default_value: 1.0 + algo_backend: + train_type: + default_value: Incremental + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Stats. +gigaflops: ??? +size: ??? +# # Inference options. Defined by OpenVINO capabilities, not Algo Backend or Platform. +# inference_targets: +# - CPU +# - GPU +# - VPU From a00c25b5b3ace8ebbe3f0713c202e46473c7d9ad Mon Sep 17 00:00:00 2001 From: jaegukhyun Date: Fri, 23 Jun 2023 13:47:24 +0900 Subject: [PATCH 02/11] Modify docstrings --- .../models/detectors/custom_dino_detector.py | 2 +- .../mmdet/models/heads/custom_dino_head.py | 120 ++++++++++++++---- .../adapters/mmdet/models/heads/detr_head.py | 29 +++-- .../adapters/mmdet/models/layers/dino.py | 52 +++++--- .../mmdet/models/layers/dino_layers.py | 9 +- 5 files changed, 155 insertions(+), 57 deletions(-) diff --git a/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_dino_detector.py b/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_dino_detector.py index 3bfd97bfa05..e84bb25eeec 100644 --- a/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_dino_detector.py +++ b/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_dino_detector.py @@ -70,7 +70,7 @@ def load_state_dict_pre_hook(model, ckpt_dict, *args, **kwargs): "otx.algorithms.detection.adapters.mmdet.models.detectors.custom_dino_detector.CustomDINO.simple_test" ) def custom_dino__simple_test(ctx, self, img, img_metas, **kwargs): - """Function for custom_mask_rcnn__simple_test.""" + """Function for custom_dino__simple_test.""" height = int(img_metas[0]["img_shape"][0]) width = int(img_metas[0]["img_shape"][1]) img_metas[0]["batch_input_shape"] = (height, width) diff --git a/otx/algorithms/detection/adapters/mmdet/models/heads/custom_dino_head.py b/otx/algorithms/detection/adapters/mmdet/models/heads/custom_dino_head.py index e17ec30bf55..eae551f7ebd 100644 --- a/otx/algorithms/detection/adapters/mmdet/models/heads/custom_dino_head.py +++ b/otx/algorithms/detection/adapters/mmdet/models/heads/custom_dino_head.py @@ -20,7 +20,13 @@ @HEADS.register_module() class CustomDINOHead(DeformableDETRHead, DETRHeadExtension): - """Head of DINO.""" + """Head of DINO. + + Based on detr_head.py and deformable_detr.py in mmdet2.x, some functions from dino_head.py in mmdet3.x are added. 
+ Forward structure: + - Training: self.forward_train -> self.forward_transformer -> self.forward -> self.loss + - Inference: self.simple_test_bboxes -> self.forward_transformer -> self.forward -> self.get_bboxes + """ def __init__(self, *args, dn_cfg: Optional[Config] = None, **kwargs): super().__init__(*args, **kwargs) @@ -45,6 +51,10 @@ def _init_layers(self): def forward_train(self, x, img_metas, gt_bboxes, gt_labels=None, gt_bboxes_ignore=None, proposal_cfg=None): """Forward function for training mode. + Origin impelmentation: forward_train function of detr_head.py in mmdet2.x + What's changed: Divided self.forward into self.forward_transformer + self.forward. + This kind of structure is from mmdet3.x. + Args: x (list[Tensor]): Features from backbone. img_metas (list[dict]): Meta information of each image, e.g., @@ -72,7 +82,11 @@ def forward_train(self, x, img_metas, gt_bboxes, gt_labels=None, gt_bboxes_ignor return losses def forward_transformer(self, mlvl_feats, gt_bboxes, gt_labels, img_metas): - """Forward function. + """Transformers's forward function. + + Origin implementation: forward function of deformable_detr_head.py in mmdet2.x + What's changed: Original implementation has post-processing process after getting outputs from + self.transformer. However, this function directly return outputs from self.transformer Args: mlvl_feats (tuple[Tensor]): Features from the upstream @@ -99,6 +113,10 @@ def forward_transformer(self, mlvl_feats, gt_bboxes, gt_labels, img_metas): encode feature map, has shape (N, h*w, 4). Only when \ as_two_stage is True it would be returned, otherwise \ `None` would be returned. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. """ batch_size = mlvl_feats[0].size(0) @@ -140,10 +158,14 @@ def loss( enc_outputs_class: Tensor, enc_outputs_coord: Tensor, dn_meta: Dict[str, int], - batch_data_samples, + batch_data_samples: List[Config], ) -> dict: """Perform forward propagation and loss calculation. + Original implementation: loss function of dino_head.py in mmdet3.x + What's changed: Change the name of function of loss_by_feat to loss_by_feat_two_stage since + there are changes in function input from parent's implementation. + Args: hidden_states (Tensor): Hidden states output from each decoder layer, has shape (num_decoder_layers, bs, num_queries_total, @@ -162,13 +184,12 @@ def loss( enc_outputs_coord (Tensor): The proposal generate from the encode feature map, has shape (bs, num_feat_points, 4) with the last dimension arranged as (cx, cy, w, h). - batch_data_samples (list[:obj:`DetDataSample`]): The Data - Samples. It usually includes information such as - `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. dn_meta (Dict[str, int]): The dictionary saves information about group collation, including 'num_denoising_queries' and 'num_denoising_groups'. It will be used for split outputs of denoising and matching parts and loss calculation. + batch_data_samples (List[Config]): This is same with batch_data_samples in mmdet3.x + It contains meta_info(==img_metas) and gt_instances(==(gt_bboxes, gt_labels)) Returns: dict: A dictionary of loss components. @@ -187,8 +208,35 @@ def loss( def forward(self, hidden_states, references): """Forward function. - T.B.D. 
+ Original implementation: forward function of deformable_detr_head.py in mmdet3.x + What's changed: None + + Args: + hidden_states (Tensor): Hidden states output from each decoder + layer, has shape (num_decoder_layers, bs, num_queries, dim). + references (list[Tensor]): List of the reference from the decoder. + The first reference is the `init_reference` (initial) and the + other num_decoder_layers(6) references are `inter_references` + (intermediate). The `init_reference` has shape (bs, + num_queries, 4) when `as_two_stage` of the detector is `True`, + otherwise (bs, num_queries, 2). Each `inter_reference` has + shape (bs, num_queries, 4) when `with_box_refine` of the + detector is `True`, otherwise (bs, num_queries, 2). The + coordinates are arranged as (cx, cy) when the last dimension is + 2, and (cx, cy, w, h) when it is 4. + + Returns: + tuple[Tensor]: results of head containing the following tensor. + + - all_layers_outputs_classes (Tensor): Outputs from the + classification head, has shape (num_decoder_layers, bs, + num_queries, cls_out_channels). + - all_layers_outputs_coords (Tensor): Sigmoid outputs from the + regression head with normalized coordinate format (cx, cy, w, + h), has shape (num_decoder_layers, bs, num_queries, 4) with the + last dimension arranged as (cx, cy, w, h). """ + all_layers_outputs_classes = [] all_layers_outputs_coords = [] @@ -224,13 +272,16 @@ def loss_by_feat_two_stage( all_layers_bbox_preds: Tensor, enc_cls_scores: Tensor, enc_bbox_preds: Tensor, - batch_gt_instances, + batch_gt_instances: List[Config], batch_img_metas: List[dict], dn_meta: Dict[str, int], batch_gt_instances_ignore=None, ) -> Dict[str, Tensor]: """Loss function. + Original implementation: loss_by_feat function of dino_head.py in mmdet3.x + What's changed: Name of function is changed. Parent's loss_by_feat function has different inputs. + Args: all_layers_cls_scores (Tensor): Classification scores of all decoder layers, has shape (num_decoder_layers, bs, @@ -246,9 +297,8 @@ def loss_by_feat_two_stage( enc_bbox_preds (Tensor): The proposal generate from the encode feature map, has shape (bs, num_feat_points, 4) with the last dimension arranged as (cx, cy, w, h). - batch_gt_instances (list[:obj:`InstanceData`]): Batch of - gt_instance. It usually includes ``bboxes`` and ``labels`` - attributes. + batch_gt_instances (List[Config]): Batch of gt_instance. + It usually includes ``bboxes`` and ``labels`` attributes. batch_img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. dn_meta (Dict[str, int]): The dictionary saves information about @@ -315,12 +365,15 @@ def loss_dn( self, all_layers_denoising_cls_scores: Tensor, all_layers_denoising_bbox_preds: Tensor, - batch_gt_instances, + batch_gt_instances: List[Config], batch_img_metas: List[dict], dn_meta: Dict[str, int], ) -> Tuple[List[Tensor], ...]: """Calculate denoising loss. + Original implementation: loss_dn function of dino_head.py in mmdet3.x + What's changed: None + Args: all_layers_denoising_cls_scores (Tensor): Classification scores of all decoder layers in denoising part, has shape ( @@ -330,9 +383,8 @@ def loss_dn( decoder layers in denoising part. Each is a 4D-tensor with normalized coordinate format (cx, cy, w, h) and has shape (num_decoder_layers, bs, num_denoising_queries, 4). - batch_gt_instances (list[:obj:`InstanceData`]): Batch of - gt_instance. It usually includes ``bboxes`` and ``labels`` - attributes. + batch_gt_instances (List[Config]): Batch of gt_instance. 
+ It usually includes ``bboxes`` and ``labels`` attributes. batch_img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. dn_meta (Dict[str, int]): The dictionary saves information about @@ -357,12 +409,15 @@ def _loss_dn_single( self, dn_cls_scores: Tensor, dn_bbox_preds: Tensor, - batch_gt_instances, + batch_gt_instances: List[Config], batch_img_metas: List[dict], dn_meta: Dict[str, int], ) -> Tuple[Tensor, ...]: """Denoising loss for outputs from a single decoder layer. + Original implementation: _loss_dn_single function of dino_head.py in mmdet3.x + What's changed: None + Args: dn_cls_scores (Tensor): Classification scores of a single decoder layer in denoising part, has shape (bs, num_denoising_queries, @@ -371,9 +426,8 @@ def _loss_dn_single( layer in denoising part. Each is a 4D-tensor with normalized coordinate format (cx, cy, w, h) and has shape (bs, num_denoising_queries, 4). - batch_gt_instances (list[:obj:`InstanceData`]): Batch of - gt_instance. It usually includes ``bboxes`` and ``labels`` - attributes. + batch_gt_instances (List[Config]): Batch of gt_instance. + It usually includes ``bboxes`` and ``labels`` attributes. batch_img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. dn_meta (Dict[str, int]): The dictionary saves information about @@ -439,13 +493,17 @@ def _loss_dn_single( loss_bbox = self.loss_bbox(bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos) return loss_cls, loss_bbox, loss_iou - def get_dn_targets(self, batch_gt_instances, batch_img_metas: List[Dict], dn_meta: Dict[str, int]) -> tuple: + def get_dn_targets( + self, batch_gt_instances: List[Config], batch_img_metas: List[Dict], dn_meta: Dict[str, int] + ) -> tuple: """Get targets in denoising part for a batch of images. + Original implementation: get_dn_targets function of dino_head.py in mmdet3.x + What's changed: None + Args: - batch_gt_instances (list[:obj:`InstanceData`]): Batch of - gt_instance. It usually includes ``bboxes`` and ``labels`` - attributes. + batch_gt_instances (List[Config]): Batch of gt_instance. + It usually includes ``bboxes`` and ``labels`` attributes. batch_img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. dn_meta (Dict[str, int]): The dictionary saves information about @@ -475,13 +533,14 @@ def get_dn_targets(self, batch_gt_instances, batch_img_metas: List[Dict], dn_met num_total_neg = sum((inds.numel() for inds in neg_inds_list)) return (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, num_total_pos, num_total_neg) - def _get_dn_targets_single(self, gt_instances, img_meta: dict, dn_meta: Dict[str, int]) -> tuple: + def _get_dn_targets_single(self, gt_instances: Config, img_meta: dict, dn_meta: Dict[str, int]) -> tuple: """Get targets in denoising part for one image. + Original implementation: _get_dn_targets_single function of dino_head.py in mmdet3.x + What's changed: None + Args: - gt_instances (:obj:`InstanceData`): Ground truth of instance - annotations. It should includes ``bboxes`` and ``labels`` - attributes. + gt_instances (Config): A gt_instance which usually includes ``bboxes`` and ``labels`` attributes. img_meta (dict): Meta information for one image. dn_meta (Dict[str, int]): The dictionary saves information about group collation, including 'num_denoising_queries' and @@ -544,6 +603,9 @@ def split_outputs( ) -> Tuple[Tensor, ...]: """Split outputs of the denoising part and the matching part. 
+ Original implementation: split_outputs function of dino_head.py in mmdet3.x + What's changed: None + For the total outputs of `num_queries_total` length, the former `num_denoising_queries` outputs are from denoising queries, and the rest `num_matching_queries` ones are from matching queries, @@ -602,6 +664,10 @@ def split_outputs( def simple_test_bboxes(self, feats, img_metas, rescale=False): """Test det bboxes without test-time augmentation. + Original implementation: simple_test_bboxes funciton of detr_head.py in mmdet2.x + What's changed: self.forward function is divided into self.forward_transformer and self.forward function. + This changes is from mmdet3.x + Args: feats (tuple[torch.Tensor]): Multi-level features from the upstream network, each is a 4D-tensor. diff --git a/otx/algorithms/detection/adapters/mmdet/models/heads/detr_head.py b/otx/algorithms/detection/adapters/mmdet/models/heads/detr_head.py index 86841e12022..29ab16e22e2 100644 --- a/otx/algorithms/detection/adapters/mmdet/models/heads/detr_head.py +++ b/otx/algorithms/detection/adapters/mmdet/models/heads/detr_head.py @@ -7,18 +7,24 @@ import torch from mmcv.runner import BaseModule +from mmcv.utils import Config from mmdet.core import bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh, multi_apply, reduce_mean from torch import Tensor class DETRHeadExtension(BaseModule): - """Head of DETR. DETR:End-to-End Object Detection with Transformers.""" + """Head of DETR. DETR:End-to-End Object Detection with Transformers. + + Origin implementation: DETRHead of detr_head.py in mmdet3.x + What's changed: Change data type of batch_gt_instances from InstanceList to List[Config]. + Since InstanceList is a new data type from mmdet3.x, List[Config] will replace it. + """ def loss_by_feat( self, all_layers_cls_scores: Tensor, all_layers_bbox_preds: Tensor, - batch_gt_instances, + batch_gt_instances: List[Config], batch_img_metas: List[dict], batch_gt_instances_ignore=None, ) -> Dict[str, Tensor]: @@ -35,9 +41,8 @@ def loss_by_feat( outputs of each decoder layers. Each is a 4D-tensor with normalized coordinate format (cx, cy, w, h) and shape (num_decoder_layers, bs, num_queries, 4). - batch_gt_instances (list[:obj:`InstanceData`]): Batch of - gt_instance. It usually includes ``bboxes`` and ``labels`` - attributes. + batch_gt_instances (List[Config]): Batch of gt_instance. + It usually includes ``bboxes`` and ``labels`` attributes. batch_img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): @@ -75,7 +80,7 @@ def loss_by_feat( return loss_dict def loss_by_feat_single( - self, cls_scores: Tensor, bbox_preds: Tensor, batch_gt_instances, batch_img_metas: List[dict] + self, cls_scores: Tensor, bbox_preds: Tensor, batch_gt_instances: List[Config], batch_img_metas: List[dict] ) -> Tuple[Tensor, Tensor, Tensor]: """Loss function for outputs from a single decoder layer of a single feature level. @@ -85,9 +90,8 @@ def loss_by_feat_single( bbox_preds (Tensor): Sigmoid outputs from a single decoder layer for all images, with normalized coordinate (cx, cy, w, h) and shape (bs, num_queries, 4). - batch_gt_instances (list[:obj:`InstanceData`]): Batch of - gt_instance. It usually includes ``bboxes`` and ``labels`` - attributes. + batch_gt_instances (List[Config]): Batch of gt_instance. + It usually includes ``bboxes`` and ``labels`` attributes. batch_img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. 
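Editor's note: to illustrate the denoising/matching split described in `split_outputs` above, here is a shape-only sketch with toy sizes; the real code reads `num_denoising_queries` from `dn_meta`.

    import torch

    num_denoising_queries = 8                  # from dn_meta in the real code
    scores = torch.randn(6, 2, 11, 80)         # (layers, bs, queries, classes)
    dn_part = scores[:, :, :num_denoising_queries, :]
    matching_part = scores[:, :, num_denoising_queries:, :]
    assert dn_part.shape[2] + matching_part.shape[2] == scores.shape[2]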
@@ -155,7 +159,7 @@ def _get_targets(
         self,
         cls_scores_list: List[Tensor],
         bbox_preds_list: List[Tensor],
-        batch_gt_instances,
+        batch_gt_instances: List[Config],
         batch_img_metas: List[dict],
     ) -> tuple:
         """Compute regression and classification targets for a batch image.
@@ -169,9 +173,8 @@ def _get_targets(
             bbox_preds_list (list[Tensor]): Sigmoid outputs from a single
                 decoder layer for each image, with normalized coordinate
                 (cx, cy, w, h) and shape [num_queries, 4].
-            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
-                gt_instance. It usually includes ``bboxes`` and ``labels``
-                attributes.
+            batch_gt_instances (List[Config]): Batch of gt_instance.
+                It usually includes ``bboxes`` and ``labels`` attributes.
             batch_img_metas (list[dict]): Meta information of each image,
                 e.g., image size, scaling factor, etc.

diff --git a/otx/algorithms/detection/adapters/mmdet/models/layers/dino.py b/otx/algorithms/detection/adapters/mmdet/models/layers/dino.py
index f942b5b1717..cab3f23183d 100644
--- a/otx/algorithms/detection/adapters/mmdet/models/layers/dino.py
+++ b/otx/algorithms/detection/adapters/mmdet/models/layers/dino.py
@@ -3,17 +3,28 @@
 # SPDX-License-Identifier: Apache-2.0
 #

+from typing import Dict, List, Optional, Tuple, Union
+
 import torch
 from mmdet.models.utils.builder import TRANSFORMER
 from mmdet.models.utils.transformer import DeformableDetrTransformer
+from torch import Tensor, nn


 @TRANSFORMER.register_module()
 class CustomDINOTransformer(DeformableDetrTransformer):
-    """Custom DINO transformer."""
+    """Custom DINO transformer.
+
+    Original implementation: mmdet.models.utils.transformer.DeformableDetrTransformer in mmdet2.x
+    What's changed: The forward function is modified.
+    The modified implementation comes from mmdet.models.detectors.dino.DINO in mmdet3.x
+    """

     def init_layers(self):
-        """Initialize layers of the DINO."""
+        """Initialize layers of the DINO.
+
+        Unlike Deformable DETR, DINO does not need pos_trans, pos_trans_norm.
+        """
         self.level_embeds = torch.nn.Parameter(torch.Tensor(self.num_feature_levels, self.embed_dims))

         self.enc_output = torch.nn.Linear(self.embed_dims, self.embed_dims)
@@ -21,17 +32,26 @@ def init_layers(self):

     def forward(
         self,
-        batch_info,
-        mlvl_feats,
-        mlvl_masks,
-        query_embed,
-        mlvl_pos_embeds,
-        reg_branches=None,
-        cls_branches=None,
+        batch_info: List[Dict[str, Union[Tuple, Tensor]]],
+        mlvl_feats: List[Tensor],
+        mlvl_masks: List[Tensor],
+        query_embed: Tensor,
+        mlvl_pos_embeds: List[Tensor],
+        reg_branches: Optional[nn.ModuleList] = None,
+        cls_branches: Optional[nn.ModuleList] = None,
         **kwargs
     ):
         """Forward function for `Transformer`.

+        What's changed:
+        In mmdet3.x, the forward pass of the transformer is divided into
+        pre_transformer() -> forward_encoder() -> pre_decoder() -> forward_decoder().
+        In comparison, the mmdet2.x forward function takes charge of all of the above.
+        The differences between Deformable DETR and DINO occur in pre_decoder() and forward_decoder(),
+        so this function modifies those parts. The modified implementation comes from
+        pre_decoder() and forward_decoder() of mmdet.models.detectors.dino.DINO in mmdet3.x.
+
         Args:
             batch_info(list(dict(str, union(tuple, tensor)))):
                 Information about batch such as image shaep,
@@ -65,8 +85,6 @@
                 return_intermediate_dec is True output has shape \
                 (num_dec_layers, bs, num_query, embed_dims), else has \
                 shape (1, bs, num_query, embed_dims).
-            - init_reference_out: The initial value of reference \
-                points, has shape (bs, num_queries, 4).
- inter_references_out: The internal value of reference \
                points in decoder, has shape \
                (num_dec_layers, bs,num_query, embed_dims)
@@ -81,11 +99,15 @@ def forward(
                 (batch, h*w, 4). Only would \
                 be returned when `as_two_stage` is True, \
                 otherwise None.
+            - dn_meta (Dict[str, int]): The dictionary saves information about
+                group collation, including 'num_denoising_queries' and
+                'num_denoising_groups'. It will be used for split outputs of
+                denoising and matching parts and loss calculation.
         """
-        feat_flatten = []
-        mask_flatten = []
-        lvl_pos_embed_flatten = []
-        spatial_shapes = []
+        feat_flatten: Union[Tensor, List[Tensor]] = []
+        mask_flatten: Union[Tensor, List[Tensor]] = []
+        lvl_pos_embed_flatten: Union[Tensor, List[Tensor]] = []
+        spatial_shapes: Union[Tensor, List[Tensor]] = []
         for lvl, (feat, mask, pos_embed) in enumerate(zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)):
             bs, c, h, w = feat.shape
             spatial_shape = (h, w)
diff --git a/otx/algorithms/detection/adapters/mmdet/models/layers/dino_layers.py b/otx/algorithms/detection/adapters/mmdet/models/layers/dino_layers.py
index fb82a1febb1..4dda964e3d8 100644
--- a/otx/algorithms/detection/adapters/mmdet/models/layers/dino_layers.py
+++ b/otx/algorithms/detection/adapters/mmdet/models/layers/dino_layers.py
@@ -39,7 +39,11 @@ def forward(
         reg_branches: nn.ModuleList,
         **kwargs,
     ) -> Tensor:
-        """Forward function of Transformer encoder.
+        """Forward function of Transformer decoder.
+
+        Original implementation: forward function of DinoTransformerDecoder in mmdet3.x.
+        What's changed: Since the implementation of the base transformer layer differs between mmdet2.x
+        and mmdet3.x, the input shape of each layer and some of its input parameters are modified.

         Args:
             query (Tensor): The input query, has shape (num_queries, bs, dim).
@@ -126,6 +130,9 @@ class CdnQueryGenerator(BaseModule):
     Code is modified from the `official github repo
     `_.

+    Original implementation: mmdet.models.layers.transformer.dino_layers.CdnQueryGenerator
+    What's changed: None
+
     Args:
         num_classes (int): Number of object classes.
         embed_dims (int): The embedding dimensions of the generated queries.
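Editor's note: as context for the flattening loop shown in the hunk above, a minimal sketch of how multi-level feature maps become the transformer's flat token sequence (toy shapes only).

    import torch

    feats = [torch.randn(2, 256, 23, 24), torch.randn(2, 256, 12, 12)]
    spatial_shapes = torch.tensor([[f.shape[2], f.shape[3]] for f in feats])
    flat = torch.cat([f.flatten(2).transpose(1, 2) for f in feats], dim=1)
    assert flat.shape == (2, 23 * 24 + 12 * 12, 256)  # (bs, sum(h*w), c)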
From fc4bc6ac443e66488b9c94940dbca1dea92e5681 Mon Sep 17 00:00:00 2001 From: jaegukhyun Date: Fri, 23 Jun 2023 14:06:04 +0900 Subject: [PATCH 03/11] Add mmengine to detection requirements --- requirements/detection.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/detection.txt b/requirements/detection.txt index 4c3802395d4..a0e94ea71bc 100644 --- a/requirements/detection.txt +++ b/requirements/detection.txt @@ -6,4 +6,5 @@ pytorchcv mmcls==0.25.0 timm==0.6.12 mmdeploy==0.14.0 +mmengine==0.7.4 scikit-image From a6712be97fac6a89eef7ca3eaf8a6671a2a429ad Mon Sep 17 00:00:00 2001 From: jaegukhyun Date: Fri, 23 Jun 2023 16:00:59 +0900 Subject: [PATCH 04/11] Add unit tests --- .../mmdet/models/heads/custom_dino_head.py | 30 ++- .../mmdet/models/detectors/conftest.py | 92 +++++++- .../detectors/test_custom_dino_detector.py | 52 +++++ .../adapters/mmdet/models/heads/__init__.py | 3 + .../models/heads/test_custom_dino_head.py | 213 ++++++++++++++++++ 5 files changed, 381 insertions(+), 9 deletions(-) create mode 100644 tests/unit/algorithms/detection/adapters/mmdet/models/detectors/test_custom_dino_detector.py create mode 100644 tests/unit/algorithms/detection/adapters/mmdet/models/heads/__init__.py create mode 100644 tests/unit/algorithms/detection/adapters/mmdet/models/heads/test_custom_dino_head.py diff --git a/otx/algorithms/detection/adapters/mmdet/models/heads/custom_dino_head.py b/otx/algorithms/detection/adapters/mmdet/models/heads/custom_dino_head.py index eae551f7ebd..7da7d4fa8a8 100644 --- a/otx/algorithms/detection/adapters/mmdet/models/heads/custom_dino_head.py +++ b/otx/algorithms/detection/adapters/mmdet/models/heads/custom_dino_head.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 # -from typing import Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import torch import torch.nn.functional as F @@ -48,7 +48,15 @@ def _init_layers(self): super()._init_layers() self.query_embedding = torch.nn.Embedding(self.num_query, self.embed_dims) - def forward_train(self, x, img_metas, gt_bboxes, gt_labels=None, gt_bboxes_ignore=None, proposal_cfg=None): + def forward_train( + self, + x: Tuple[Tensor], + img_metas: List[Dict[str, Any]], + gt_bboxes: List[Tensor], + gt_labels: Optional[List[Tensor]] = None, + gt_bboxes_ignore: Optional[List[Tensor]] = None, + proposal_cfg: Optional[Config] = None, + ): """Forward function for training mode. Origin impelmentation: forward_train function of detr_head.py in mmdet2.x @@ -59,11 +67,11 @@ def forward_train(self, x, img_metas, gt_bboxes, gt_labels=None, gt_bboxes_ignor x (list[Tensor]): Features from backbone. img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. - gt_bboxes (Tensor): Ground truth bboxes of the image, + gt_bboxes (List[Tensor]): Ground truth bboxes of the image, shape (num_gts, 4). - gt_labels (Tensor): Ground truth labels of each box, + gt_labels (List[Tensor]): Ground truth labels of each box, shape (num_gts,). - gt_bboxes_ignore (Tensor): Ground truth bboxes to be + gt_bboxes_ignore (List[Tensor]): Ground truth bboxes to be ignored, shape (num_ignored_gts, 4). proposal_cfg (mmcv.Config): Test / postprocessing configuration, if None, test_cfg would be used. 
@@ -81,7 +89,13 @@ def forward_train(self, x, img_metas, gt_bboxes, gt_labels=None, gt_bboxes_ignor losses = self.loss(*loss_inputs) return losses - def forward_transformer(self, mlvl_feats, gt_bboxes, gt_labels, img_metas): + def forward_transformer( + self, + mlvl_feats: Tuple[Tensor], + gt_bboxes: Optional[List[Tensor]], + gt_labels: Optional[List[Tensor]], + img_metas: List[Dict[str, Any]], + ): """Transformers's forward function. Origin implementation: forward function of deformable_detr_head.py in mmdet2.x @@ -205,7 +219,7 @@ def loss( losses = self.loss_by_feat_two_stage(*loss_inputs) return losses - def forward(self, hidden_states, references): + def forward(self, hidden_states: Tensor, references: List[Tensor]): """Forward function. Original implementation: forward function of deformable_detr_head.py in mmdet3.x @@ -661,7 +675,7 @@ def split_outputs( all_layers_denoising_bbox_preds, ) - def simple_test_bboxes(self, feats, img_metas, rescale=False): + def simple_test_bboxes(self, feats: Tuple[Tensor], img_metas: List[Dict[str, Any]], rescale=False): """Test det bboxes without test-time augmentation. Original implementation: simple_test_bboxes funciton of detr_head.py in mmdet2.x diff --git a/tests/unit/algorithms/detection/adapters/mmdet/models/detectors/conftest.py b/tests/unit/algorithms/detection/adapters/mmdet/models/detectors/conftest.py index 52b50f2722d..4ac44156cba 100644 --- a/tests/unit/algorithms/detection/adapters/mmdet/models/detectors/conftest.py +++ b/tests/unit/algorithms/detection/adapters/mmdet/models/detectors/conftest.py @@ -341,7 +341,7 @@ def fxt_cfg_custom_deformable_detr(num_classes: int = 3): bbox_head=dict( type="DeformableDETRHead", num_query=300, - num_classes=80, + num_classes=num_classes, in_channels=2048, sync_cls_avg_factor=True, with_box_refine=True, @@ -395,3 +395,93 @@ def fxt_cfg_custom_deformable_detr(num_classes: int = 3): dst_classes=["tree", "car", "person"], ), ) + + +@pytest.fixture +def fxt_cfg_custom_dino(num_classes: int = 3): + return ConfigDict( + type="CustomDINO", + backbone=dict( + type="ResNet", + depth=50, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type="BN", requires_grad=False), + norm_eval=True, + style="pytorch", + init_cfg=dict(type="Pretrained", checkpoint="torchvision://resnet50"), + ), + neck=dict( + type="ChannelMapper", + in_channels=[512, 1024, 2048], + kernel_size=1, + out_channels=256, + act_cfg=None, + norm_cfg=dict(type="GN", num_groups=32), + num_outs=4, + ), + bbox_head=dict( + type="CustomDINOHead", + num_query=900, + num_classes=num_classes, + in_channels=2048, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=True, + transformer=dict( + type="CustomDINOTransformer", + encoder=dict( + type="DetrTransformerEncoder", + num_layers=6, + transformerlayers=dict( + type="BaseTransformerLayer", + attn_cfgs=dict(type="MultiScaleDeformableAttention", embed_dims=256, dropout=0.0), + feedforward_channels=2048, + ffn_dropout=0.0, + operation_order=("self_attn", "norm", "ffn", "norm"), + ), + ), + decoder=dict( + type="DINOTransformerDecoder", + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type="DetrTransformerDecoderLayer", + attn_cfgs=[ + dict(type="MultiheadAttention", embed_dims=256, num_heads=8, dropout=0.0), + dict(type="MultiScaleDeformableAttention", embed_dims=256, dropout=0.0), + ], + feedforward_channels=2048, + ffn_dropout=0.0, + operation_order=("self_attn", "norm", "cross_attn", "norm", "ffn", "norm"), + ), + ), + ), + 
positional_encoding=dict(
+            type="SinePositionalEncoding", num_feats=128, normalize=True, offset=0.0, temperature=20
+        ),
+        loss_cls=dict(type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0),
+        loss_bbox=dict(type="L1Loss", loss_weight=5.0),
+        loss_iou=dict(type="GIoULoss", loss_weight=2.0),
+        dn_cfg=dict(
+            label_noise_scale=0.5,
+            box_noise_scale=1.0,  # 0.4 for DN-DETR
+            group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=100),
+        ),
+    ),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type="HungarianAssigner",
+            cls_cost=dict(type="FocalLossCost", weight=1.0),
+            reg_cost=dict(type="BBoxL1Cost", weight=5.0, box_format="xywh"),
+            iou_cost=dict(type="IoUCost", iou_mode="giou", weight=2.0),
+        )
+    ),
+    test_cfg=dict(max_per_img=300),
+    task_adapt=dict(
+        src_classes=["person", "car"],
+        dst_classes=["tree", "car", "person"],
+    ),
+)
diff --git a/tests/unit/algorithms/detection/adapters/mmdet/models/detectors/test_custom_dino_detector.py b/tests/unit/algorithms/detection/adapters/mmdet/models/detectors/test_custom_dino_detector.py
new file mode 100644
index 00000000000..22b89435f09
--- /dev/null
+++ b/tests/unit/algorithms/detection/adapters/mmdet/models/detectors/test_custom_dino_detector.py
@@ -0,0 +1,52 @@
+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+from typing import Dict
+import torch
+from mmdet.models.builder import build_detector
+
+from otx.algorithms.detection.adapters.mmdet.models.detectors.custom_dino_detector import (
+    CustomDINO,
+)
+from tests.test_suite.e2e_test_system import e2e_pytest_unit
+
+
+class TestCustomDINO:
+    @e2e_pytest_unit
+    def test_custom_dino_build(self, fxt_cfg_custom_dino: Dict):
+        model = build_detector(fxt_cfg_custom_dino)
+        assert isinstance(model, CustomDINO)
+
+    @e2e_pytest_unit
+    def test_custom_dino_load_state_pre_hook(self, fxt_cfg_custom_dino: Dict):
+        model = build_detector(fxt_cfg_custom_dino)
+        ckpt_dict = {
+            "level_embed": "level_embed",
+            "encoder.self_attn": "encoder.self_attn",
+            "encoder.cross_attn": "encoder.cross_attn",
+            "encoder.ffn": "encoder.ffn",
+            "decoder.self_attn": "decoder.self_attn",
+            "decoder.cross_attn": "decoder.cross_attn",
+            "decoder.ffn": "decoder.ffn",
+            "query_embedding.weight": "query_embedding.weight",
+            "dn_query_generator.label_embedding.weight": "dn_query_generator.label_embedding.weight",
+            "memory_trans_fc": "memory_trans_fc",
+            "memory_trans_norm": "memory_trans_norm",
+        }
+        model.load_state_dict_pre_hook(model, ckpt_dict)
+
+        assert ckpt_dict["bbox_head.transformer.level_embeds"] == "level_embed"
+        assert ckpt_dict["bbox_head.transformer.encoder.attentions.0"] == "encoder.self_attn"
+        assert ckpt_dict["bbox_head.transformer.encoder.attentions.1"] == "encoder.cross_attn"
+        assert ckpt_dict["bbox_head.transformer.encoder.ffns.0"] == "encoder.ffn"
+        assert ckpt_dict["bbox_head.transformer.decoder.attentions.0"] == "decoder.self_attn"
+        assert ckpt_dict["bbox_head.transformer.decoder.attentions.1"] == "decoder.cross_attn"
+        assert ckpt_dict["bbox_head.transformer.decoder.ffns.0"] == "decoder.ffn"
+        assert ckpt_dict["bbox_head.query_embedding.weight"] == "query_embedding.weight"
+        assert (
+            ckpt_dict["bbox_head.transformer.dn_query_generator.label_embedding.weight"]
+            == "dn_query_generator.label_embedding.weight"
+        )
+        assert ckpt_dict["bbox_head.transformer.enc_output"] == "memory_trans_fc"
+        assert ckpt_dict["bbox_head.transformer.enc_output_norm"] == "memory_trans_norm"
diff --git
a/tests/unit/algorithms/detection/adapters/mmdet/models/heads/__init__.py b/tests/unit/algorithms/detection/adapters/mmdet/models/heads/__init__.py new file mode 100644 index 00000000000..9c68be83ef0 --- /dev/null +++ b/tests/unit/algorithms/detection/adapters/mmdet/models/heads/__init__.py @@ -0,0 +1,3 @@ +# Copyright (C) 2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# diff --git a/tests/unit/algorithms/detection/adapters/mmdet/models/heads/test_custom_dino_head.py b/tests/unit/algorithms/detection/adapters/mmdet/models/heads/test_custom_dino_head.py new file mode 100644 index 00000000000..fec8abac448 --- /dev/null +++ b/tests/unit/algorithms/detection/adapters/mmdet/models/heads/test_custom_dino_head.py @@ -0,0 +1,213 @@ +# Copyright (C) 2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +import numpy as np +import pytest +import torch +from mmcv.utils import ConfigDict +from mmdet.core import build_assigner +from mmdet.models.builder import build_detector + +from otx.algorithms.detection.adapters.mmdet.models.heads.custom_dino_head import ( + CustomDINOHead, +) +from tests.test_suite.e2e_test_system import e2e_pytest_unit + + +class TestCustomDINOHead: + @pytest.fixture(autouse=True) + def setup(self): + cfg = ConfigDict( + dict( + type="CustomDINOHead", + num_query=900, + num_classes=80, + in_channels=2048, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=True, + transformer=dict( + type="CustomDINOTransformer", + encoder=dict( + type="DetrTransformerEncoder", + num_layers=6, + transformerlayers=dict( + type="BaseTransformerLayer", + attn_cfgs=dict(type="MultiScaleDeformableAttention", embed_dims=256, dropout=0.0), + feedforward_channels=2048, + ffn_dropout=0.0, + operation_order=("self_attn", "norm", "ffn", "norm"), + ), + ), + decoder=dict( + type="DINOTransformerDecoder", + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type="DetrTransformerDecoderLayer", + attn_cfgs=[ + dict(type="MultiheadAttention", embed_dims=256, num_heads=8, dropout=0.0), + dict(type="MultiScaleDeformableAttention", embed_dims=256, dropout=0.0), + ], + feedforward_channels=2048, + ffn_dropout=0.0, + operation_order=("self_attn", "norm", "cross_attn", "norm", "ffn", "norm"), + ), + ), + ), + positional_encoding=dict( + type="SinePositionalEncoding", num_feats=128, normalize=True, offset=0.0, temperature=20 + ), + loss_cls=dict(type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), + loss_bbox=dict(type="L1Loss", loss_weight=5.0), + loss_iou=dict(type="GIoULoss", loss_weight=2.0), + dn_cfg=dict( + label_noise_scale=0.5, + box_noise_scale=1.0, # 0.4 for DN-DETR + group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=100), + ), + ), + ) + self.bbox_head = build_detector(cfg) + + assigner_cfg = ConfigDict( + type="HungarianAssigner", + cls_cost=dict(type="FocalLossCost", weight=1.0), + reg_cost=dict(type="BBoxL1Cost", weight=5.0, box_format="xywh"), + iou_cost=dict(type="IoUCost", iou_mode="giou", weight=2.0), + ) + self.bbox_head.assigner = build_assigner(assigner_cfg) + + test_cfg = dict(max_per_img=300) + self.bbox_head.test_cfg = test_cfg + + @e2e_pytest_unit + def test_forward_train(self): + inputs = [ + torch.randn([2, 256, 92, 95]), + torch.randn([2, 256, 46, 48]), + torch.randn([2, 256, 23, 24]), + torch.randn([2, 256, 12, 12]), + ] + gt_bboxes = [ + torch.Tensor( + [ + [432.2500, 514.2661, 632.6323, 638.8889], + [361.2484, 294.9931, 558.4751, 466.9410], + [616.8542, 201.9204, 752.5462, 328.1207], + 
[591.6091, 386.4883, 733.6124, 571.0562], + [728.8790, 255.5556, 760.0000, 408.5734], + [713.1008, 397.5309, 760.0000, 541.0837], + [246.0680, 354.9383, 427.5165, 498.4911], + [113.5316, 361.2483, 309.1805, 517.4211], + [457.4950, 654.6639, 646.8326, 736.0000], + [132.4654, 631.0014, 187.6889, 684.6365], + [217.6673, 694.1015, 298.1358, 736.0000], + [0.0000, 583.6763, 56.7303, 672.0164], + [86.7088, 675.1714, 168.7551, 736.0000], + [173.4885, 93.0727, 253.9570, 151.4403], + [738.3458, 119.8903, 760.0000, 164.0603], + [683.1224, 522.1536, 760.0000, 736.0000], + ] + ), + torch.Tensor( + [ + [442.0, 279.0, 544.0, 377.0], + [386.0, 1.0, 497.0, 108.0], + [288.0, 1.0, 399.0, 84.0], + [154.0, 1.0, 268.0, 77.0], + [530.0, 163.0, 625.0, 248.0], + [179.0, 298.0, 278.0, 398.0], + [275.0, 320.0, 374.0, 420.0], + [525.0, 394.0, 613.0, 480.0], + [332.0, 160.0, 463.0, 286.0], + [210.0, 395.0, 308.0, 480.0], + [141.0, 395.0, 239.0, 480.0], + [106.0, 225.0, 204.0, 310.0], + [12.0, 1.0, 148.0, 70.0], + [165.0, 79.0, 396.0, 247.0], + [483.0, 13.0, 518.0, 52.0], + ], + ), + ] + gt_labels = [ + torch.Tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 2]).long(), + torch.Tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0]).long(), + ] + img_metas = [ + { + "flip_direction": "horizontal", + "img_shape": (736, 760, 3), + "ori_shape": (480, 640, 3), + "img_norm_cfg": { + "mean": np.array([123.675, 116.28, 103.53], dtype=np.float32), + "std": np.array([58.395, 57.12, 57.375], dtype=np.float32), + "to_rgb": False, + }, + "scale_factor": np.array([1.5139443, 1.5144033, 1.5139443, 1.5144033], dtype=np.float32), + "flip": True, + "pad_shape": (736, 760, 3), + "batch_input_shape": (736, 760), + }, + { + "flip_direction": "horizontal", + "img_shape": (480, 640, 3), + "ori_shape": (480, 640, 3), + "img_norm_cfg": { + "mean": np.array([123.675, 116.28, 103.53], dtype=np.float32), + "std": np.array([58.395, 57.12, 57.375], dtype=np.float32), + "to_rgb": False, + }, + "scale_factor": np.array([1.0, 1.0, 1.0, 1.0], dtype=np.float32), + "flip": True, + "pad_shape": (480, 640, 3), + "batch_input_shape": (736, 760), + }, + ] + losses = self.bbox_head.forward_train(inputs, img_metas, gt_bboxes, gt_labels) + assert len(losses) == 39 + + @e2e_pytest_unit + def test_simple_test_bboxes(self): + feats = [ + torch.randn([2, 256, 100, 134]), + torch.randn([2, 256, 50, 67]), + torch.randn([2, 256, 25, 34]), + torch.randn([2, 256, 13, 17]), + ] + img_metas = [ + { + "ori_shape": (480, 640, 3), + "img_shape": (800, 1067, 3), + "pad_shape": (800, 1067, 3), + "scale_factor": np.array([1.6671875, 1.6666666, 1.6671875, 1.6666666], dtype=np.float32), + "flip": False, + "flip_direction": None, + "img_norm_cfg": { + "mean": np.array([123.675, 116.28, 103.53], dtype=np.float32), + "std": np.array([58.395, 57.12, 57.375], dtype=np.float32), + "to_rgb": False, + }, + "batch_input_shape": (800, 1067), + }, + { + "ori_shape": (480, 640, 3), + "img_shape": (800, 1067, 3), + "pad_shape": (800, 1067, 3), + "scale_factor": np.array([1.6671875, 1.6666666, 1.6671875, 1.6666666], dtype=np.float32), + "flip": False, + "flip_direction": None, + "img_norm_cfg": { + "mean": np.array([123.675, 116.28, 103.53], dtype=np.float32), + "std": np.array([58.395, 57.12, 57.375], dtype=np.float32), + "to_rgb": False, + }, + "batch_input_shape": (800, 1067), + }, + ] + self.bbox_head.eval() + results = self.bbox_head.simple_test_bboxes(feats, img_metas) + assert len(results) == 2 + assert results[0][0].shape == torch.Size([300, 5]) + assert results[0][1].shape == 
torch.Size([300]) From e04378314259561878b4f51af7253ff4829ee944 Mon Sep 17 00:00:00 2001 From: jaegukhyun Date: Fri, 23 Jun 2023 16:30:49 +0900 Subject: [PATCH 05/11] Add intg test --- tests/integration/cli/detection/test_detection.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tests/integration/cli/detection/test_detection.py b/tests/integration/cli/detection/test_detection.py index 18a2d65f230..d36948ba13a 100644 --- a/tests/integration/cli/detection/test_detection.py +++ b/tests/integration/cli/detection/test_detection.py @@ -68,13 +68,16 @@ templates = Registry("otx/algorithms/detection").filter(task_type="DETECTION").templates templates_ids = [template.model_template_id for template in templates] -experimental_template = parse_model_template( - "otx/algorithms/detection/configs/detection/resnet50_deformable-detr/template_experimental.yaml" -) -experimental_template_id = experimental_template.model_template_id +experimental_templates = [ + parse_model_template( + "otx/algorithms/detection/configs/detection/resnet50_deformable-detr/template_experimental.yaml" + ), + parse_model_template("otx/algorithms/detection/configs/detection/resnet50_dino/template_experimental.yaml"), +] +experimental_template_ids = [template.model_template_id for template in experimental_templates] -templates_w_experimental = templates + [experimental_template] -templates_ids_w_experimental = templates_ids + [experimental_template_id] +templates_w_experimental = templates + experimental_templates +templates_ids_w_experimental = templates_ids + experimental_template_ids class TestDetectionCLI: From 406a485ba2951c41fd749e08cdfb4f1e4f725df9 Mon Sep 17 00:00:00 2001 From: jaegukhyun Date: Fri, 23 Jun 2023 16:37:48 +0900 Subject: [PATCH 06/11] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d730cb9c7c7..0d71e175803 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ All notable changes to this project will be documented in this file. 
- Add custom max iou assigner to prevent CPU OOM when large annotations are used ()
 - Auto train type detection for Semi-SL, Self-SL and Incremental: "--train-type" is now optional (https://github.com/openvinotoolkit/training_extensions/pull/2195)
 - Add new object detector Deformable DETR ()
+- Add new object detector DINO ()
 
 ### Enhancements
 
From 36ccfb1ee9d4b9d016ca81421293186391074538 Mon Sep 17 00:00:00 2001
From: jaegukhyun
Date: Fri, 23 Jun 2023 19:12:46 +0900
Subject: [PATCH 07/11] Change description of config files for DINO

---
 .../detection/configs/detection/resnet50_dino/data_pipeline.py | 2 +-
 .../detection/configs/detection/resnet50_dino/deployment.py    | 2 +-
 .../detection/configs/detection/resnet50_dino/model.py         | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/otx/algorithms/detection/configs/detection/resnet50_dino/data_pipeline.py b/otx/algorithms/detection/configs/detection/resnet50_dino/data_pipeline.py
index 4b577e21eb8..9610b3fd514 100644
--- a/otx/algorithms/detection/configs/detection/resnet50_dino/data_pipeline.py
+++ b/otx/algorithms/detection/configs/detection/resnet50_dino/data_pipeline.py
@@ -1,4 +1,4 @@
-"""Data pipeline for Deformable DETR."""
+"""Data pipeline for DINO."""
 # dataset settings
 dataset_type = "CocoDataset"
 data_root = "data/coco/"
diff --git a/otx/algorithms/detection/configs/detection/resnet50_dino/deployment.py b/otx/algorithms/detection/configs/detection/resnet50_dino/deployment.py
index 76b4a6544f5..6e7d1fba3ed 100644
--- a/otx/algorithms/detection/configs/detection/resnet50_dino/deployment.py
+++ b/otx/algorithms/detection/configs/detection/resnet50_dino/deployment.py
@@ -1,4 +1,4 @@
-"""MMDeploy config of Deformable DETR model for Detection Task."""
+"""MMDeploy config of DINO model for Detection Task."""
 
 _base_ = ["../../base/deployments/base_detection_dynamic.py"]
 
diff --git a/otx/algorithms/detection/configs/detection/resnet50_dino/model.py b/otx/algorithms/detection/configs/detection/resnet50_dino/model.py
index a9cdf215901..a929fb5dc3d 100644
--- a/otx/algorithms/detection/configs/detection/resnet50_dino/model.py
+++ b/otx/algorithms/detection/configs/detection/resnet50_dino/model.py
@@ -1,4 +1,4 @@
-"""Model config for Deformable DETR."""
+"""Model config for DINO."""
 model = dict(
     type="CustomDINO",
     backbone=dict(
From 909727ef51759eb1b3b5c608ea853a033aedccf0 Mon Sep 17 00:00:00 2001
From: jaegukhyun
Date: Mon, 26 Jun 2023 12:34:20 +0900
Subject: [PATCH 08/11] Modify unit tests

---
 .../mmdet/models/heads/test_custom_dino_head.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/unit/algorithms/detection/adapters/mmdet/models/heads/test_custom_dino_head.py b/tests/unit/algorithms/detection/adapters/mmdet/models/heads/test_custom_dino_head.py
index fec8abac448..4b54c6ea05d 100644
--- a/tests/unit/algorithms/detection/adapters/mmdet/models/heads/test_custom_dino_head.py
+++ b/tests/unit/algorithms/detection/adapters/mmdet/models/heads/test_custom_dino_head.py
@@ -85,10 +85,10 @@ def setup(self):
     @e2e_pytest_unit
     def test_forward_train(self):
         inputs = [
-            torch.randn([2, 256, 92, 95]),
-            torch.randn([2, 256, 46, 48]),
-            torch.randn([2, 256, 23, 24]),
-            torch.randn([2, 256, 12, 12]),
+            torch.zeros([2, 256, 92, 95]),
+            torch.zeros([2, 256, 46, 48]),
+            torch.zeros([2, 256, 23, 24]),
+            torch.zeros([2, 256, 12, 12]),
         ]
         gt_bboxes = [
             torch.Tensor(
@@ -171,10 +171,10 @@ def test_forward_train(self):
     @e2e_pytest_unit
     def test_simple_test_bboxes(self):
         feats = [
-            torch.randn([2, 256, 100, 134]),
-            torch.randn([2, 256, 50, 67]),
-            torch.randn([2, 256, 25, 34]),
-            torch.randn([2, 256, 13, 17]),
+            torch.zeros([2, 256, 100, 134]),
+            torch.zeros([2, 256, 50, 67]),
+            torch.zeros([2, 256, 25, 34]),
+            torch.zeros([2, 256, 13, 17]),
         ]
         img_metas = [
             {
From 42d7e15f0afbd107247261ccf8615dbc3e119c6f Mon Sep 17 00:00:00 2001
From: jaegukhyun
Date: Tue, 27 Jun 2023 14:43:36 +0900
Subject: [PATCH 09/11] Reflect reviews

---
 .../mmdet/models/detectors/custom_dino_detector.py   | 11 +++--------
 .../detection/adapters/mmdet/models/layers/dino.py   |  6 +++---
 .../configs/detection/resnet50_dino/data_pipeline.py |  1 -
 3 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_dino_detector.py b/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_dino_detector.py
index e84bb25eeec..eeed11d99d1 100644
--- a/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_dino_detector.py
+++ b/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_dino_detector.py
@@ -4,8 +4,6 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 
-import functools
-
 from mmdet.models.builder import DETECTORS
 
 from otx.algorithms.common.adapters.mmcv.hooks.recording_forward_hook import (
@@ -26,19 +24,16 @@ class CustomDINO(CustomDeformableDETR):
     def __init__(self, *args, task_adapt=None, **kwargs):
         super().__init__(*args, task_adapt=task_adapt, **kwargs)
         self._register_load_state_dict_pre_hook(
-            functools.partial(
-                self.load_state_dict_pre_hook,
-                self,
-            )
+            self.load_state_dict_pre_hook,
         )
 
     @staticmethod
-    def load_state_dict_pre_hook(model, ckpt_dict, *args, **kwargs):
+    def load_state_dict_pre_hook(ckpt_dict, *args, **kwargs):
         """Modify mmdet3.x version's weights before weight loading."""
 
         if list(ckpt_dict.keys())[0] == "level_embed":
             logger.info("----------------- CustomDINO.load_state_dict_pre_hook() called")
-            # This ckpt_dict is come from mmdet3.x
+            # This ckpt_dict comes from mmdet3.x
             ckpt_dict["bbox_head.transformer.level_embeds"] = ckpt_dict.pop("level_embed")
             replaced_params = {}
             for param in ckpt_dict:
diff --git a/otx/algorithms/detection/adapters/mmdet/models/layers/dino.py b/otx/algorithms/detection/adapters/mmdet/models/layers/dino.py
index cab3f23183d..573417cfabf 100644
--- a/otx/algorithms/detection/adapters/mmdet/models/layers/dino.py
+++ b/otx/algorithms/detection/adapters/mmdet/models/layers/dino.py
@@ -17,7 +17,7 @@ class CustomDINOTransformer(DeformableDetrTransformer):
 
     Original implementation: mmdet.models.utils.transformer.DeformableDETR in mmdet2.x
     What's changed: The forward function is modified.
-    Modified implementations are come from mmdet.models.detectors.dino.DINO in mmdet3.x
+    Modified implementations come from mmdet.models.detectors.dino.DINO in mmdet3.x
     """
 
     def init_layers(self):
@@ -48,13 +48,13 @@ def forward(
         pre_transformer() -> forward_encoder() -> pre_decoder() -> forward_decoder().
         In comparison, mmdet2.x forward function takes charge of all functions above.
         The differences between Deformable DETR and DINO occur in pre_decoder() and forward_decoder().
-        Therefore this function modified those parts. Modified implementations are come from
+        Therefore this function modifies those parts. Modified implementations come from
         pre_decoder(), and forward_decoder() of mmdet.models.detectors.dino.DINO in mmdet3.x.
 
         Args:
             batch_info(list(dict(str, union(tuple, tensor)))):
-                Information about batch such as image shaep,
+                Information about batch such as image shape,
                 gt information. 
mlvl_feats (list(Tensor)): Input queries from
                 different levels. Each element has shape
diff --git a/otx/algorithms/detection/configs/detection/resnet50_dino/data_pipeline.py b/otx/algorithms/detection/configs/detection/resnet50_dino/data_pipeline.py
index 9610b3fd514..9f7b3f1d404 100644
--- a/otx/algorithms/detection/configs/detection/resnet50_dino/data_pipeline.py
+++ b/otx/algorithms/detection/configs/detection/resnet50_dino/data_pipeline.py
@@ -112,4 +112,3 @@
         pipeline=test_pipeline,
     ),
 )
-evaluation = dict(interval=1, metric="bbox")
From 53359e939e48e3d4eac248cbefcd51abf2b5ef37 Mon Sep 17 00:00:00 2001
From: jaegukhyun
Date: Tue, 27 Jun 2023 14:48:39 +0900
Subject: [PATCH 10/11] Reflect reviews

---
 .../detection/resnet50_dino/data_pipeline.py | 41 +++++++------------
 1 file changed, 15 insertions(+), 26 deletions(-)

diff --git a/otx/algorithms/detection/configs/detection/resnet50_dino/data_pipeline.py b/otx/algorithms/detection/configs/detection/resnet50_dino/data_pipeline.py
index 9f7b3f1d404..19365de72a7 100644
--- a/otx/algorithms/detection/configs/detection/resnet50_dino/data_pipeline.py
+++ b/otx/algorithms/detection/configs/detection/resnet50_dino/data_pipeline.py
@@ -3,6 +3,19 @@
 dataset_type = "CocoDataset"
 data_root = "data/coco/"
 img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+__img_scale = [
+    (480, 1333),
+    (512, 1333),
+    (544, 1333),
+    (576, 1333),
+    (608, 1333),
+    (640, 1333),
+    (672, 1333),
+    (704, 1333),
+    (736, 1333),
+    (768, 1333),
+    (800, 1333),
+]
 
 # train_pipeline, NOTE the img_scale and the Pad's size_divisor are different
 # from the default setting in mmdet.
@@ -16,19 +29,7 @@
         [
             dict(
                 type="Resize",
-                img_scale=[
-                    (480, 1333),
-                    (512, 1333),
-                    (544, 1333),
-                    (576, 1333),
-                    (608, 1333),
-                    (640, 1333),
-                    (672, 1333),
-                    (704, 1333),
-                    (736, 1333),
-                    (768, 1333),
-                    (800, 1333),
-                ],
+                img_scale=__img_scale,
                 multiscale_mode="value",
                 keep_ratio=True,
             )
@@ -45,19 +46,7 @@
             dict(type="RandomCrop", crop_type="absolute_range", crop_size=(384, 600), allow_negative_crop=True),
             dict(
                 type="Resize",
-                img_scale=[
-                    (480, 1333),
-                    (512, 1333),
-                    (544, 1333),
-                    (576, 1333),
-                    (608, 1333),
-                    (640, 1333),
-                    (672, 1333),
-                    (704, 1333),
-                    (736, 1333),
-                    (768, 1333),
-                    (800, 1333),
-                ],
+                img_scale=__img_scale,
                 multiscale_mode="value",
                 override=True,
                 keep_ratio=True,
From d893e1a25a825815b2683a6a4cb89e0fc8da88db Mon Sep 17 00:00:00 2001
From: jaegukhyun
Date: Tue, 27 Jun 2023 15:34:52 +0900
Subject: [PATCH 11/11] Update unit tests

---
 .../mmdet/models/detectors/test_custom_dino_detector.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/algorithms/detection/adapters/mmdet/models/detectors/test_custom_dino_detector.py b/tests/unit/algorithms/detection/adapters/mmdet/models/detectors/test_custom_dino_detector.py
index 22b89435f09..2bd02b25505 100644
--- a/tests/unit/algorithms/detection/adapters/mmdet/models/detectors/test_custom_dino_detector.py
+++ b/tests/unit/algorithms/detection/adapters/mmdet/models/detectors/test_custom_dino_detector.py
@@ -34,7 +34,7 @@ def test_custom_dino_load_state_pre_hook(self, fxt_cfg_custom_dino: Dict):
             "memory_trans_fc": "memory_trans_fc",
             "memory_trans_norm": "memory_trans_norm",
         }
-        model.load_state_dict_pre_hook(model, ckpt_dict)
+        model.load_state_dict_pre_hook(ckpt_dict)
 
         assert ckpt_dict["bbox_head.transformer.level_embeds"] == "level_embed"
         assert ckpt_dict["bbox_head.transformer.encoder.attentions.0"] == "encoder.self_attn"
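
Usage note for the series: the key translation above means an mmdet 3.x DINO checkpoint can be restored directly into CustomDINO, with the pre-hook rewriting parameter names inside load_state_dict. Below is a minimal, illustrative sketch, not part of the patches; the checkpoint file name is hypothetical, and strict=False is used because 3.x auxiliary keys may have no counterpart after renaming.

import torch
from mmcv.utils import Config
from mmdet.models.builder import build_detector

# Build CustomDINO from the model config added in this patch series.
cfg = Config.fromfile("otx/algorithms/detection/configs/detection/resnet50_dino/model.py")
model = build_detector(cfg.model)

# Hypothetical mmdet 3.x checkpoint. The pre-hook keys on its first
# state-dict entry being "level_embed" before rewriting names such as
# "encoder...self_attn..." into "bbox_head.transformer.encoder...attentions.0...".
ckpt = torch.load("dino_r50_mmdet3x.pth", map_location="cpu")
state_dict = ckpt.get("state_dict", ckpt)

# The pre-hook registered in CustomDINO.__init__ runs inside load_state_dict,
# so no manual key translation is needed here.
model.load_state_dict(state_dict, strict=False)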