[OTE][develop] Add tiling in rotated detection #1420

Merged: 10 commits, Dec 20, 2022
102 changes: 23 additions & 79 deletions external/mmdetection/detection_tasks/apis/detection/openvino_task.py
@@ -76,52 +76,16 @@
from ote_sdk.utils import Tiler
from ote_sdk.utils.detection_utils import detection2array
from ote_sdk.utils.vis_utils import get_actmap
from typing import Any, Dict, Optional, Tuple, Union, List
from typing import Any, Dict, Optional, Tuple, Union
from zipfile import ZipFile

from mmdet.utils.logger import get_root_logger
from .configuration import OTEDetectionConfig
from . import model_wrappers

from mmcv.ops import nms

logger = get_root_logger()


def multiclass_nms(
scores: np.ndarray,
labels: np.ndarray,
boxes: np.ndarray,
iou_threshold=0.45,
max_num=200,
):
""" Multi-class NMS

strategy: in order to perform NMS independently per class,
we add an offset to all the boxes. The offset is dependent
only on the class idx, and is large enough so that boxes
from different classes do not overlap

Args:
scores (np.ndarray): box scores
labels (np.ndarray): box label indices
boxes (np.ndarray): box coordinates
iou_threshold (float, optional): IoU threshold. Defaults to 0.45.
max_num (int, optional): Max number of objects filter. Defaults to 200.

Returns:
tuple: kept detections (dets, boxes with scores) and their indices (keep)
"""
max_coordinate = boxes.max()
offsets = labels.astype(boxes.dtype) * (max_coordinate + 1)
boxes_for_nms = boxes + offsets[:, None]
dets, keep = nms(boxes_for_nms, scores, iou_threshold)
if max_num > 0:
dets = dets[:max_num]
keep = keep[:max_num]
return dets, keep


class BaseInferencerWithConverter(BaseInferencer):
@check_input_parameters_type()
def __init__(
@@ -174,6 +138,28 @@ def predict(
def forward(self, inputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
return self.model.infer_sync(inputs)

@check_input_parameters_type()
def predict_tile(
self, image: np.ndarray, tile_size: int, overlap: float, max_number: int
) -> Tuple[AnnotationSceneEntity, np.ndarray, np.ndarray]:
""" Run prediction by tiling image to small patches

Args:
image (np.ndarray): input image
tile_size (int): tile crop size
overlap (float): overlap ratio between tiles
max_number (int): max number of predicted objects allowed

Returns:
detections (AnnotationSceneEntity): converted detection annotations
features: list containing the saliency map and feature vector
"""
segm = isinstance(self.converter, (MaskToAnnotationConverter, RotatedRectToAnnotationConverter))
tiler = Tiler(tile_size=tile_size, overlap=overlap, max_number=max_number, model=self.model, segm=segm)
detections, features = tiler.predict(image)
detections = self.converter.convert_to_annotation(detections, metadata={"original_shape": image.shape})
return detections, features


class OpenVINODetectionInferencer(BaseInferencerWithConverter):
@check_input_parameters_type()
@@ -227,27 +213,6 @@ def post_process(
detections = detection2array(detections)
return self.converter.convert_to_annotation(detections, metadata)

@check_input_parameters_type()
def predict_tile(
self, image: np.ndarray, tile_size: int, overlap: float, max_number: int
) -> Tuple[AnnotationSceneEntity, np.ndarray, np.ndarray]:
""" Run prediction by tiling image to small patches

Args:
image (np.ndarray): input image
tile_size (int): tile crop size
overlap (float): overlap ratio between tiles
max_number (int): max number of predicted objects allowed

Returns:
detections: AnnotationSceneEntity
features: list including saliency map and feature vector
"""
tiler = Tiler(tile_size=tile_size, overlap=overlap, max_number=max_number, model=self.model)
detections, features = tiler.predict(image)
detections = self.converter.convert_to_annotation(detections, metadata={"original_shape": image.shape})
return detections, features


class OpenVINOMaskInferencer(BaseInferencerWithConverter):
@check_input_parameters_type()
@@ -284,27 +249,6 @@ def __init__(

super().__init__(configuration, model, converter)

@check_input_parameters_type()
def predict_tile(
self, image: np.ndarray, tile_size: int, overlap: float, max_number: int
) -> Tuple[AnnotationSceneEntity, np.ndarray, np.ndarray]:
""" Run prediction by tiling image to small patches

Args:
image (np.ndarray): input image
tile_size (int): tile crop size
overlap (float): overlap ratio between tiles
max_number (int): max number of predicted objects allowed

Returns:
detections: AnnotationSceneEntity
features: list including saliency map and feature vector
"""
tiler = Tiler(tile_size=tile_size, overlap=overlap, max_number=max_number, model=self.model, segm=True)
detections, features = tiler.predict(image)
detections = self.converter.convert_to_annotation(detections, metadata={"original_shape": image.shape})
return detections, features


class OpenVINORotatedRectInferencer(BaseInferencerWithConverter):
@check_input_parameters_type()
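For reference, a minimal usage sketch of the consolidated predict_tile (an editor's illustration, not part of this diff; the constructor arguments are placeholders):

    inferencer = OpenVINORotatedRectInferencer(configuration, model, converter)  # placeholder setup
    detections, features = inferencer.predict_tile(image, tile_size=400, overlap=0.2, max_number=1500)
    # detections is an AnnotationSceneEntity; features bundles the saliency map and feature vector

Because the segm flag is derived from the converter type, the mask and rotated-rect inferencers get mask-aware tiling from the base class without overriding the method.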
@@ -1,7 +1,7 @@
dataset_type = "CocoDataset"
img_size = (1024, 1024)

img_norm_cfg = dict(mean=(103.53, 116.28, 123.675), std=(1.0, 1.0, 1.0), to_rgb=False)
img_norm_cfg = dict(mean=(103.53, 116.28, 123.675), std=(1.0, 1.0, 1.0), to_rgb=True)

train_pipeline = [
dict(type="LoadImageFromFile"),
@@ -6,7 +6,7 @@
tile_size=400, min_area_ratio=0.9, overlap_ratio=0.2, iou_threshold=0.45, max_per_img=1500, filter_empty_gt=True
)

img_norm_cfg = dict(mean=(103.53, 116.28, 123.675), std=(1.0, 1.0, 1.0), to_rgb=False)
img_norm_cfg = dict(mean=(103.53, 116.28, 123.675), std=(1.0, 1.0, 1.0), to_rgb=True)

train_pipeline = [
dict(type="Resize", img_scale=img_size, keep_ratio=False),
@@ -376,3 +376,95 @@ nncf_optimization:
warning: null
type: PARAMETER_GROUP
visible_in_ui: True
tiling_parameters:
header: Tiling
description: Crop dataset to tiles

enable_tiling:
header: Enable tiling
description: Set to True to detect tiny objects more reliably.
default_value: false
editable: true
affects_outcome_of: TRAINING
type: BOOLEAN
ui_rules:
action: DISABLE_EDITING
operator: AND
rules: []
type: UI_RULES
value: true
visible_in_ui: true
warning: Tiling trades off speed for accuracy as it increases the number of images to be processed.

enable_adaptive_params:
header: Enable adaptive tiling parameters
description: Configure tile size and tile overlap adaptively, based on statistics of the annotated dataset
default_value: true
editable: true
affects_outcome_of: TRAINING
type: BOOLEAN
ui_rules:
action: DISABLE_EDITING
operator: AND
rules: []
type: UI_RULES
value: true
visible_in_ui: true
warning: null

tile_size:
header: Tile Image Size
description: Tile Image Size
affects_outcome_of: TRAINING
default_value: 400
min_value: 100
max_value: 1024
type: INTEGER
editable: true
ui_rules:
action: DISABLE_EDITING
operator: AND
rules: []
type: UI_RULES
value: 400
visible_in_ui: true
warning: null

tile_overlap:
header: Tile Overlap
description: Overlap ratio between neighboring tiles.
affects_outcome_of: TRAINING
default_value: 0.2
min_value: 0.0
max_value: 1.0
type: FLOAT
editable: true
ui_rules:
action: DISABLE_EDITING
operator: AND
rules: []
type: UI_RULES
value: 0.2
visible_in_ui: true
warning: null

tile_max_number:
header: Max objects per image
description: Max objects per image
affects_outcome_of: TRAINING
default_value: 1500
min_value: 1
max_value: 10000
type: INTEGER
editable: true
ui_rules:
action: DISABLE_EDITING
operator: AND
rules: []
type: UI_RULES
value: 1500
visible_in_ui: true
warning: null

type: PARAMETER_GROUP
visible_in_ui: true
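A minimal sketch of how this parameter group could drive inference (assuming the task exposes the group as attributes mirroring the YAML keys; the accessor below is hypothetical):

    tiling = config.tiling_parameters  # hypothetical accessor
    if tiling.enable_tiling:
        detections, features = inferencer.predict_tile(
            image,
            tile_size=tiling.tile_size,         # default 400
            overlap=tiling.tile_overlap,        # default 0.2
            max_number=tiling.tile_max_number,  # default 1500
        )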
@@ -1,7 +1,7 @@
dataset_type = "CocoDataset"
img_size = (1024, 1024)

img_norm_cfg = dict(mean=(103.53, 116.28, 123.675), std=(1.0, 1.0, 1.0), to_rgb=False)
img_norm_cfg = dict(mean=(103.53, 116.28, 123.675), std=(1.0, 1.0, 1.0), to_rgb=True)

train_pipeline = [
dict(type="LoadImageFromFile"),
@@ -0,0 +1,85 @@
dataset_type = "CocoDataset"

img_size = (1024, 1024)

tile_cfg = dict(
tile_size=400, min_area_ratio=0.9, overlap_ratio=0.2, iou_threshold=0.45, max_per_img=1500, filter_empty_gt=True
)
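# Editor's note (a sketch, assuming the common sliding-window stride formula
# stride = tile_size * (1 - overlap_ratio)): here stride = int(400 * 0.8) = 320,
# so a 1024x1024 image needs ceil((1024 - 400) / 320) + 1 = 3 tiles per axis,
# i.e. 9 tiles in total.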

img_norm_cfg = dict(mean=(103.53, 116.28, 123.675), std=(1.0, 1.0, 1.0), to_rgb=True)

train_pipeline = [
dict(type="Resize", img_scale=img_size, keep_ratio=False),
dict(type="RandomFlip", flip_ratio=0.5),
dict(type="Normalize", **img_norm_cfg),
dict(type="Pad", size_divisor=32),
dict(type="DefaultFormatBundle"),
dict(type="Collect", keys=["img", "gt_bboxes", "gt_labels", "gt_masks"]),
]

test_pipeline = [
dict(
type="MultiScaleFlipAug",
img_scale=img_size,
flip=False,
transforms=[
dict(type="Resize", keep_ratio=False),
dict(type="RandomFlip"),
dict(type="Normalize", **img_norm_cfg),
dict(type="Pad", size_divisor=32),
dict(type="ImageToTensor", keys=["img"]),
dict(type="Collect", keys=["img"]),
],
)
]

__dataset_type = "CocoDataset"
__data_root = "data/coco/"

__samples_per_gpu = 4

train_dataset = dict(
type="ImageTilingDataset",
dataset=dict(
type=__dataset_type,
ann_file=__data_root + "annotations/instances_train.json",
img_prefix=__data_root + "images/train",
pipeline=[
dict(type="LoadImageFromFile"),
dict(type="LoadAnnotations", with_bbox=True, with_mask=True),
],
),
pipeline=train_pipeline,
**tile_cfg
)

val_dataset = dict(
type="ImageTilingDataset",
dataset=dict(
type=__dataset_type,
ann_file=__data_root + "annotations/instances_val.json",
img_prefix=__data_root + "images/val",
test_mode=True,
pipeline=[dict(type="LoadImageFromFile")],
),
pipeline=test_pipeline,
**tile_cfg
)

test_dataset = dict(
type="ImageTilingDataset",
dataset=dict(
type=__dataset_type,
ann_file=__data_root + "annotations/instances_test.json",
img_prefix=__data_root + "images/test",
test_mode=True,
pipeline=[dict(type="LoadImageFromFile")],
),
pipeline=test_pipeline,
**tile_cfg
)


data = dict(
samples_per_gpu=__samples_per_gpu, workers_per_gpu=2, train=train_dataset, val=val_dataset, test=test_dataset
)
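Assuming ImageTilingDataset is registered in mmdet's DATASETS registry (which this config relies on), the dataset configs above can be built the standard way; a minimal sketch:

    from mmdet.datasets import build_dataset

    # Each sample yielded by the tiling dataset is a tile cropped from a
    # source image according to tile_cfg above.
    tiled_train = build_dataset(train_dataset)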