From 032b0c151083f08e87bade0692bee946fc0a2663 Mon Sep 17 00:00:00 2001
From: Maxim Zhiltsov <maxim.zhiltsov@intel.com>
Date: Wed, 31 Mar 2021 13:04:07 +0300
Subject: [PATCH] Format fixes in COCO and VOC (#195)

* Allow splitting and merging of image directories in COCO export

* Avoid producing conflicting attributes in VOC segmentation

* Update changelog
---
 CHANGELOG.md                              |  1 +
 datumaro/plugins/coco_format/converter.py | 26 +++++++++++------
 datumaro/plugins/voc_format/extractor.py  | 22 ++++++--------
 tests/test_coco_format.py                 | 35 ++++++++++++++++++++---
 4 files changed, 58 insertions(+), 26 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7f2b5feadd6c..f732be7d7489 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Changed
 - Added an option to allow undeclared annotation attributes in CVAT format export (<https://github.com/openvinotoolkit/datumaro/pull/192>)
+- COCO export now saves images in separate directories by subset. Added the `--merge-images` option to control this (<https://github.com/openvinotoolkit/datumaro/pull/195>)
 
 ### Deprecated
 -
diff --git a/datumaro/plugins/coco_format/converter.py b/datumaro/plugins/coco_format/converter.py
index b1ae77f90b22..0caf89de3b35 100644
--- a/datumaro/plugins/coco_format/converter.py
+++ b/datumaro/plugins/coco_format/converter.py
@@ -478,9 +478,12 @@ def build_cmdline_parser(cls, **kwargs):
         parser.add_argument('--allow-attributes',
             type=str_to_bool, default=True,
             help="Allow export of attributes (default: %(default)s)")
-        parser.add_argument('--reindex', action='store_true',
-            help="Assign new indices to images and annotations "
-                "(default: %(default)s)")
+        parser.add_argument('--reindex', type=str_to_bool, default=False,
+            help="Assign new indices to images and annotations, "
+                "useful to avoid merge conflicts (default: %(default)s)")
+        parser.add_argument('--merge-images', type=str_to_bool, default=False,
+            help="Save all images into a single "
+                "directory (default: %(default)s)")
         parser.add_argument('--tasks', type=cls._split_tasks_string,
             help="COCO task filter, comma-separated list of {%s} "
                 "(default: all)" % ', '.join(t.name for t in CocoTask))
@@ -498,7 +501,8 @@ def build_cmdline_parser(cls, **kwargs):
 
     def __init__(self, extractor, save_dir,
             tasks=None, segmentation_mode=None, crop_covered=False,
-            allow_attributes=True, reindex=False, **kwargs):
+            allow_attributes=True, reindex=False, merge_images=False,
+            **kwargs):
         super().__init__(extractor, save_dir, **kwargs)
 
         assert tasks is None or isinstance(tasks, (CocoTask, list, str))
@@ -526,6 +530,7 @@ def __init__(self, extractor, save_dir,
         self._crop_covered = crop_covered
         self._allow_attributes = allow_attributes
         self._reindex = reindex
+        self._merge_images = merge_images
 
         self._image_ids = {}
 
@@ -556,10 +561,6 @@ def _get_image_id(self, item):
             self._image_ids[item.id] = image_id
         return image_id
 
-    def _save_image(self, item, path=None):
-        super()._save_image(item,
-            osp.join(self._images_dir, self._make_image_filename(item)))
-
     def apply(self):
         self._make_dirs()
 
@@ -571,7 +572,8 @@ def apply(self):
             for item in subset:
                 if self._save_images:
                     if item.has_image:
-                        self._save_image(item)
+                        self._save_image(item, subdir=osp.join(self._images_dir,
+                            '' if self._merge_images else subset_name))
                     else:
                         log.debug("Item '%s' has no image info", item.id)
                 for task_conv in task_converters.values():
@@ -605,6 +607,12 @@ def patch(cls, dataset, patch, save_dir, **kwargs):
             if osp.isfile(image_path):
                 os.unlink(image_path)
 
+            image_path = osp.join(images_dir, subset,
+                conv._make_image_filename(item))
+            if osp.isfile(image_path):
+                os.unlink(image_path)
+
+
 class CocoInstancesConverter(CocoConverter):
     def __init__(self, *args, **kwargs):
         kwargs['tasks'] = CocoTask.instances
diff --git a/datumaro/plugins/voc_format/extractor.py b/datumaro/plugins/voc_format/extractor.py
index 993b825350f9..9df7cc066d16 100644
--- a/datumaro/plugins/voc_format/extractor.py
+++ b/datumaro/plugins/voc_format/extractor.py
@@ -302,22 +302,18 @@ def _load_annotations(self, item_id):
                     for i in range(compiled_mask.instance_count)}
 
             for instance_id, label_id in instance_labels.items():
+                if len(label_cat) <= label_id:
+                    raise Exception(
+                        "Item %s: a mask has unexpected class number %s" %
+                        (item_id, label_id))
+
                 image = compiled_mask.lazy_extract(instance_id)
 
-                attributes = {}
-                if label_id is not None:
-                    actions = {a: False
-                        for a in label_cat.items[label_id].attributes
-                    }
-                    attributes.update(actions)
-
-                item_annotations.append(Mask(
-                    image=image, label=label_id,
-                    attributes=attributes, group=instance_id
-                ))
+                item_annotations.append(Mask(image=image, label=label_id,
+                    group=instance_id))
         elif class_mask is not None:
-            log.warn("item '%s': has only class segmentation, "
-                "instance masks will not be available" % item_id)
+            log.warning("Item %s: only class segmentations available" % item_id)
+
             class_mask = class_mask()
             classes = np.unique(class_mask)
             for label_id in classes:
diff --git a/tests/test_coco_format.py b/tests/test_coco_format.py
index b884009b2266..ae24b4d88e3e 100644
--- a/tests/test_coco_format.py
+++ b/tests/test_coco_format.py
@@ -630,6 +630,33 @@ def test_reindex(self):
                 partial(CocoConverter.convert, reindex=True),
                 test_dir, target_dataset=target_dataset)
 
+    def test_can_save_images_in_single_dir(self):
+        dataset = Dataset.from_iterable([
+            DatasetItem(id=1, subset='train', image=np.ones((2, 4, 3)),
+                attributes={'id': 1}),
+        ])
+
+        with TestDir() as test_dir:
+            self._test_save_and_load(dataset,
+                partial(CocoImageInfoConverter.convert, save_images=True,
+                    merge_images=True),
+                test_dir, require_images=True)
+            self.assertTrue(osp.isfile(osp.join(test_dir, 'images', '1.jpg')))
+
+    def test_can_save_images_in_separate_dirs(self):
+        dataset = Dataset.from_iterable([
+            DatasetItem(id=1, subset='train', image=np.ones((2, 4, 3)),
+                attributes={'id': 1}),
+        ])
+
+        with TestDir() as test_dir:
+            self._test_save_and_load(dataset,
+                partial(CocoImageInfoConverter.convert, save_images=True,
+                    merge_images=False),
+                test_dir, require_images=True)
+            self.assertTrue(osp.isfile(osp.join(
+                test_dir, 'images', 'train', '1.jpg')))
+
     def test_inplace_save_writes_only_updated_data(self):
         with TestDir() as path:
             # generate initial dataset
@@ -642,8 +669,8 @@ def test_inplace_save_writes_only_updated_data(self):
             os.unlink(osp.join(path, 'annotations', 'image_info_a.json'))
             os.unlink(osp.join(path, 'annotations', 'image_info_b.json'))
             os.unlink(osp.join(path, 'annotations', 'image_info_c.json'))
-            self.assertFalse(osp.isfile(osp.join(path, 'images', '2.jpg')))
-            self.assertTrue(osp.isfile(osp.join(path, 'images', '3.jpg')))
+            self.assertFalse(osp.isfile(osp.join(path, 'images', 'b', '2.jpg')))
+            self.assertTrue(osp.isfile(osp.join(path, 'images', 'c', '3.jpg')))
 
             dataset.put(DatasetItem(2, subset='a', image=np.ones((3, 2, 3))))
             dataset.remove(3, 'c')
@@ -655,5 +682,5 @@ def test_inplace_save_writes_only_updated_data(self):
                 path, 'annotations', 'image_info_b.json')))
             self.assertFalse(osp.isfile(osp.join(
                 path, 'annotations', 'image_info_c.json')))
-            self.assertTrue(osp.isfile(osp.join(path, 'images', '2.jpg')))
-            self.assertFalse(osp.isfile(osp.join(path, 'images', '3.jpg')))
\ No newline at end of file
+            self.assertTrue(osp.isfile(osp.join(path, 'images', 'a', '2.jpg')))
+            self.assertFalse(osp.isfile(osp.join(path, 'images', 'c', '3.jpg')))
\ No newline at end of file