Open Images: add writing support #315

Merged · 17 commits · Jul 5, 2021
5 changes: 3 additions & 2 deletions CHANGELOG.md
@@ -16,8 +16,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
- `keep-empty` export parameter in VOC format (<https://github.com/openvinotoolkit/datumaro/pull/297>)
- A base class for dataset validation plugins (<https://github.com/openvinotoolkit/datumaro/pull/299>)
- Partial support for the Open Images format;
-  only reading is supported, and only images and image-level labels can be read
-  (<https://github.com/openvinotoolkit/datumaro/pull/291>).
+  only images and image-level labels can be read/written
+  (<https://github.com/openvinotoolkit/datumaro/pull/291>,
+  <https://github.com/openvinotoolkit/datumaro/pull/315>).

### Changed
- Tensorflow AVX check is made optional in API and is disabled by default (<https://github.com/openvinotoolkit/datumaro/pull/305>)
190 changes: 175 additions & 15 deletions datumaro/plugins/open_images_format.py
@@ -6,19 +6,23 @@
import csv
import fnmatch
import glob
import itertools
import json
import logging as log
import os
import os.path as osp
import re

from attr import attrs

from datumaro.components.converter import Converter
from datumaro.components.errors import DatasetError, RepeatedItemError, UndefinedLabel
from datumaro.components.extractor import (
AnnotationType, DatasetItem, Importer, Label, LabelCategories, Extractor,
)
from datumaro.components.validator import Severity
from datumaro.util.image import find_images
from datumaro.util.os_util import split_path

# A regex to check whether a subset name can be used as a "normal" path
# component.
@@ -40,11 +44,38 @@ def __str__(self):

class OpenImagesPath:
ANNOTATIONS_DIR = 'annotations'
-    FULL_IMAGE_DESCRIPTION_NAME = 'image_ids_and_rotation.csv'
-    SUBSET_IMAGE_DESCRIPTION_PATTERNS = (
+    IMAGES_DIR = 'images'
+
+    FULL_IMAGE_DESCRIPTION_FILE_NAME = 'image_ids_and_rotation.csv'
+    SUBSET_IMAGE_DESCRIPTION_FILE_PATTERNS = (
'*-images-with-rotation.csv',
'*-images-with-labels-with-rotation.csv',
)
V5_CLASS_DESCRIPTION_FILE_NAME = 'class-descriptions.csv'
HIERARCHY_FILE_NAME = 'bbox_labels_600_hierarchy.json'

IMAGE_DESCRIPTION_FIELDS = (
'ImageID',
'Subset',
'OriginalURL',
'OriginalLandingURL',
'License',
'AuthorProfileURL',
'Author',
'Title',
'OriginalSize',
'OriginalMD5',
'Thumbnail300KURL',
'Rotation',
)

LABEL_DESCRIPTION_FIELDS = (
'ImageID',
'Source',
'LabelName',
'Confidence',
)


class OpenImagesExtractor(Extractor):
def __init__(self, path):
@@ -92,16 +123,14 @@ def _load_categories(self):
# If the file doesn't exist with either name, we'll fail trying to open
# `class-descriptions.csv`.

-        V5_CLASS_DESCRIPTIONS = 'class-descriptions.csv'
-
        annotation_name = [
            *self._glob_annotations('oidv*-class-descriptions.csv'),
-            V5_CLASS_DESCRIPTIONS,
+            OpenImagesPath.V5_CLASS_DESCRIPTION_FILE_NAME,
        ][0]

        with self._open_csv_annotation(annotation_name) as class_description_reader:
            # Prior to OID v6, this file didn't contain a header row.
-            if annotation_name == V5_CLASS_DESCRIPTIONS:
+            if annotation_name == OpenImagesPath.V5_CLASS_DESCRIPTION_FILE_NAME:
                class_description_reader.fieldnames = ('LabelName', 'DisplayName')

for class_description in class_description_reader:
@@ -116,7 +145,7 @@ def _load_label_category_parents(self):
label_categories = self._categories[AnnotationType.label]

hierarchy_path = osp.join(
-            self._dataset_dir, OpenImagesPath.ANNOTATIONS_DIR, 'bbox_labels_600_hierarchy.json')
+            self._dataset_dir, OpenImagesPath.ANNOTATIONS_DIR, OpenImagesPath.HIERARCHY_FILE_NAME)

try:
with open(hierarchy_path, 'rb') as hierarchy_file:
@@ -137,11 +166,16 @@ def set_parents_from_node(node, category):
set_parents_from_node(root_node, root_category)

def _load_items(self):
+        images_dir = osp.join(self._dataset_dir, OpenImagesPath.IMAGES_DIR)
+
        image_paths_by_id = {
-            osp.splitext(osp.basename(path))[0]: path
-            for path in find_images(
-                osp.join(self._dataset_dir, 'images'),
-                recursive=True, max_depth=1)
+            # the first component of `path_parts` is the subset name
+            '/'.join(path_parts[1:]): path
+            for path in find_images(images_dir, recursive=True)
+            for path_parts in [split_path(
+                osp.splitext(osp.relpath(path, images_dir))[0],
+            )]
+            if 1 < len(path_parts)
}

items_by_id = {}
Expand Down Expand Up @@ -170,9 +204,9 @@ def load_from(annotation_name):
# However, if it's missing, we'll try loading subset-specific files instead, so that
# this extractor can be used on individual subsets of the dataset.
try:
-            load_from(OpenImagesPath.FULL_IMAGE_DESCRIPTION_NAME)
+            load_from(OpenImagesPath.FULL_IMAGE_DESCRIPTION_FILE_NAME)
        except FileNotFoundError:
-            for pattern in OpenImagesPath.SUBSET_IMAGE_DESCRIPTION_PATTERNS:
+            for pattern in OpenImagesPath.SUBSET_IMAGE_DESCRIPTION_FILE_PATTERNS:
for path in self._glob_annotations(pattern):
load_from(path)

@@ -207,10 +241,136 @@ class OpenImagesImporter(Importer):
@classmethod
def find_sources(cls, path):
for pattern in [
-            OpenImagesPath.FULL_IMAGE_DESCRIPTION_NAME,
-            *OpenImagesPath.SUBSET_IMAGE_DESCRIPTION_PATTERNS,
+            OpenImagesPath.FULL_IMAGE_DESCRIPTION_FILE_NAME,
+            *OpenImagesPath.SUBSET_IMAGE_DESCRIPTION_FILE_PATTERNS,
]:
if glob.glob(osp.join(glob.escape(path), OpenImagesPath.ANNOTATIONS_DIR, pattern)):
return [{'url': path, 'format': 'open_images'}]

return []
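
For reference, a directory recognized by this importer can then be loaded from code. A minimal sketch, assuming a hypothetical `./oid` directory laid out as described above (the `open_images` format name comes from `find_sources`):

```python
from datumaro.components.dataset import Dataset

# Detection succeeds as long as the annotations/ subdirectory contains
# at least one image description file matched by the patterns above.
dataset = Dataset.import_from('./oid', 'open_images')
```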

class OpenImagesConverter(Converter):
DEFAULT_IMAGE_EXT = '.jpg'

@contextlib.contextmanager
def _open_csv_annotation(self, file_name, field_names):
absolute_path = osp.join(self._save_dir, OpenImagesPath.ANNOTATIONS_DIR, file_name)

with open(absolute_path, 'w', encoding='utf-8', newline='') as f:
yield csv.DictWriter(f, field_names)

def apply(self):
annotations_dir = osp.join(self._save_dir, OpenImagesPath.ANNOTATIONS_DIR)

os.makedirs(annotations_dir, exist_ok=True)

self._save_categories()
self._save_label_category_parents()
self._save_subsets()

def _save_categories(self):
with self._open_csv_annotation(
OpenImagesPath.V5_CLASS_DESCRIPTION_FILE_NAME, ['LabelName', 'DisplayName'],
) as class_description_writer:
# no .writeheader() here, since we're saving it in the V5 format

> **Contributor:** As an enhancement, you can add an optional argument for the format, which will determine the version of the exported dataset. What do you think about it?

> **Author:** I could, but... is there any reason to do so? It could just be unnecessary flexibility.
>
> (Incidentally, the main reason I used the V5 format for this file is so that I wouldn't need to include the dataset name in the file name. It would be weird if the user could import OIDv5, then export it and end up with `oidv6-class-descriptions.csv`, even though the data is still from v5.)

> **Contributor:** Finally, can we import and export in one format?

> **Author:** This is one format; just different versions. We can't use the same version as was used during import, since that information is not preserved anywhere (and the input could be a mishmash of different versions anyway).
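
For context, a made-up illustration of the two flavors of this file (the label IDs and display names here are invented, not taken from the real dataset): the V5 file has no header row, while V6 files carry an `oidv6-` name prefix and start with a header.

```
# class-descriptions.csv (V5: no header row)
/m/0,Cat
/m/1,Dog

# oidv6-class-descriptions.csv (V6: header row present)
LabelName,DisplayName
/m/0,Cat
/m/1,Dog
```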


for category in self._extractor.categories()[AnnotationType.label]:
class_description_writer.writerow({
'LabelName': category.name,
'DisplayName': category.name,
})

def _save_label_category_parents(self):
all_label_names = set()
hierarchy_nodes = {}
orphan_nodes = []

def get_node(name):
return hierarchy_nodes.setdefault(name, {'LabelName': name})

for category in self._extractor.categories()[AnnotationType.label]:
all_label_names.add(category.name)

child_node = get_node(category.name)

if category.parent:
parent_node = get_node(category.parent)
parent_node.setdefault('Subcategory', []).append(child_node)
else:
orphan_nodes.append(child_node)

# The hierarchy has to be rooted in a single node. However, there's
# no guarantee that there exists only one orphan (label without a parent).
# Therefore, we create a fake root node and make it the parent of every
# orphan label.
# This is not a violation of the format, because the original OID does
# the same thing.
root_node = {
# Create an OID-like label name that isn't already used by a real label
'LabelName': next(root_name
for i in itertools.count()
for root_name in [f'/m/{i}']
if root_name not in all_label_names
),
# If an orphan has no children, then it makes no semantic difference
# whether it's listed in the hierarchy file or not. So strip such nodes
# to avoid recording meaningless data.
'Subcategory': [node for node in orphan_nodes if 'Subcategory' in node],
}

hierarchy_path = osp.join(
self._save_dir, OpenImagesPath.ANNOTATIONS_DIR, OpenImagesPath.HIERARCHY_FILE_NAME)

with open(hierarchy_path, 'w', encoding='utf-8') as hierarchy_file:
json.dump(root_node, hierarchy_file, indent=4, ensure_ascii=False)
hierarchy_file.write('\n')

def _save_subsets(self):
label_categories = self._extractor.categories().get(
AnnotationType.label, LabelCategories())

for subset_name, subset in self._extractor.subsets().items():
if _RE_INVALID_SUBSET.fullmatch(subset_name):
                raise UnsupportedSubsetNameError(
                    item_id=next(iter(subset)).id, subset=subset_name)

image_description_name = f'{subset_name}-images-with-rotation.csv'
label_description_name = f'{subset_name}-annotations-human-imagelabels.csv'

with \
self._open_csv_annotation(
image_description_name, OpenImagesPath.IMAGE_DESCRIPTION_FIELDS,
) as image_description_writer, \
contextlib.ExitStack() as annotation_writers \
:
image_description_writer.writeheader()

# The label description writer is created lazily,
# so that we don't create the label description file if there are no labels.
label_description_writer = None

for item in subset:
image_description_writer.writerow({
'ImageID': item.id, 'Subset': subset_name,
})

if self._save_images:
if item.has_image:
self._save_image(item, subdir=osp.join(
OpenImagesPath.IMAGES_DIR, subset_name))
else:
log.debug("Item '%s' has no image", item.id)

for annotation in item.annotations:
if annotation.type is AnnotationType.label:
if label_description_writer is None:
label_description_writer = annotation_writers.enter_context(
self._open_csv_annotation(
label_description_name,
OpenImagesPath.LABEL_DESCRIPTION_FIELDS))
label_description_writer.writeheader()

label_description_writer.writerow({
'ImageID': item.id,
'LabelName': label_categories[annotation.label].name,
'Confidence': str(annotation.attributes.get('score', 1)),
})
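
To make the hierarchy construction above concrete, here is an illustrative example (hypothetical labels, not from a real run): given labels `/m/0`, `/m/1` (a child of `/m/0`), and `/m/2` (an orphan with no children), `_save_label_category_parents` picks `/m/3` as the first unused OID-like root name, attaches `/m/0` under it, and strips the childless orphan `/m/2`, producing roughly:

```json
{
    "LabelName": "/m/3",
    "Subcategory": [
        {
            "LabelName": "/m/0",
            "Subcategory": [
                {
                    "LabelName": "/m/1"
                }
            ]
        }
    ]
}
```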
58 changes: 52 additions & 6 deletions docs/formats/open_images_user_manual.md
@@ -29,8 +29,8 @@ which can be downloaded from the following URLs:

- [complete set](https://storage.googleapis.com/openimages/2018_04/image_ids_and_rotation.csv)
- [train set](https://storage.googleapis.com/openimages/v6/oidv6-train-images-with-labels-with-rotation.csv)
-- [validation set](https://storage.googleapis.com/openimages/2018_04/test/test-images-with-rotation.csv)
-- [test set](https://storage.googleapis.com/openimages/2018_04/validation/validation-images-with-rotation.csv)
+- [validation set](https://storage.googleapis.com/openimages/2018_04/validation/validation-images-with-rotation.csv)
+- [test set](https://storage.googleapis.com/openimages/2018_04/test/test-images-with-rotation.csv)

Datumaro expects at least one of the files above to be present.

@@ -111,7 +111,27 @@ To get information about them, run

## Export to Open Images

-Converting datasets to the Open Images format is currently not supported.
+There are a few ways to convert an existing dataset to the Open Images format:

``` bash
# export dataset into Open Images format from existing project
datum export -p <path/to/project> -f open_images -o <path/to/export/dir> \
    -- --save-images

# convert a dataset in another format to the Open Images format
datum convert -if imagenet -i <path/to/imagenet/dataset> \
-f open_images -o <path/to/export/dir> \
-- --save-images
```

Extra options for export to the Open Images format:

- `--save-images` - save image files when exporting the dataset
(by default, `False`)

- `--image-ext IMAGE_EXT` - save image files with the specified extension
when exporting the dataset (by default, uses the original extension
or `.jpg` if there isn't one)

## Particular use cases

@@ -120,10 +140,10 @@ and for the Open Images format in particular. Follow
[user manual](../user_manual.md)
to get more information about these operations.

-Here is an example of using Datumaro operations to solve
-a particular problem with the Open Images dataset:
+Here are a few examples of using Datumaro operations to solve
+particular problems with the Open Images dataset:

-### Example. How to load the Open Images dataset and convert to the format used by CVAT
+### Example 1. How to load the Open Images dataset and convert to the format used by CVAT

```bash
datum create -o project
@@ -132,5 +152,31 @@ datum stats -p project
datum export -p project -o dataset -f cvat --overwrite -- --save-images
```

### Example 2. How to create a custom OID-like dataset

```python
import numpy as np
from datumaro.components.dataset import Dataset
from datumaro.components.extractor import DatasetItem, Label

dataset = Dataset.from_iterable(
[
DatasetItem(
id='0000000000000001',
image=np.ones((1, 5, 3)),
subset='validation',
annotations=[
Label(0, attributes={'score': 1}),
Label(1, attributes={'score': 0}),
],
),
],
categories=['/m/0', '/m/1'],
)
dataset.export('./dataset', format='open_images')
```
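
With the default export options (so image files are not written), the example above should produce roughly the following layout, using the file names from the converter:

```
dataset/
└── annotations/
    ├── bbox_labels_600_hierarchy.json
    ├── class-descriptions.csv
    ├── validation-images-with-rotation.csv
    └── validation-annotations-human-imagelabels.csv
```

and `validation-annotations-human-imagelabels.csv` should contain something like the following (the `Source` column is left empty, since the converter only fills `ImageID`, `LabelName`, and `Confidence`):

```
ImageID,Source,LabelName,Confidence
0000000000000001,,/m/0,1
0000000000000001,,/m/1,0
```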

More examples of working with OID from code can be found in
[tests](../../tests/test_open_images_format.py).
1 change: 1 addition & 0 deletions tests/requirements.py
@@ -21,6 +21,7 @@ class Requirements:
DATUM_231 = "Readable formats for CJK"
DATUM_244 = "Add Snyk integration"
DATUM_267 = "Add Image zip format"
DATUM_274 = "Support the Open Images dataset"
DATUM_280 = "Support KITTI dataset formats"
DATUM_283 = "Create cli tests for testing convert command for VOC format"
