From d85d38526222b42f7fd277cc65f9b21784aef062 Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Fri, 2 Jul 2021 17:55:01 +0300 Subject: [PATCH 01/17] open_images_user_manual.md: fix image description file URLs I accidentally swapped the URLs for test and validation sets. --- docs/formats/open_images_user_manual.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/formats/open_images_user_manual.md b/docs/formats/open_images_user_manual.md index e86d3598a5..24306ac646 100644 --- a/docs/formats/open_images_user_manual.md +++ b/docs/formats/open_images_user_manual.md @@ -29,8 +29,8 @@ which can be downloaded from the following URLs: - [complete set](https://storage.googleapis.com/openimages/2018_04/image_ids_and_rotation.csv) - [train set](https://storage.googleapis.com/openimages/v6/oidv6-train-images-with-labels-with-rotation.csv) -- [validation set](https://storage.googleapis.com/openimages/2018_04/test/test-images-with-rotation.csv) -- [test set](https://storage.googleapis.com/openimages/2018_04/validation/validation-images-with-rotation.csv) +- [validation set](https://storage.googleapis.com/openimages/2018_04/validation/validation-images-with-rotation.csv) +- [test set](https://storage.googleapis.com/openimages/2018_04/test/test-images-with-rotation.csv) Datumaro expects at least one of the files above to be present. 
From db35a038b99df768f09dff1cb7d2ab0d632605f7 Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Fri, 25 Jun 2021 13:43:35 +0300 Subject: [PATCH 02/17] open_images_format: add conversion support --- datumaro/plugins/open_images_format.py | 145 ++++++++++++++++++++++++- 1 file changed, 140 insertions(+), 5 deletions(-) diff --git a/datumaro/plugins/open_images_format.py b/datumaro/plugins/open_images_format.py index 2f823eef89..ebbe14a14b 100644 --- a/datumaro/plugins/open_images_format.py +++ b/datumaro/plugins/open_images_format.py @@ -6,6 +6,7 @@ import csv import fnmatch import glob +import itertools import json import os import os.path as osp @@ -13,6 +14,7 @@ from attr import attrs +from datumaro.components.converter import Converter from datumaro.components.errors import DatasetError, RepeatedItemError, UndefinedLabel from datumaro.components.extractor import ( AnnotationType, DatasetItem, Importer, Label, LabelCategories, Extractor, @@ -45,6 +47,8 @@ class OpenImagesPath: '*-images-with-rotation.csv', '*-images-with-labels-with-rotation.csv', ) + V5_CLASS_DESCRIPTION_NAME = 'class-descriptions.csv' + HIERARCHY_NAME = 'bbox_labels_600_hierarchy.json' class OpenImagesExtractor(Extractor): def __init__(self, path): @@ -92,16 +96,14 @@ def _load_categories(self): # If the file doesn't exist with either name, we'll fail trying to open # `class-descriptions.csv`. - V5_CLASS_DESCRIPTIONS = 'class-descriptions.csv' - annotation_name = [ *self._glob_annotations('oidv*-class-descriptions.csv'), - V5_CLASS_DESCRIPTIONS, + OpenImagesPath.V5_CLASS_DESCRIPTION_NAME, ][0] with self._open_csv_annotation(annotation_name) as class_description_reader: # Prior to OID v6, this file didn't contain a header row. 
- if annotation_name == V5_CLASS_DESCRIPTIONS: + if annotation_name == OpenImagesPath.V5_CLASS_DESCRIPTION_NAME: class_description_reader.fieldnames = ('LabelName', 'DisplayName') for class_description in class_description_reader: @@ -116,7 +118,7 @@ def _load_label_category_parents(self): label_categories = self._categories[AnnotationType.label] hierarchy_path = osp.join( - self._dataset_dir, OpenImagesPath.ANNOTATIONS_DIR, 'bbox_labels_600_hierarchy.json') + self._dataset_dir, OpenImagesPath.ANNOTATIONS_DIR, OpenImagesPath.HIERARCHY_NAME) try: with open(hierarchy_path, 'rb') as hierarchy_file: @@ -214,3 +216,136 @@ def find_sources(cls, path): return [{'url': path, 'format': 'open_images'}] return [] + +class OpenImagesConverter(Converter): + DEFAULT_IMAGE_EXT = '.jpg' + + @contextlib.contextmanager + def _open_csv_annotation(self, file_name, field_names): + absolute_path = osp.join(self._save_dir, OpenImagesPath.ANNOTATIONS_DIR, file_name) + + with open(absolute_path, 'w', encoding='utf-8', newline='') as f: + yield csv.DictWriter(f, field_names) + + def apply(self): + annotations_dir = osp.join(self._save_dir, OpenImagesPath.ANNOTATIONS_DIR) + + os.makedirs(annotations_dir, exist_ok=True) + + self._save_categories() + self._save_label_category_parents() + self._save_subsets() + + def _save_categories(self): + with self._open_csv_annotation( + OpenImagesPath.V5_CLASS_DESCRIPTION_NAME, ['LabelName', 'DisplayName'], + ) as class_description_writer: + # no .writeheader() here, since we're saving it in the V5 format + + for category in self._extractor.categories()[AnnotationType.label]: + class_description_writer.writerow({ + 'LabelName': category.name, + 'DisplayName': category.name, + }) + + def _save_label_category_parents(self): + all_label_names = set() + hierarchy_nodes = {} + orphan_nodes = [] + + def get_node(name): + return hierarchy_nodes.setdefault(name, {'LabelName': name}) + + for category in self._extractor.categories()[AnnotationType.label]: + 
all_label_names.add(category.name) + + child_node = get_node(category.name) + + if category.parent: + parent_node = get_node(category.parent) + parent_node.setdefault('Subcategory', []).append(child_node) + else: + orphan_nodes.append(child_node) + + # The hierarchy has to be rooted in a single node. However, there's + # no guarantee that there exists only one orphan (label without a parent). + # Therefore, we create a fake root node and make it the parent of every + # orphan label. + # This is not a violation of the format, because the original OID does + # the same thing. + root_node = { + # Create an OID-like label name that isn't already used by a real label + 'LabelName': next(root_name + for i in itertools.count() + for root_name in [f'/m/{i}'] + if root_name not in all_label_names + ), + # If an orphan has no children, then it makes no semantic difference + # whether it's listed in the hierarchy file or not. So strip such nodes + # to avoid recording meaningless data. + 'Subcategory': [node for node in orphan_nodes if 'Subcategory' in node], + } + + hierarchy_path = osp.join( + self._save_dir, OpenImagesPath.ANNOTATIONS_DIR, OpenImagesPath.HIERARCHY_NAME) + + with open(hierarchy_path, 'w', encoding='UTF-8') as hierarchy_file: + json.dump(root_node, hierarchy_file, indent=4) + hierarchy_file.write('\n') + + def _save_subsets(self): + # TODO: what if there are no categories? 
+ label_categories = self._extractor.categories()[AnnotationType.label] + + for subset_name, subset in self._extractor.subsets().items(): + if _RE_INVALID_SUBSET.fullmatch(subset_name): + raise UnsupportedSubsetNameError(item_id=next(iter(subset)).id, subset=subset) + + image_description_name = f'{subset_name}-images-with-rotation.csv' + image_description_fields = [ + 'ImageID', + 'Subset', + 'OriginalURL', + 'OriginalLandingURL', + 'License', + 'AuthorProfileURL', + 'Author', + 'Title', + 'OriginalSize', + 'OriginalMD5', + 'Thumbnail300KURL', + 'Rotation', + ] + + label_description_name = f'{subset_name}-annotations-human-imagelabels.csv' + label_description_fields = [ + 'ImageID', + 'Source', + 'LabelName', + 'Confidence', + ] + + with \ + self._open_csv_annotation( + image_description_name, image_description_fields) as image_description_writer, \ + self._open_csv_annotation( + label_description_name, label_description_fields) as label_description_writer \ + : + image_description_writer.writeheader() + label_description_writer.writeheader() + + for item in subset: + image_description_writer.writerow({ + 'ImageID': item.id, 'Subset': subset_name, + }) + + if self._save_images and item.has_image: + self._save_image(item, subdir=osp.join('images', subset_name)) + + for annotation in item.annotations: + if isinstance(annotation, Label): + label_description_writer.writerow({ + 'ImageID': item.id, + 'LabelName': label_categories[annotation.label].name, + 'Confidence': str(annotation.attributes.get('score', 1)), + }) From 3a711fa8cda9307b6b8d1742e797ccd1d5b892f6 Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Mon, 28 Jun 2021 17:59:00 +0300 Subject: [PATCH 03/17] open_images_format: add support for images in subdirectories --- datumaro/plugins/open_images_format.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/datumaro/plugins/open_images_format.py b/datumaro/plugins/open_images_format.py index ebbe14a14b..97cac876c1 100644 --- 
a/datumaro/plugins/open_images_format.py +++ b/datumaro/plugins/open_images_format.py @@ -21,6 +21,7 @@ ) from datumaro.components.validator import Severity from datumaro.util.image import find_images +from datumaro.util.os_util import split_path # A regex to check whether a subset name can be used as a "normal" path # component. @@ -139,11 +140,16 @@ def set_parents_from_node(node, category): set_parents_from_node(root_node, root_category) def _load_items(self): + images_dir = osp.join(self._dataset_dir, 'images') + image_paths_by_id = { - osp.splitext(osp.basename(path))[0]: path - for path in find_images( - osp.join(self._dataset_dir, 'images'), - recursive=True, max_depth=1) + # the first component of `path_parts` is the subset name + '/'.join(path_parts[1:]): path + for path in find_images(images_dir, recursive=True) + for path_parts in [split_path( + osp.splitext(osp.relpath(path, images_dir))[0], + )] + if len(path_parts) > 1 } items_by_id = {} From 44bcc8a2d28435bca6982f2ebda2d6665a07fcea Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Mon, 28 Jun 2021 17:59:31 +0300 Subject: [PATCH 04/17] open_images_format: add tests for writing support --- tests/test_open_images_format.py | 74 +++++++++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/tests/test_open_images_format.py b/tests/test_open_images_format.py index a7492e0cb7..8fc4df874e 100644 --- a/tests/test_open_images_format.py +++ b/tests/test_open_images_format.py @@ -8,12 +8,82 @@ import numpy as np -from datumaro.plugins.open_images_format import OpenImagesImporter -from datumaro.util.test_utils import compare_datasets_strict +from datumaro.plugins.open_images_format import OpenImagesConverter, OpenImagesImporter +from datumaro.util.image import Image +from datumaro.util.test_utils import TestDir, compare_datasets_strict from datumaro.components.extractor import AnnotationType, DatasetItem, Label, LabelCategories from datumaro.components.dataset import Dataset 
from tests.requirements import Requirements, mark_requirement +class OpenImagesFormatTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_and_load(self): + source_dataset = Dataset.from_iterable( + [ + DatasetItem(id='a', subset='train', + annotations=[Label(0, attributes={'score': 0.7})] + ), + DatasetItem(id='b', subset='train', image=np.zeros((8, 8, 3)), + annotations=[Label(1), Label(2, attributes={'score': 0})] + ), + ], + categories={ + AnnotationType.label: LabelCategories.from_iterable([ + '/m/0', + ('/m/1', '/m/0'), + '/m/2', + ]), + }, + ) + + with TestDir() as test_dir: + OpenImagesConverter.convert(source_dataset, test_dir, + save_images=True) + + parsed_dataset = Dataset.import_from(test_dir, 'open_images') + + # the converter assumes that labels without a score have a score of 100% + source_dataset.get('b', subset='train').annotations[0].attributes['score'] = 1 + + compare_datasets_strict(self, source_dataset, parsed_dataset) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_and_load_with_no_subsets(self): + source_dataset = Dataset.from_iterable( + [ + DatasetItem(id='a', + annotations=[Label(0, attributes={'score': 0.7})] + ), + ], + categories={ + AnnotationType.label: LabelCategories.from_iterable(['/m/0']), + }, + ) + + with TestDir() as test_dir: + OpenImagesConverter.convert(source_dataset, test_dir, + save_images=True) + + parsed_dataset = Dataset.import_from(test_dir, 'open_images') + + compare_datasets_strict(self, source_dataset, parsed_dataset) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_and_load_image_with_arbitrary_extension(self): + dataset = Dataset.from_iterable([ + DatasetItem(id='a/1', image=Image(path='a/1.JPEG', + data=np.zeros((4, 3, 3)))), + DatasetItem(id='b/c/d/2', image=Image(path='b/c/d/2.bmp', + data=np.zeros((3, 4, 3)))), + ], categories=[]) + + with TestDir() as test_dir: + OpenImagesConverter.convert(dataset, test_dir, 
save_images=True) + + parsed_dataset = Dataset.import_from(test_dir, 'open_images') + + compare_datasets_strict(self, dataset, parsed_dataset) + ASSETS_DIR = osp.join(osp.dirname(__file__), 'assets') DUMMY_DATASET_DIR_V6 = osp.join(ASSETS_DIR, 'open_images_dataset_v6') From f544ce644c760d1f6d172f5dc9a09166441c7750 Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Mon, 28 Jun 2021 19:16:00 +0300 Subject: [PATCH 05/17] open_images_format: add documentation for the writing support --- docs/formats/open_images_user_manual.md | 58 +++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 4 deletions(-) diff --git a/docs/formats/open_images_user_manual.md b/docs/formats/open_images_user_manual.md index 24306ac646..e7b3bb469e 100644 --- a/docs/formats/open_images_user_manual.md +++ b/docs/formats/open_images_user_manual.md @@ -111,7 +111,27 @@ To get information about them, run ## Export to Open Images -Converting datasets to the Open Images format is currently not supported. +There are a few ways to convert an existing dataset to the Open Images format: + +``` bash +# export dataset into Open Images format from existing project +datum export -p <path/to/project> -f open_images -o <path/to/export/dir> \ + -- --save-images + +# convert a dataset in another format to the Open Images format +datum convert -if imagenet -i <path/to/dataset> \ + -f open_images -o <path/to/export/dir> \ + -- --save-images +``` + +Extra options for export to the Open Images format: + +- `--save-images` - save image files when exporting the dataset + (by default, `False`) + +- `--image-ext IMAGE_EXT` - save image files with the specified extension + when exporting the dataset (by default, uses the original extension + or `.jpg` if there isn't one) ## Particular use cases @@ -120,10 +140,10 @@ and for the Open Images format in particular. Follow [user manual](../user_manual.md) to get more information about these operations. 
-Here is an example of using Datumaro operations to solve -a particular problem with the Open Images dataset: +Here are a few examples of using Datumaro operations to solve +particular problems with the Open Images dataset: -### Example. How to load the Open Images dataset and convert to the format used by CVAT +### Example 1. How to load the Open Images dataset and convert to the format used by CVAT ```bash datum create -o project @@ -132,5 +152,35 @@ datum stats -p project datum export -p project -o dataset -f cvat --overwrite -- --save-images ``` +### Example 2. How to create a custom OID-like dataset + +```python +import numpy as np +from datumaro.components.dataset import Dataset +from datumaro.components.extractor import ( + AnnotationType, Label, LabelCategories, DatasetItem, +) + +dataset = Dataset.from_iterable( + [ + DatasetItem( + id='0000000000000001', + image=np.ones((1, 5, 3)), + subset='validation', + annotations=[ + Label(0, attributes={'score': 1}), + Label(1, attributes={'score': 0}), + ], + ), + ], + categories={ + AnnotationType.label: LabelCategories.from_iterable([ + '/m/0', '/m/1', + ]), + }, +) +dataset.export('./dataset', format='open_images') +``` + More examples of working with OID from code can be found in [tests](../../tests/test_open_images_format.py). 
From ef41c2674324161269ecdddb103a3e8bdf0cc629 Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Mon, 28 Jun 2021 19:19:17 +0300 Subject: [PATCH 06/17] open_images_format: factor out the 'images' constant --- datumaro/plugins/open_images_format.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/datumaro/plugins/open_images_format.py b/datumaro/plugins/open_images_format.py index 97cac876c1..de21f14e78 100644 --- a/datumaro/plugins/open_images_format.py +++ b/datumaro/plugins/open_images_format.py @@ -43,6 +43,8 @@ def __str__(self): class OpenImagesPath: ANNOTATIONS_DIR = 'annotations' + IMAGES_DIR = 'images' + FULL_IMAGE_DESCRIPTION_NAME = 'image_ids_and_rotation.csv' SUBSET_IMAGE_DESCRIPTION_PATTERNS = ( '*-images-with-rotation.csv', @@ -140,7 +142,7 @@ def set_parents_from_node(node, category): set_parents_from_node(root_node, root_category) def _load_items(self): - images_dir = osp.join(self._dataset_dir, 'images') + images_dir = osp.join(self._dataset_dir, OpenImagesPath.IMAGES_DIR) image_paths_by_id = { # the first component of `path_parts` is the subset name @@ -346,7 +348,8 @@ def _save_subsets(self): }) if self._save_images and item.has_image: - self._save_image(item, subdir=osp.join('images', subset_name)) + self._save_image(item, subdir=osp.join( + OpenImagesPath.IMAGES_DIR, subset_name)) for annotation in item.annotations: if isinstance(annotation, Label): From c04aaa7e65a90f720a213a17bb6be4ac83870fa1 Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Mon, 28 Jun 2021 19:19:35 +0300 Subject: [PATCH 07/17] Update the changelog entry for the Open Images support --- CHANGELOG.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9fb2769e37..0da593907c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,8 +16,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `keep-empty` export parameter in VOC format () - A base class for dataset 
validation plugins () - Partial support for the Open Images format; - only reading is supported, and only images and image-level labels can be read - (). + only images and image-level labels can be read/written + (, + ). ### Changed - Tensorflow AVX check is made optional in API and is disabled by default () From bf6f82e9a67458e67b049e1d49ccf59741f08460 Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Tue, 29 Jun 2021 18:41:12 +0300 Subject: [PATCH 08/17] open_images_format: rename some members of OpenImagesPath This makes it clearer that they refer to paths. --- datumaro/plugins/open_images_format.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/datumaro/plugins/open_images_format.py b/datumaro/plugins/open_images_format.py index de21f14e78..f1f6003aac 100644 --- a/datumaro/plugins/open_images_format.py +++ b/datumaro/plugins/open_images_format.py @@ -45,13 +45,13 @@ class OpenImagesPath: ANNOTATIONS_DIR = 'annotations' IMAGES_DIR = 'images' - FULL_IMAGE_DESCRIPTION_NAME = 'image_ids_and_rotation.csv' - SUBSET_IMAGE_DESCRIPTION_PATTERNS = ( + FULL_IMAGE_DESCRIPTION_FILE_NAME = 'image_ids_and_rotation.csv' + SUBSET_IMAGE_DESCRIPTION_FILE_PATTERNS = ( '*-images-with-rotation.csv', '*-images-with-labels-with-rotation.csv', ) - V5_CLASS_DESCRIPTION_NAME = 'class-descriptions.csv' - HIERARCHY_NAME = 'bbox_labels_600_hierarchy.json' + V5_CLASS_DESCRIPTION_FILE_NAME = 'class-descriptions.csv' + HIERARCHY_FILE_NAME = 'bbox_labels_600_hierarchy.json' class OpenImagesExtractor(Extractor): def __init__(self, path): @@ -101,12 +101,12 @@ def _load_categories(self): annotation_name = [ *self._glob_annotations('oidv*-class-descriptions.csv'), - OpenImagesPath.V5_CLASS_DESCRIPTION_NAME, + OpenImagesPath.V5_CLASS_DESCRIPTION_FILE_NAME, ][0] with self._open_csv_annotation(annotation_name) as class_description_reader: # Prior to OID v6, this file didn't contain a header row. 
- if annotation_name == OpenImagesPath.V5_CLASS_DESCRIPTION_NAME: + if annotation_name == OpenImagesPath.V5_CLASS_DESCRIPTION_FILE_NAME: class_description_reader.fieldnames = ('LabelName', 'DisplayName') for class_description in class_description_reader: @@ -121,7 +121,7 @@ def _load_label_category_parents(self): label_categories = self._categories[AnnotationType.label] hierarchy_path = osp.join( - self._dataset_dir, OpenImagesPath.ANNOTATIONS_DIR, OpenImagesPath.HIERARCHY_NAME) + self._dataset_dir, OpenImagesPath.ANNOTATIONS_DIR, OpenImagesPath.HIERARCHY_FILE_NAME) try: with open(hierarchy_path, 'rb') as hierarchy_file: @@ -180,9 +180,9 @@ def load_from(annotation_name): # However, if it's missing, we'll try loading subset-specific files instead, so that # this extractor can be used on individual subsets of the dataset. try: - load_from(OpenImagesPath.FULL_IMAGE_DESCRIPTION_NAME) + load_from(OpenImagesPath.FULL_IMAGE_DESCRIPTION_FILE_NAME) except FileNotFoundError: - for pattern in OpenImagesPath.SUBSET_IMAGE_DESCRIPTION_PATTERNS: + for pattern in OpenImagesPath.SUBSET_IMAGE_DESCRIPTION_FILE_PATTERNS: for path in self._glob_annotations(pattern): load_from(path) @@ -217,8 +217,8 @@ class OpenImagesImporter(Importer): @classmethod def find_sources(cls, path): for pattern in [ - OpenImagesPath.FULL_IMAGE_DESCRIPTION_NAME, - *OpenImagesPath.SUBSET_IMAGE_DESCRIPTION_PATTERNS, + OpenImagesPath.FULL_IMAGE_DESCRIPTION_FILE_NAME, + *OpenImagesPath.SUBSET_IMAGE_DESCRIPTION_FILE_PATTERNS, ]: if glob.glob(osp.join(glob.escape(path), OpenImagesPath.ANNOTATIONS_DIR, pattern)): return [{'url': path, 'format': 'open_images'}] @@ -246,7 +246,7 @@ def apply(self): def _save_categories(self): with self._open_csv_annotation( - OpenImagesPath.V5_CLASS_DESCRIPTION_NAME, ['LabelName', 'DisplayName'], + OpenImagesPath.V5_CLASS_DESCRIPTION_FILE_NAME, ['LabelName', 'DisplayName'], ) as class_description_writer: # no .writeheader() here, since we're saving it in the V5 format @@ -295,7 
+295,7 @@ def get_node(name): } hierarchy_path = osp.join( - self._save_dir, OpenImagesPath.ANNOTATIONS_DIR, OpenImagesPath.HIERARCHY_NAME) + self._save_dir, OpenImagesPath.ANNOTATIONS_DIR, OpenImagesPath.HIERARCHY_FILE_NAME) with open(hierarchy_path, 'w', encoding='UTF-8') as hierarchy_file: json.dump(root_node, hierarchy_file, indent=4) From e8ba7dab7d67f2360267eafa9fe6c713cfd1cdf2 Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Tue, 29 Jun 2021 18:43:32 +0300 Subject: [PATCH 09/17] open_images_format: fix style errors --- datumaro/plugins/open_images_format.py | 6 +++--- tests/test_open_images_format.py | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/datumaro/plugins/open_images_format.py b/datumaro/plugins/open_images_format.py index f1f6003aac..55a8873ae3 100644 --- a/datumaro/plugins/open_images_format.py +++ b/datumaro/plugins/open_images_format.py @@ -151,7 +151,7 @@ def _load_items(self): for path_parts in [split_path( osp.splitext(osp.relpath(path, images_dir))[0], )] - if len(path_parts) > 1 + if 1 < len(path_parts) } items_by_id = {} @@ -297,8 +297,8 @@ def get_node(name): hierarchy_path = osp.join( self._save_dir, OpenImagesPath.ANNOTATIONS_DIR, OpenImagesPath.HIERARCHY_FILE_NAME) - with open(hierarchy_path, 'w', encoding='UTF-8') as hierarchy_file: - json.dump(root_node, hierarchy_file, indent=4) + with open(hierarchy_path, 'w', encoding='utf-8') as hierarchy_file: + json.dump(root_node, hierarchy_file, indent=4, ensure_ascii=False) hierarchy_file.write('\n') def _save_subsets(self): diff --git a/tests/test_open_images_format.py b/tests/test_open_images_format.py index 8fc4df874e..2cf6d3bb29 100644 --- a/tests/test_open_images_format.py +++ b/tests/test_open_images_format.py @@ -8,11 +8,12 @@ import numpy as np +from datumaro.components.dataset import Dataset +from datumaro.components.extractor import AnnotationType, DatasetItem, Label, LabelCategories from datumaro.plugins.open_images_format import OpenImagesConverter, 
OpenImagesImporter from datumaro.util.image import Image from datumaro.util.test_utils import TestDir, compare_datasets_strict -from datumaro.components.extractor import AnnotationType, DatasetItem, Label, LabelCategories -from datumaro.components.dataset import Dataset + from tests.requirements import Requirements, mark_requirement class OpenImagesFormatTest(TestCase): From 5a97e59a01281bcfc7918d7df27c84c0ebd921f0 Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Tue, 29 Jun 2021 18:45:02 +0300 Subject: [PATCH 10/17] open_images_format: handle the case where the exported dataset has no categories Well, handle it as well as we can - if there is at least one annotation, then the exporting process will crash. --- datumaro/plugins/open_images_format.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datumaro/plugins/open_images_format.py b/datumaro/plugins/open_images_format.py index 55a8873ae3..a2a8fa61f3 100644 --- a/datumaro/plugins/open_images_format.py +++ b/datumaro/plugins/open_images_format.py @@ -302,8 +302,8 @@ def get_node(name): hierarchy_file.write('\n') def _save_subsets(self): - # TODO: what if there are no categories? 
- label_categories = self._extractor.categories()[AnnotationType.label] + label_categories = self._extractor.categories().get( + AnnotationType.label, LabelCategories()) for subset_name, subset in self._extractor.subsets().items(): if _RE_INVALID_SUBSET.fullmatch(subset_name): From f97c48f54f2834083c33fa08f51624c0f887e8b9 Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Tue, 29 Jun 2021 18:52:42 +0300 Subject: [PATCH 11/17] open_images_format: make code more idiomatic --- datumaro/plugins/open_images_format.py | 2 +- docs/formats/open_images_user_manual.md | 6 +----- tests/test_open_images_format.py | 15 +++++---------- 3 files changed, 7 insertions(+), 16 deletions(-) diff --git a/datumaro/plugins/open_images_format.py b/datumaro/plugins/open_images_format.py index a2a8fa61f3..d2f32570f4 100644 --- a/datumaro/plugins/open_images_format.py +++ b/datumaro/plugins/open_images_format.py @@ -352,7 +352,7 @@ def _save_subsets(self): OpenImagesPath.IMAGES_DIR, subset_name)) for annotation in item.annotations: - if isinstance(annotation, Label): + if annotation.type is AnnotationType.label: label_description_writer.writerow({ 'ImageID': item.id, 'LabelName': label_categories[annotation.label].name, diff --git a/docs/formats/open_images_user_manual.md b/docs/formats/open_images_user_manual.md index e7b3bb469e..a8ea766621 100644 --- a/docs/formats/open_images_user_manual.md +++ b/docs/formats/open_images_user_manual.md @@ -173,11 +173,7 @@ dataset = Dataset.from_iterable( ], ), ], - categories={ - AnnotationType.label: LabelCategories.from_iterable([ - '/m/0', '/m/1', - ]), - }, + categories=['/m/0', '/m/1'], ) dataset.export('./dataset', format='open_images') ``` diff --git a/tests/test_open_images_format.py b/tests/test_open_images_format.py index 2cf6d3bb29..4ad9df2c8f 100644 --- a/tests/test_open_images_format.py +++ b/tests/test_open_images_format.py @@ -50,16 +50,11 @@ def test_can_save_and_load(self): @mark_requirement(Requirements.DATUM_GENERAL_REQ) def 
test_can_save_and_load_with_no_subsets(self): - source_dataset = Dataset.from_iterable( - [ - DatasetItem(id='a', - annotations=[Label(0, attributes={'score': 0.7})] - ), - ], - categories={ - AnnotationType.label: LabelCategories.from_iterable(['/m/0']), - }, - ) + source_dataset = Dataset.from_iterable([ + DatasetItem(id='a', + annotations=[Label(0, attributes={'score': 0.7})] + ), + ], categories=['/m/0']) with TestDir() as test_dir: OpenImagesConverter.convert(source_dataset, test_dir, From fff79366ebbfc041be42af3a30bbb1a300031aed Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Tue, 29 Jun 2021 18:53:14 +0300 Subject: [PATCH 12/17] test_open_images_format: mark tests with the correct requirement --- tests/requirements.py | 1 + tests/test_open_images_format.py | 12 ++++++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/requirements.py b/tests/requirements.py index 31711fa9e4..97e56e5577 100644 --- a/tests/requirements.py +++ b/tests/requirements.py @@ -21,6 +21,7 @@ class Requirements: DATUM_231 = "Readable formats for CJK" DATUM_244 = "Add Snyk integration" DATUM_267 = "Add Image zip format" + DATUM_274 = "Support the Open Images dataset" DATUM_280 = "Support KITTI dataset formats" DATUM_283 = "Create cli tests for testing convert command for VOC format" diff --git a/tests/test_open_images_format.py b/tests/test_open_images_format.py index 4ad9df2c8f..e32e3c3e31 100644 --- a/tests/test_open_images_format.py +++ b/tests/test_open_images_format.py @@ -17,7 +17,7 @@ from tests.requirements import Requirements, mark_requirement class OpenImagesFormatTest(TestCase): - @mark_requirement(Requirements.DATUM_GENERAL_REQ) + @mark_requirement(Requirements.DATUM_274) def test_can_save_and_load(self): source_dataset = Dataset.from_iterable( [ @@ -48,7 +48,7 @@ def test_can_save_and_load(self): compare_datasets_strict(self, source_dataset, parsed_dataset) - @mark_requirement(Requirements.DATUM_GENERAL_REQ) + 
@mark_requirement(Requirements.DATUM_274) def test_can_save_and_load_with_no_subsets(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='a', @@ -64,7 +64,7 @@ def test_can_save_and_load_with_no_subsets(self): compare_datasets_strict(self, source_dataset, parsed_dataset) - @mark_requirement(Requirements.DATUM_GENERAL_REQ) + @mark_requirement(Requirements.DATUM_274) def test_can_save_and_load_image_with_arbitrary_extension(self): dataset = Dataset.from_iterable([ DatasetItem(id='a/1', image=Image(path='a/1.JPEG', @@ -86,7 +86,7 @@ def test_can_save_and_load_image_with_arbitrary_extension(self): DUMMY_DATASET_DIR_V5 = osp.join(ASSETS_DIR, 'open_images_dataset_v5') class OpenImagesImporterTest(TestCase): - @mark_requirement(Requirements.DATUM_GENERAL_REQ) + @mark_requirement(Requirements.DATUM_274) def test_can_import_v6(self): expected_dataset = Dataset.from_iterable( [ @@ -120,7 +120,7 @@ def test_can_import_v6(self): compare_datasets_strict(self, expected_dataset, dataset) - @mark_requirement(Requirements.DATUM_GENERAL_REQ) + @mark_requirement(Requirements.DATUM_274) def test_can_import_v5(self): expected_dataset = Dataset.from_iterable( [ @@ -139,7 +139,7 @@ def test_can_import_v5(self): compare_datasets_strict(self, expected_dataset, dataset) - @mark_requirement(Requirements.DATUM_GENERAL_REQ) + @mark_requirement(Requirements.DATUM_274) def test_can_detect(self): self.assertTrue(OpenImagesImporter.detect(DUMMY_DATASET_DIR_V6)) self.assertTrue(OpenImagesImporter.detect(DUMMY_DATASET_DIR_V5)) From 8e7cf547598377e6a20b302d899849622eab01ed Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Wed, 30 Jun 2021 13:21:40 +0300 Subject: [PATCH 13/17] test_open_images_format: use compare_datasets instead of compare_datasets_strict Per Maxim Zhiltsov, Datumaro doesn't guarantee that the order of items will be preserved, so we don't need to check it. 
--- tests/test_open_images_format.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_open_images_format.py b/tests/test_open_images_format.py index e32e3c3e31..fbf769ec99 100644 --- a/tests/test_open_images_format.py +++ b/tests/test_open_images_format.py @@ -12,7 +12,7 @@ from datumaro.components.extractor import AnnotationType, DatasetItem, Label, LabelCategories from datumaro.plugins.open_images_format import OpenImagesConverter, OpenImagesImporter from datumaro.util.image import Image -from datumaro.util.test_utils import TestDir, compare_datasets_strict +from datumaro.util.test_utils import TestDir, compare_datasets from tests.requirements import Requirements, mark_requirement @@ -46,7 +46,7 @@ def test_can_save_and_load(self): # the converter assumes that labels without a score have a score of 100% source_dataset.get('b', subset='train').annotations[0].attributes['score'] = 1 - compare_datasets_strict(self, source_dataset, parsed_dataset) + compare_datasets(self, source_dataset, parsed_dataset, require_images=True) @mark_requirement(Requirements.DATUM_274) def test_can_save_and_load_with_no_subsets(self): @@ -62,7 +62,7 @@ def test_can_save_and_load_with_no_subsets(self): parsed_dataset = Dataset.import_from(test_dir, 'open_images') - compare_datasets_strict(self, source_dataset, parsed_dataset) + compare_datasets(self, source_dataset, parsed_dataset) @mark_requirement(Requirements.DATUM_274) def test_can_save_and_load_image_with_arbitrary_extension(self): @@ -78,7 +78,7 @@ def test_can_save_and_load_image_with_arbitrary_extension(self): parsed_dataset = Dataset.import_from(test_dir, 'open_images') - compare_datasets_strict(self, dataset, parsed_dataset) + compare_datasets(self, dataset, parsed_dataset, require_images=True) ASSETS_DIR = osp.join(osp.dirname(__file__), 'assets') @@ -118,7 +118,7 @@ def test_can_import_v6(self): dataset = Dataset.import_from(DUMMY_DATASET_DIR_V6, 'open_images') - 
compare_datasets_strict(self, expected_dataset, dataset) + compare_datasets(self, expected_dataset, dataset, require_images=True) @mark_requirement(Requirements.DATUM_274) def test_can_import_v5(self): @@ -137,7 +137,7 @@ def test_can_import_v5(self): dataset = Dataset.import_from(DUMMY_DATASET_DIR_V5, 'open_images') - compare_datasets_strict(self, expected_dataset, dataset) + compare_datasets(self, expected_dataset, dataset, require_images=True) @mark_requirement(Requirements.DATUM_274) def test_can_detect(self): From d350c182b78a82555c7db5f3d4b3b1b9cfdb6825 Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Wed, 30 Jun 2021 13:53:05 +0300 Subject: [PATCH 14/17] OpenImagesFormatTest.test_can_save_and_load: avoid modifying the source dataset --- tests/test_open_images_format.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/tests/test_open_images_format.py b/tests/test_open_images_format.py index fbf769ec99..9ba5b9a9d8 100644 --- a/tests/test_open_images_format.py +++ b/tests/test_open_images_format.py @@ -37,16 +37,25 @@ def test_can_save_and_load(self): }, ) + expected_dataset = Dataset.from_extractors(source_dataset) + expected_dataset.put( + DatasetItem(id='b', subset='train', image=np.zeros((8, 8, 3)), + annotations=[ + # the converter assumes that labels without a score + # have a score of 100% + Label(1, attributes={'score': 1}), + Label(2, attributes={'score': 0}), + ] + ), + ) + with TestDir() as test_dir: OpenImagesConverter.convert(source_dataset, test_dir, save_images=True) parsed_dataset = Dataset.import_from(test_dir, 'open_images') - # the converter assumes that labels without a score have a score of 100% - source_dataset.get('b', subset='train').annotations[0].attributes['score'] = 1 - - compare_datasets(self, source_dataset, parsed_dataset, require_images=True) + compare_datasets(self, expected_dataset, parsed_dataset, require_images=True) @mark_requirement(Requirements.DATUM_274) def 
test_can_save_and_load_with_no_subsets(self): From fa29fbc353ffeea3f5636be9b093f20401deb568 Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Thu, 1 Jul 2021 19:20:29 +0300 Subject: [PATCH 15/17] open_images_format: move the long field lists to OpenImagesPath --- datumaro/plugins/open_images_format.py | 50 ++++++++++++++------------ 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/datumaro/plugins/open_images_format.py b/datumaro/plugins/open_images_format.py index d2f32570f4..2e51515244 100644 --- a/datumaro/plugins/open_images_format.py +++ b/datumaro/plugins/open_images_format.py @@ -53,6 +53,29 @@ class OpenImagesPath: V5_CLASS_DESCRIPTION_FILE_NAME = 'class-descriptions.csv' HIERARCHY_FILE_NAME = 'bbox_labels_600_hierarchy.json' + IMAGE_DESCRIPTION_FIELDS = ( + 'ImageID', + 'Subset', + 'OriginalURL', + 'OriginalLandingURL', + 'License', + 'AuthorProfileURL', + 'Author', + 'Title', + 'OriginalSize', + 'OriginalMD5', + 'Thumbnail300KURL', + 'Rotation', + ) + + LABEL_DESCRIPTION_FIELDS = ( + 'ImageID', + 'Source', + 'LabelName', + 'Confidence', + ) + + class OpenImagesExtractor(Extractor): def __init__(self, path): if not osp.isdir(path): @@ -310,34 +333,15 @@ def _save_subsets(self): raise UnsupportedSubsetNameError(item_id=next(iter(subset)).id, subset=subset) image_description_name = f'{subset_name}-images-with-rotation.csv' - image_description_fields = [ - 'ImageID', - 'Subset', - 'OriginalURL', - 'OriginalLandingURL', - 'License', - 'AuthorProfileURL', - 'Author', - 'Title', - 'OriginalSize', - 'OriginalMD5', - 'Thumbnail300KURL', - 'Rotation', - ] - label_description_name = f'{subset_name}-annotations-human-imagelabels.csv' - label_description_fields = [ - 'ImageID', - 'Source', - 'LabelName', - 'Confidence', - ] with \ self._open_csv_annotation( - image_description_name, image_description_fields) as image_description_writer, \ + image_description_name, OpenImagesPath.IMAGE_DESCRIPTION_FIELDS, + ) as image_description_writer, \ 
self._open_csv_annotation( - label_description_name, label_description_fields) as label_description_writer \ + label_description_name, OpenImagesPath.LABEL_DESCRIPTION_FIELDS, + ) as label_description_writer \ : image_description_writer.writeheader() label_description_writer.writeheader() From 47f7d491b0ed5f5d22bdc2bc89b1fdb4bc76a8d5 Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Fri, 2 Jul 2021 19:24:03 +0300 Subject: [PATCH 16/17] open_images_format: add logging in the case where an item being saved has no image --- datumaro/plugins/open_images_format.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/datumaro/plugins/open_images_format.py b/datumaro/plugins/open_images_format.py index 2e51515244..6dd034d415 100644 --- a/datumaro/plugins/open_images_format.py +++ b/datumaro/plugins/open_images_format.py @@ -8,6 +8,7 @@ import glob import itertools import json +import logging as log import os import os.path as osp import re @@ -351,9 +352,12 @@ def _save_subsets(self): 'ImageID': item.id, 'Subset': subset_name, }) - if self._save_images and item.has_image: - self._save_image(item, subdir=osp.join( - OpenImagesPath.IMAGES_DIR, subset_name)) + if self._save_images: + if item.has_image: + self._save_image(item, subdir=osp.join( + OpenImagesPath.IMAGES_DIR, subset_name)) + else: + log.debug("Item '%s' has no image", item.id) for annotation in item.annotations: if annotation.type is AnnotationType.label: From 5a98b1c690da66333572c8acd04fd71721009309 Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Fri, 2 Jul 2021 19:55:45 +0300 Subject: [PATCH 17/17] OpenImagesConverter: create the label description file lazily --- datumaro/plugins/open_images_format.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/datumaro/plugins/open_images_format.py b/datumaro/plugins/open_images_format.py index 6dd034d415..d6b83f33c5 100644 --- a/datumaro/plugins/open_images_format.py +++ 
b/datumaro/plugins/open_images_format.py @@ -340,12 +340,13 @@ def _save_subsets(self): self._open_csv_annotation( image_description_name, OpenImagesPath.IMAGE_DESCRIPTION_FIELDS, ) as image_description_writer, \ - self._open_csv_annotation( - label_description_name, OpenImagesPath.LABEL_DESCRIPTION_FIELDS, - ) as label_description_writer \ + contextlib.ExitStack() as annotation_writers \ : image_description_writer.writeheader() - label_description_writer.writeheader() + + # The label description writer is created lazily, + # so that we don't create the label description file if there are no labels. + label_description_writer = None for item in subset: image_description_writer.writerow({ @@ -361,6 +362,13 @@ def _save_subsets(self): for annotation in item.annotations: if annotation.type is AnnotationType.label: + if label_description_writer is None: + label_description_writer = annotation_writers.enter_context( + self._open_csv_annotation( + label_description_name, + OpenImagesPath.LABEL_DESCRIPTION_FIELDS)) + label_description_writer.writeheader() + label_description_writer.writerow({ 'ImageID': item.id, 'LabelName': label_categories[annotation.label].name,