From d85d38526222b42f7fd277cc65f9b21784aef062 Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Fri, 2 Jul 2021 17:55:01 +0300 Subject: [PATCH 01/17] open_images_user_manual.md: fix image description file URLs I accidentally swapped the URLs for test and validation sets. --- docs/formats/open_images_user_manual.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/formats/open_images_user_manual.md b/docs/formats/open_images_user_manual.md index e86d3598a5..24306ac646 100644 --- a/docs/formats/open_images_user_manual.md +++ b/docs/formats/open_images_user_manual.md @@ -29,8 +29,8 @@ which can be downloaded from the following URLs: - [complete set](https://storage.googleapis.com/openimages/2018_04/image_ids_and_rotation.csv) - [train set](https://storage.googleapis.com/openimages/v6/oidv6-train-images-with-labels-with-rotation.csv) -- [validation set](https://storage.googleapis.com/openimages/2018_04/test/test-images-with-rotation.csv) -- [test set](https://storage.googleapis.com/openimages/2018_04/validation/validation-images-with-rotation.csv) +- [validation set](https://storage.googleapis.com/openimages/2018_04/validation/validation-images-with-rotation.csv) +- [test set](https://storage.googleapis.com/openimages/2018_04/test/test-images-with-rotation.csv) Datumaro expects at least one of the files above to be present. 
From db35a038b99df768f09dff1cb7d2ab0d632605f7 Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Fri, 25 Jun 2021 13:43:35 +0300 Subject: [PATCH 02/17] open_images_format: add conversion support --- datumaro/plugins/open_images_format.py | 145 ++++++++++++++++++++++++- 1 file changed, 140 insertions(+), 5 deletions(-) diff --git a/datumaro/plugins/open_images_format.py b/datumaro/plugins/open_images_format.py index 2f823eef89..ebbe14a14b 100644 --- a/datumaro/plugins/open_images_format.py +++ b/datumaro/plugins/open_images_format.py @@ -6,6 +6,7 @@ import csv import fnmatch import glob +import itertools import json import os import os.path as osp @@ -13,6 +14,7 @@ from attr import attrs +from datumaro.components.converter import Converter from datumaro.components.errors import DatasetError, RepeatedItemError, UndefinedLabel from datumaro.components.extractor import ( AnnotationType, DatasetItem, Importer, Label, LabelCategories, Extractor, @@ -45,6 +47,8 @@ class OpenImagesPath: '*-images-with-rotation.csv', '*-images-with-labels-with-rotation.csv', ) + V5_CLASS_DESCRIPTION_NAME = 'class-descriptions.csv' + HIERARCHY_NAME = 'bbox_labels_600_hierarchy.json' class OpenImagesExtractor(Extractor): def __init__(self, path): @@ -92,16 +96,14 @@ def _load_categories(self): # If the file doesn't exist with either name, we'll fail trying to open # `class-descriptions.csv`. - V5_CLASS_DESCRIPTIONS = 'class-descriptions.csv' - annotation_name = [ *self._glob_annotations('oidv*-class-descriptions.csv'), - V5_CLASS_DESCRIPTIONS, + OpenImagesPath.V5_CLASS_DESCRIPTION_NAME, ][0] with self._open_csv_annotation(annotation_name) as class_description_reader: # Prior to OID v6, this file didn't contain a header row. 
- if annotation_name == V5_CLASS_DESCRIPTIONS: + if annotation_name == OpenImagesPath.V5_CLASS_DESCRIPTION_NAME: class_description_reader.fieldnames = ('LabelName', 'DisplayName') for class_description in class_description_reader: @@ -116,7 +118,7 @@ def _load_label_category_parents(self): label_categories = self._categories[AnnotationType.label] hierarchy_path = osp.join( - self._dataset_dir, OpenImagesPath.ANNOTATIONS_DIR, 'bbox_labels_600_hierarchy.json') + self._dataset_dir, OpenImagesPath.ANNOTATIONS_DIR, OpenImagesPath.HIERARCHY_NAME) try: with open(hierarchy_path, 'rb') as hierarchy_file: @@ -214,3 +216,136 @@ def find_sources(cls, path): return [{'url': path, 'format': 'open_images'}] return [] + +class OpenImagesConverter(Converter): + DEFAULT_IMAGE_EXT = '.jpg' + + @contextlib.contextmanager + def _open_csv_annotation(self, file_name, field_names): + absolute_path = osp.join(self._save_dir, OpenImagesPath.ANNOTATIONS_DIR, file_name) + + with open(absolute_path, 'w', encoding='utf-8', newline='') as f: + yield csv.DictWriter(f, field_names) + + def apply(self): + annotations_dir = osp.join(self._save_dir, OpenImagesPath.ANNOTATIONS_DIR) + + os.makedirs(annotations_dir, exist_ok=True) + + self._save_categories() + self._save_label_category_parents() + self._save_subsets() + + def _save_categories(self): + with self._open_csv_annotation( + OpenImagesPath.V5_CLASS_DESCRIPTION_NAME, ['LabelName', 'DisplayName'], + ) as class_description_writer: + # no .writeheader() here, since we're saving it in the V5 format + + for category in self._extractor.categories()[AnnotationType.label]: + class_description_writer.writerow({ + 'LabelName': category.name, + 'DisplayName': category.name, + }) + + def _save_label_category_parents(self): + all_label_names = set() + hierarchy_nodes = {} + orphan_nodes = [] + + def get_node(name): + return hierarchy_nodes.setdefault(name, {'LabelName': name}) + + for category in self._extractor.categories()[AnnotationType.label]: + 
all_label_names.add(category.name) + + child_node = get_node(category.name) + + if category.parent: + parent_node = get_node(category.parent) + parent_node.setdefault('Subcategory', []).append(child_node) + else: + orphan_nodes.append(child_node) + + # The hierarchy has to be rooted in a single node. However, there's + # no guarantee that there exists only one orphan (label without a parent). + # Therefore, we create a fake root node and make it the parent of every + # orphan label. + # This is not a violation of the format, because the original OID does + # the same thing. + root_node = { + # Create an OID-like label name that isn't already used by a real label + 'LabelName': next(root_name + for i in itertools.count() + for root_name in [f'/m/{i}'] + if root_name not in all_label_names + ), + # If an orphan has no children, then it makes no semantic difference + # whether it's listed in the hierarchy file or not. So strip such nodes + # to avoid recording meaningless data. + 'Subcategory': [node for node in orphan_nodes if 'Subcategory' in node], + } + + hierarchy_path = osp.join( + self._save_dir, OpenImagesPath.ANNOTATIONS_DIR, OpenImagesPath.HIERARCHY_NAME) + + with open(hierarchy_path, 'w', encoding='UTF-8') as hierarchy_file: + json.dump(root_node, hierarchy_file, indent=4) + hierarchy_file.write('\n') + + def _save_subsets(self): + # TODO: what if there are no categories? 
+ label_categories = self._extractor.categories()[AnnotationType.label] + + for subset_name, subset in self._extractor.subsets().items(): + if _RE_INVALID_SUBSET.fullmatch(subset_name): + raise UnsupportedSubsetNameError(item_id=next(iter(subset)).id, subset=subset) + + image_description_name = f'{subset_name}-images-with-rotation.csv' + image_description_fields = [ + 'ImageID', + 'Subset', + 'OriginalURL', + 'OriginalLandingURL', + 'License', + 'AuthorProfileURL', + 'Author', + 'Title', + 'OriginalSize', + 'OriginalMD5', + 'Thumbnail300KURL', + 'Rotation', + ] + + label_description_name = f'{subset_name}-annotations-human-imagelabels.csv' + label_description_fields = [ + 'ImageID', + 'Source', + 'LabelName', + 'Confidence', + ] + + with \ + self._open_csv_annotation( + image_description_name, image_description_fields) as image_description_writer, \ + self._open_csv_annotation( + label_description_name, label_description_fields) as label_description_writer \ + : + image_description_writer.writeheader() + label_description_writer.writeheader() + + for item in subset: + image_description_writer.writerow({ + 'ImageID': item.id, 'Subset': subset_name, + }) + + if self._save_images and item.has_image: + self._save_image(item, subdir=osp.join('images', subset_name)) + + for annotation in item.annotations: + if isinstance(annotation, Label): + label_description_writer.writerow({ + 'ImageID': item.id, + 'LabelName': label_categories[annotation.label].name, + 'Confidence': str(annotation.attributes.get('score', 1)), + }) From 3a711fa8cda9307b6b8d1742e797ccd1d5b892f6 Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Mon, 28 Jun 2021 17:59:00 +0300 Subject: [PATCH 03/17] open_images_format: add support for images in subdirectories --- datumaro/plugins/open_images_format.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/datumaro/plugins/open_images_format.py b/datumaro/plugins/open_images_format.py index ebbe14a14b..97cac876c1 100644 --- 
a/datumaro/plugins/open_images_format.py +++ b/datumaro/plugins/open_images_format.py @@ -21,6 +21,7 @@ ) from datumaro.components.validator import Severity from datumaro.util.image import find_images +from datumaro.util.os_util import split_path # A regex to check whether a subset name can be used as a "normal" path # component. @@ -139,11 +140,16 @@ def set_parents_from_node(node, category): set_parents_from_node(root_node, root_category) def _load_items(self): + images_dir = osp.join(self._dataset_dir, 'images') + image_paths_by_id = { - osp.splitext(osp.basename(path))[0]: path - for path in find_images( - osp.join(self._dataset_dir, 'images'), - recursive=True, max_depth=1) + # the first component of `path_parts` is the subset name + '/'.join(path_parts[1:]): path + for path in find_images(images_dir, recursive=True) + for path_parts in [split_path( + osp.splitext(osp.relpath(path, images_dir))[0], + )] + if len(path_parts) > 1 } items_by_id = {} From 44bcc8a2d28435bca6982f2ebda2d6665a07fcea Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Mon, 28 Jun 2021 17:59:31 +0300 Subject: [PATCH 04/17] open_images_format: add tests for writing support --- tests/test_open_images_format.py | 74 +++++++++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/tests/test_open_images_format.py b/tests/test_open_images_format.py index a7492e0cb7..8fc4df874e 100644 --- a/tests/test_open_images_format.py +++ b/tests/test_open_images_format.py @@ -8,12 +8,82 @@ import numpy as np -from datumaro.plugins.open_images_format import OpenImagesImporter -from datumaro.util.test_utils import compare_datasets_strict +from datumaro.plugins.open_images_format import OpenImagesConverter, OpenImagesImporter +from datumaro.util.image import Image +from datumaro.util.test_utils import TestDir, compare_datasets_strict from datumaro.components.extractor import AnnotationType, DatasetItem, Label, LabelCategories from datumaro.components.dataset import Dataset 
from tests.requirements import Requirements, mark_requirement +class OpenImagesFormatTest(TestCase): + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_and_load(self): + source_dataset = Dataset.from_iterable( + [ + DatasetItem(id='a', subset='train', + annotations=[Label(0, attributes={'score': 0.7})] + ), + DatasetItem(id='b', subset='train', image=np.zeros((8, 8, 3)), + annotations=[Label(1), Label(2, attributes={'score': 0})] + ), + ], + categories={ + AnnotationType.label: LabelCategories.from_iterable([ + '/m/0', + ('/m/1', '/m/0'), + '/m/2', + ]), + }, + ) + + with TestDir() as test_dir: + OpenImagesConverter.convert(source_dataset, test_dir, + save_images=True) + + parsed_dataset = Dataset.import_from(test_dir, 'open_images') + + # the converter assumes that labels without a score have a score of 100% + source_dataset.get('b', subset='train').annotations[0].attributes['score'] = 1 + + compare_datasets_strict(self, source_dataset, parsed_dataset) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_and_load_with_no_subsets(self): + source_dataset = Dataset.from_iterable( + [ + DatasetItem(id='a', + annotations=[Label(0, attributes={'score': 0.7})] + ), + ], + categories={ + AnnotationType.label: LabelCategories.from_iterable(['/m/0']), + }, + ) + + with TestDir() as test_dir: + OpenImagesConverter.convert(source_dataset, test_dir, + save_images=True) + + parsed_dataset = Dataset.import_from(test_dir, 'open_images') + + compare_datasets_strict(self, source_dataset, parsed_dataset) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_can_save_and_load_image_with_arbitrary_extension(self): + dataset = Dataset.from_iterable([ + DatasetItem(id='a/1', image=Image(path='a/1.JPEG', + data=np.zeros((4, 3, 3)))), + DatasetItem(id='b/c/d/2', image=Image(path='b/c/d/2.bmp', + data=np.zeros((3, 4, 3)))), + ], categories=[]) + + with TestDir() as test_dir: + OpenImagesConverter.convert(dataset, test_dir, 
save_images=True) + + parsed_dataset = Dataset.import_from(test_dir, 'open_images') + + compare_datasets_strict(self, dataset, parsed_dataset) + ASSETS_DIR = osp.join(osp.dirname(__file__), 'assets') DUMMY_DATASET_DIR_V6 = osp.join(ASSETS_DIR, 'open_images_dataset_v6') From f544ce644c760d1f6d172f5dc9a09166441c7750 Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Mon, 28 Jun 2021 19:16:00 +0300 Subject: [PATCH 05/17] open_images_format: add documentation for the writing support --- docs/formats/open_images_user_manual.md | 58 +++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 4 deletions(-) diff --git a/docs/formats/open_images_user_manual.md b/docs/formats/open_images_user_manual.md index 24306ac646..e7b3bb469e 100644 --- a/docs/formats/open_images_user_manual.md +++ b/docs/formats/open_images_user_manual.md @@ -111,7 +111,27 @@ To get information about them, run ## Export to Open Images -Converting datasets to the Open Images format is currently not supported. +There are a few ways to convert an existing dataset to the Open Images format: + +``` bash +# export dataset into Open Images format from existing project +datum export -p <path/to/project> -f open_images -o <path/to/export/dir> \ + -- --save-images + +# convert a dataset in another format to the Open Images format +datum convert -if imagenet -i <path/to/dataset> \ + -f open_images -o <path/to/export/dir> \ + -- --save-images +``` + +Extra options for export to the Open Images format: + +- `--save-images` - save image files when exporting the dataset + (by default, `False`) + +- `--image-ext IMAGE_EXT` - save image files with the specified extension + when exporting the dataset (by default, uses the original extension + or `.jpg` if there isn't one) ## Particular use cases @@ -120,10 +140,10 @@ and for the Open Images format in particular. Follow [user manual](../user_manual.md) to get more information about these operations. 
-Here is an example of using Datumaro operations to solve -a particular problem with the Open Images dataset: +Here are a few examples of using Datumaro operations to solve +particular problems with the Open Images dataset: -### Example. How to load the Open Images dataset and convert to the format used by CVAT +### Example 1. How to load the Open Images dataset and convert to the format used by CVAT ```bash datum create -o project @@ -132,5 +152,35 @@ datum stats -p project datum export -p project -o dataset -f cvat --overwrite -- --save-images ``` +### Example 2. How to create a custom OID-like dataset + +```python +import numpy as np +from datumaro.components.dataset import Dataset +from datumaro.components.extractor import ( + AnnotationType, Label, LabelCategories, DatasetItem, +) + +dataset = Dataset.from_iterable( + [ + DatasetItem( + id='0000000000000001', + image=np.ones((1, 5, 3)), + subset='validation', + annotations=[ + Label(0, attributes={'score': 1}), + Label(1, attributes={'score': 0}), + ], + ), + ], + categories={ + AnnotationType.label: LabelCategories.from_iterable([ + '/m/0', '/m/1', + ]), + }, +) +dataset.export('./dataset', format='open_images') +``` + More examples of working with OID from code can be found in [tests](../../tests/test_open_images_format.py). 
From ef41c2674324161269ecdddb103a3e8bdf0cc629 Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Mon, 28 Jun 2021 19:19:17 +0300 Subject: [PATCH 06/17] open_images_format: factor out the 'images' constant --- datumaro/plugins/open_images_format.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/datumaro/plugins/open_images_format.py b/datumaro/plugins/open_images_format.py index 97cac876c1..de21f14e78 100644 --- a/datumaro/plugins/open_images_format.py +++ b/datumaro/plugins/open_images_format.py @@ -43,6 +43,8 @@ def __str__(self): class OpenImagesPath: ANNOTATIONS_DIR = 'annotations' + IMAGES_DIR = 'images' + FULL_IMAGE_DESCRIPTION_NAME = 'image_ids_and_rotation.csv' SUBSET_IMAGE_DESCRIPTION_PATTERNS = ( '*-images-with-rotation.csv', @@ -140,7 +142,7 @@ def set_parents_from_node(node, category): set_parents_from_node(root_node, root_category) def _load_items(self): - images_dir = osp.join(self._dataset_dir, 'images') + images_dir = osp.join(self._dataset_dir, OpenImagesPath.IMAGES_DIR) image_paths_by_id = { # the first component of `path_parts` is the subset name @@ -346,7 +348,8 @@ def _save_subsets(self): }) if self._save_images and item.has_image: - self._save_image(item, subdir=osp.join('images', subset_name)) + self._save_image(item, subdir=osp.join( + OpenImagesPath.IMAGES_DIR, subset_name)) for annotation in item.annotations: if isinstance(annotation, Label): From c04aaa7e65a90f720a213a17bb6be4ac83870fa1 Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Mon, 28 Jun 2021 19:19:35 +0300 Subject: [PATCH 07/17] Update the changelog entry for the Open Images support --- CHANGELOG.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9fb2769e37..0da593907c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,8 +16,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `keep-empty` export parameter in VOC format () - A base class for dataset 
validation plugins () - Partial support for the Open Images format; - only reading is supported, and only images and image-level labels can be read - (). + only images and image-level labels can be read/written + (, + ). ### Changed - Tensorflow AVX check is made optional in API and is disabled by default () From bf6f82e9a67458e67b049e1d49ccf59741f08460 Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Tue, 29 Jun 2021 18:41:12 +0300 Subject: [PATCH 08/17] open_images_format: rename some members of OpenImagesPath This makes it clearer that they refer to paths. --- datumaro/plugins/open_images_format.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/datumaro/plugins/open_images_format.py b/datumaro/plugins/open_images_format.py index de21f14e78..f1f6003aac 100644 --- a/datumaro/plugins/open_images_format.py +++ b/datumaro/plugins/open_images_format.py @@ -45,13 +45,13 @@ class OpenImagesPath: ANNOTATIONS_DIR = 'annotations' IMAGES_DIR = 'images' - FULL_IMAGE_DESCRIPTION_NAME = 'image_ids_and_rotation.csv' - SUBSET_IMAGE_DESCRIPTION_PATTERNS = ( + FULL_IMAGE_DESCRIPTION_FILE_NAME = 'image_ids_and_rotation.csv' + SUBSET_IMAGE_DESCRIPTION_FILE_PATTERNS = ( '*-images-with-rotation.csv', '*-images-with-labels-with-rotation.csv', ) - V5_CLASS_DESCRIPTION_NAME = 'class-descriptions.csv' - HIERARCHY_NAME = 'bbox_labels_600_hierarchy.json' + V5_CLASS_DESCRIPTION_FILE_NAME = 'class-descriptions.csv' + HIERARCHY_FILE_NAME = 'bbox_labels_600_hierarchy.json' class OpenImagesExtractor(Extractor): def __init__(self, path): @@ -101,12 +101,12 @@ def _load_categories(self): annotation_name = [ *self._glob_annotations('oidv*-class-descriptions.csv'), - OpenImagesPath.V5_CLASS_DESCRIPTION_NAME, + OpenImagesPath.V5_CLASS_DESCRIPTION_FILE_NAME, ][0] with self._open_csv_annotation(annotation_name) as class_description_reader: # Prior to OID v6, this file didn't contain a header row. 
- if annotation_name == OpenImagesPath.V5_CLASS_DESCRIPTION_NAME: + if annotation_name == OpenImagesPath.V5_CLASS_DESCRIPTION_FILE_NAME: class_description_reader.fieldnames = ('LabelName', 'DisplayName') for class_description in class_description_reader: @@ -121,7 +121,7 @@ def _load_label_category_parents(self): label_categories = self._categories[AnnotationType.label] hierarchy_path = osp.join( - self._dataset_dir, OpenImagesPath.ANNOTATIONS_DIR, OpenImagesPath.HIERARCHY_NAME) + self._dataset_dir, OpenImagesPath.ANNOTATIONS_DIR, OpenImagesPath.HIERARCHY_FILE_NAME) try: with open(hierarchy_path, 'rb') as hierarchy_file: @@ -180,9 +180,9 @@ def load_from(annotation_name): # However, if it's missing, we'll try loading subset-specific files instead, so that # this extractor can be used on individual subsets of the dataset. try: - load_from(OpenImagesPath.FULL_IMAGE_DESCRIPTION_NAME) + load_from(OpenImagesPath.FULL_IMAGE_DESCRIPTION_FILE_NAME) except FileNotFoundError: - for pattern in OpenImagesPath.SUBSET_IMAGE_DESCRIPTION_PATTERNS: + for pattern in OpenImagesPath.SUBSET_IMAGE_DESCRIPTION_FILE_PATTERNS: for path in self._glob_annotations(pattern): load_from(path) @@ -217,8 +217,8 @@ class OpenImagesImporter(Importer): @classmethod def find_sources(cls, path): for pattern in [ - OpenImagesPath.FULL_IMAGE_DESCRIPTION_NAME, - *OpenImagesPath.SUBSET_IMAGE_DESCRIPTION_PATTERNS, + OpenImagesPath.FULL_IMAGE_DESCRIPTION_FILE_NAME, + *OpenImagesPath.SUBSET_IMAGE_DESCRIPTION_FILE_PATTERNS, ]: if glob.glob(osp.join(glob.escape(path), OpenImagesPath.ANNOTATIONS_DIR, pattern)): return [{'url': path, 'format': 'open_images'}] @@ -246,7 +246,7 @@ def apply(self): def _save_categories(self): with self._open_csv_annotation( - OpenImagesPath.V5_CLASS_DESCRIPTION_NAME, ['LabelName', 'DisplayName'], + OpenImagesPath.V5_CLASS_DESCRIPTION_FILE_NAME, ['LabelName', 'DisplayName'], ) as class_description_writer: # no .writeheader() here, since we're saving it in the V5 format @@ -295,7 
+295,7 @@ def get_node(name): } hierarchy_path = osp.join( - self._save_dir, OpenImagesPath.ANNOTATIONS_DIR, OpenImagesPath.HIERARCHY_NAME) + self._save_dir, OpenImagesPath.ANNOTATIONS_DIR, OpenImagesPath.HIERARCHY_FILE_NAME) with open(hierarchy_path, 'w', encoding='UTF-8') as hierarchy_file: json.dump(root_node, hierarchy_file, indent=4) From e8ba7dab7d67f2360267eafa9fe6c713cfd1cdf2 Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Tue, 29 Jun 2021 18:43:32 +0300 Subject: [PATCH 09/17] open_images_format: fix style errors --- datumaro/plugins/open_images_format.py | 6 +++--- tests/test_open_images_format.py | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/datumaro/plugins/open_images_format.py b/datumaro/plugins/open_images_format.py index f1f6003aac..55a8873ae3 100644 --- a/datumaro/plugins/open_images_format.py +++ b/datumaro/plugins/open_images_format.py @@ -151,7 +151,7 @@ def _load_items(self): for path_parts in [split_path( osp.splitext(osp.relpath(path, images_dir))[0], )] - if len(path_parts) > 1 + if 1 < len(path_parts) } items_by_id = {} @@ -297,8 +297,8 @@ def get_node(name): hierarchy_path = osp.join( self._save_dir, OpenImagesPath.ANNOTATIONS_DIR, OpenImagesPath.HIERARCHY_FILE_NAME) - with open(hierarchy_path, 'w', encoding='UTF-8') as hierarchy_file: - json.dump(root_node, hierarchy_file, indent=4) + with open(hierarchy_path, 'w', encoding='utf-8') as hierarchy_file: + json.dump(root_node, hierarchy_file, indent=4, ensure_ascii=False) hierarchy_file.write('\n') def _save_subsets(self): diff --git a/tests/test_open_images_format.py b/tests/test_open_images_format.py index 8fc4df874e..2cf6d3bb29 100644 --- a/tests/test_open_images_format.py +++ b/tests/test_open_images_format.py @@ -8,11 +8,12 @@ import numpy as np +from datumaro.components.dataset import Dataset +from datumaro.components.extractor import AnnotationType, DatasetItem, Label, LabelCategories from datumaro.plugins.open_images_format import OpenImagesConverter, 
OpenImagesImporter from datumaro.util.image import Image from datumaro.util.test_utils import TestDir, compare_datasets_strict -from datumaro.components.extractor import AnnotationType, DatasetItem, Label, LabelCategories -from datumaro.components.dataset import Dataset + from tests.requirements import Requirements, mark_requirement class OpenImagesFormatTest(TestCase): From 5a97e59a01281bcfc7918d7df27c84c0ebd921f0 Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Tue, 29 Jun 2021 18:45:02 +0300 Subject: [PATCH 10/17] open_images_format: handle the case where the exported dataset has no categories Well, handle it as well as we can - if there is at least one annotation, then the exporting process will crash. --- datumaro/plugins/open_images_format.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datumaro/plugins/open_images_format.py b/datumaro/plugins/open_images_format.py index 55a8873ae3..a2a8fa61f3 100644 --- a/datumaro/plugins/open_images_format.py +++ b/datumaro/plugins/open_images_format.py @@ -302,8 +302,8 @@ def get_node(name): hierarchy_file.write('\n') def _save_subsets(self): - # TODO: what if there are no categories? 
- label_categories = self._extractor.categories()[AnnotationType.label] + label_categories = self._extractor.categories().get( + AnnotationType.label, LabelCategories()) for subset_name, subset in self._extractor.subsets().items(): if _RE_INVALID_SUBSET.fullmatch(subset_name): From f97c48f54f2834083c33fa08f51624c0f887e8b9 Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Tue, 29 Jun 2021 18:52:42 +0300 Subject: [PATCH 11/17] open_images_format: make code more idiomatic --- datumaro/plugins/open_images_format.py | 2 +- docs/formats/open_images_user_manual.md | 6 +----- tests/test_open_images_format.py | 15 +++++---------- 3 files changed, 7 insertions(+), 16 deletions(-) diff --git a/datumaro/plugins/open_images_format.py b/datumaro/plugins/open_images_format.py index a2a8fa61f3..d2f32570f4 100644 --- a/datumaro/plugins/open_images_format.py +++ b/datumaro/plugins/open_images_format.py @@ -352,7 +352,7 @@ def _save_subsets(self): OpenImagesPath.IMAGES_DIR, subset_name)) for annotation in item.annotations: - if isinstance(annotation, Label): + if annotation.type is AnnotationType.label: label_description_writer.writerow({ 'ImageID': item.id, 'LabelName': label_categories[annotation.label].name, diff --git a/docs/formats/open_images_user_manual.md b/docs/formats/open_images_user_manual.md index e7b3bb469e..a8ea766621 100644 --- a/docs/formats/open_images_user_manual.md +++ b/docs/formats/open_images_user_manual.md @@ -173,11 +173,7 @@ dataset = Dataset.from_iterable( ], ), ], - categories={ - AnnotationType.label: LabelCategories.from_iterable([ - '/m/0', '/m/1', - ]), - }, + categories=['/m/0', '/m/1'], ) dataset.export('./dataset', format='open_images') ``` diff --git a/tests/test_open_images_format.py b/tests/test_open_images_format.py index 2cf6d3bb29..4ad9df2c8f 100644 --- a/tests/test_open_images_format.py +++ b/tests/test_open_images_format.py @@ -50,16 +50,11 @@ def test_can_save_and_load(self): @mark_requirement(Requirements.DATUM_GENERAL_REQ) def 
test_can_save_and_load_with_no_subsets(self): - source_dataset = Dataset.from_iterable( - [ - DatasetItem(id='a', - annotations=[Label(0, attributes={'score': 0.7})] - ), - ], - categories={ - AnnotationType.label: LabelCategories.from_iterable(['/m/0']), - }, - ) + source_dataset = Dataset.from_iterable([ + DatasetItem(id='a', + annotations=[Label(0, attributes={'score': 0.7})] + ), + ], categories=['/m/0']) with TestDir() as test_dir: OpenImagesConverter.convert(source_dataset, test_dir, From fff79366ebbfc041be42af3a30bbb1a300031aed Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Tue, 29 Jun 2021 18:53:14 +0300 Subject: [PATCH 12/17] test_open_images_format: mark tests with the correct requirement --- tests/requirements.py | 1 + tests/test_open_images_format.py | 12 ++++++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/requirements.py b/tests/requirements.py index 31711fa9e4..97e56e5577 100644 --- a/tests/requirements.py +++ b/tests/requirements.py @@ -21,6 +21,7 @@ class Requirements: DATUM_231 = "Readable formats for CJK" DATUM_244 = "Add Snyk integration" DATUM_267 = "Add Image zip format" + DATUM_274 = "Support the Open Images dataset" DATUM_280 = "Support KITTI dataset formats" DATUM_283 = "Create cli tests for testing convert command for VOC format" diff --git a/tests/test_open_images_format.py b/tests/test_open_images_format.py index 4ad9df2c8f..e32e3c3e31 100644 --- a/tests/test_open_images_format.py +++ b/tests/test_open_images_format.py @@ -17,7 +17,7 @@ from tests.requirements import Requirements, mark_requirement class OpenImagesFormatTest(TestCase): - @mark_requirement(Requirements.DATUM_GENERAL_REQ) + @mark_requirement(Requirements.DATUM_274) def test_can_save_and_load(self): source_dataset = Dataset.from_iterable( [ @@ -48,7 +48,7 @@ def test_can_save_and_load(self): compare_datasets_strict(self, source_dataset, parsed_dataset) - @mark_requirement(Requirements.DATUM_GENERAL_REQ) + 
@mark_requirement(Requirements.DATUM_274) def test_can_save_and_load_with_no_subsets(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='a', @@ -64,7 +64,7 @@ def test_can_save_and_load_with_no_subsets(self): compare_datasets_strict(self, source_dataset, parsed_dataset) - @mark_requirement(Requirements.DATUM_GENERAL_REQ) + @mark_requirement(Requirements.DATUM_274) def test_can_save_and_load_image_with_arbitrary_extension(self): dataset = Dataset.from_iterable([ DatasetItem(id='a/1', image=Image(path='a/1.JPEG', @@ -86,7 +86,7 @@ def test_can_save_and_load_image_with_arbitrary_extension(self): DUMMY_DATASET_DIR_V5 = osp.join(ASSETS_DIR, 'open_images_dataset_v5') class OpenImagesImporterTest(TestCase): - @mark_requirement(Requirements.DATUM_GENERAL_REQ) + @mark_requirement(Requirements.DATUM_274) def test_can_import_v6(self): expected_dataset = Dataset.from_iterable( [ @@ -120,7 +120,7 @@ def test_can_import_v6(self): compare_datasets_strict(self, expected_dataset, dataset) - @mark_requirement(Requirements.DATUM_GENERAL_REQ) + @mark_requirement(Requirements.DATUM_274) def test_can_import_v5(self): expected_dataset = Dataset.from_iterable( [ @@ -139,7 +139,7 @@ def test_can_import_v5(self): compare_datasets_strict(self, expected_dataset, dataset) - @mark_requirement(Requirements.DATUM_GENERAL_REQ) + @mark_requirement(Requirements.DATUM_274) def test_can_detect(self): self.assertTrue(OpenImagesImporter.detect(DUMMY_DATASET_DIR_V6)) self.assertTrue(OpenImagesImporter.detect(DUMMY_DATASET_DIR_V5)) From 8e7cf547598377e6a20b302d899849622eab01ed Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Wed, 30 Jun 2021 13:21:40 +0300 Subject: [PATCH 13/17] test_open_images_format: use compare_datasets instead of compare_datasets_strict Per Maxim Zhiltsov, Datumaro doesn't guarantee that the order of items will be preserved, so we don't need to check it. 
--- tests/test_open_images_format.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_open_images_format.py b/tests/test_open_images_format.py index e32e3c3e31..fbf769ec99 100644 --- a/tests/test_open_images_format.py +++ b/tests/test_open_images_format.py @@ -12,7 +12,7 @@ from datumaro.components.extractor import AnnotationType, DatasetItem, Label, LabelCategories from datumaro.plugins.open_images_format import OpenImagesConverter, OpenImagesImporter from datumaro.util.image import Image -from datumaro.util.test_utils import TestDir, compare_datasets_strict +from datumaro.util.test_utils import TestDir, compare_datasets from tests.requirements import Requirements, mark_requirement @@ -46,7 +46,7 @@ def test_can_save_and_load(self): # the converter assumes that labels without a score have a score of 100% source_dataset.get('b', subset='train').annotations[0].attributes['score'] = 1 - compare_datasets_strict(self, source_dataset, parsed_dataset) + compare_datasets(self, source_dataset, parsed_dataset, require_images=True) @mark_requirement(Requirements.DATUM_274) def test_can_save_and_load_with_no_subsets(self): @@ -62,7 +62,7 @@ def test_can_save_and_load_with_no_subsets(self): parsed_dataset = Dataset.import_from(test_dir, 'open_images') - compare_datasets_strict(self, source_dataset, parsed_dataset) + compare_datasets(self, source_dataset, parsed_dataset) @mark_requirement(Requirements.DATUM_274) def test_can_save_and_load_image_with_arbitrary_extension(self): @@ -78,7 +78,7 @@ def test_can_save_and_load_image_with_arbitrary_extension(self): parsed_dataset = Dataset.import_from(test_dir, 'open_images') - compare_datasets_strict(self, dataset, parsed_dataset) + compare_datasets(self, dataset, parsed_dataset, require_images=True) ASSETS_DIR = osp.join(osp.dirname(__file__), 'assets') @@ -118,7 +118,7 @@ def test_can_import_v6(self): dataset = Dataset.import_from(DUMMY_DATASET_DIR_V6, 'open_images') - 
compare_datasets_strict(self, expected_dataset, dataset) + compare_datasets(self, expected_dataset, dataset, require_images=True) @mark_requirement(Requirements.DATUM_274) def test_can_import_v5(self): @@ -137,7 +137,7 @@ def test_can_import_v5(self): dataset = Dataset.import_from(DUMMY_DATASET_DIR_V5, 'open_images') - compare_datasets_strict(self, expected_dataset, dataset) + compare_datasets(self, expected_dataset, dataset, require_images=True) @mark_requirement(Requirements.DATUM_274) def test_can_detect(self): From d350c182b78a82555c7db5f3d4b3b1b9cfdb6825 Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Wed, 30 Jun 2021 13:53:05 +0300 Subject: [PATCH 14/17] OpenImagesFormatTest.test_can_save_and_load: avoid modifying the source dataset --- tests/test_open_images_format.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/tests/test_open_images_format.py b/tests/test_open_images_format.py index fbf769ec99..9ba5b9a9d8 100644 --- a/tests/test_open_images_format.py +++ b/tests/test_open_images_format.py @@ -37,16 +37,25 @@ def test_can_save_and_load(self): }, ) + expected_dataset = Dataset.from_extractors(source_dataset) + expected_dataset.put( + DatasetItem(id='b', subset='train', image=np.zeros((8, 8, 3)), + annotations=[ + # the converter assumes that labels without a score + # have a score of 100% + Label(1, attributes={'score': 1}), + Label(2, attributes={'score': 0}), + ] + ), + ) + with TestDir() as test_dir: OpenImagesConverter.convert(source_dataset, test_dir, save_images=True) parsed_dataset = Dataset.import_from(test_dir, 'open_images') - # the converter assumes that labels without a score have a score of 100% - source_dataset.get('b', subset='train').annotations[0].attributes['score'] = 1 - - compare_datasets(self, source_dataset, parsed_dataset, require_images=True) + compare_datasets(self, expected_dataset, parsed_dataset, require_images=True) @mark_requirement(Requirements.DATUM_274) def 
test_can_save_and_load_with_no_subsets(self): From fa29fbc353ffeea3f5636be9b093f20401deb568 Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Thu, 1 Jul 2021 19:20:29 +0300 Subject: [PATCH 15/17] open_images_format: move the long field lists to OpenImagesPath --- datumaro/plugins/open_images_format.py | 50 ++++++++++++++------------ 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/datumaro/plugins/open_images_format.py b/datumaro/plugins/open_images_format.py index d2f32570f4..2e51515244 100644 --- a/datumaro/plugins/open_images_format.py +++ b/datumaro/plugins/open_images_format.py @@ -53,6 +53,29 @@ class OpenImagesPath: V5_CLASS_DESCRIPTION_FILE_NAME = 'class-descriptions.csv' HIERARCHY_FILE_NAME = 'bbox_labels_600_hierarchy.json' + IMAGE_DESCRIPTION_FIELDS = ( + 'ImageID', + 'Subset', + 'OriginalURL', + 'OriginalLandingURL', + 'License', + 'AuthorProfileURL', + 'Author', + 'Title', + 'OriginalSize', + 'OriginalMD5', + 'Thumbnail300KURL', + 'Rotation', + ) + + LABEL_DESCRIPTION_FIELDS = ( + 'ImageID', + 'Source', + 'LabelName', + 'Confidence', + ) + + class OpenImagesExtractor(Extractor): def __init__(self, path): if not osp.isdir(path): @@ -310,34 +333,15 @@ def _save_subsets(self): raise UnsupportedSubsetNameError(item_id=next(iter(subset)).id, subset=subset) image_description_name = f'{subset_name}-images-with-rotation.csv' - image_description_fields = [ - 'ImageID', - 'Subset', - 'OriginalURL', - 'OriginalLandingURL', - 'License', - 'AuthorProfileURL', - 'Author', - 'Title', - 'OriginalSize', - 'OriginalMD5', - 'Thumbnail300KURL', - 'Rotation', - ] - label_description_name = f'{subset_name}-annotations-human-imagelabels.csv' - label_description_fields = [ - 'ImageID', - 'Source', - 'LabelName', - 'Confidence', - ] with \ self._open_csv_annotation( - image_description_name, image_description_fields) as image_description_writer, \ + image_description_name, OpenImagesPath.IMAGE_DESCRIPTION_FIELDS, + ) as image_description_writer, \ 
self._open_csv_annotation( - label_description_name, label_description_fields) as label_description_writer \ + label_description_name, OpenImagesPath.LABEL_DESCRIPTION_FIELDS, + ) as label_description_writer \ : image_description_writer.writeheader() label_description_writer.writeheader() From 47f7d491b0ed5f5d22bdc2bc89b1fdb4bc76a8d5 Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Fri, 2 Jul 2021 19:24:03 +0300 Subject: [PATCH 16/17] open_images_format: add logging in the case where an item being saved has no image --- datumaro/plugins/open_images_format.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/datumaro/plugins/open_images_format.py b/datumaro/plugins/open_images_format.py index 2e51515244..6dd034d415 100644 --- a/datumaro/plugins/open_images_format.py +++ b/datumaro/plugins/open_images_format.py @@ -8,6 +8,7 @@ import glob import itertools import json +import logging as log import os import os.path as osp import re @@ -351,9 +352,12 @@ def _save_subsets(self): 'ImageID': item.id, 'Subset': subset_name, }) - if self._save_images and item.has_image: - self._save_image(item, subdir=osp.join( - OpenImagesPath.IMAGES_DIR, subset_name)) + if self._save_images: + if item.has_image: + self._save_image(item, subdir=osp.join( + OpenImagesPath.IMAGES_DIR, subset_name)) + else: + log.debug("Item '%s' has no image", item.id) for annotation in item.annotations: if annotation.type is AnnotationType.label: From 5a98b1c690da66333572c8acd04fd71721009309 Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Fri, 2 Jul 2021 19:55:45 +0300 Subject: [PATCH 17/17] OpenImagesConverter: create the label description file lazily --- datumaro/plugins/open_images_format.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/datumaro/plugins/open_images_format.py b/datumaro/plugins/open_images_format.py index 6dd034d415..d6b83f33c5 100644 --- a/datumaro/plugins/open_images_format.py +++ 
b/datumaro/plugins/open_images_format.py @@ -340,12 +340,13 @@ def _save_subsets(self): self._open_csv_annotation( image_description_name, OpenImagesPath.IMAGE_DESCRIPTION_FIELDS, ) as image_description_writer, \ - self._open_csv_annotation( - label_description_name, OpenImagesPath.LABEL_DESCRIPTION_FIELDS, - ) as label_description_writer \ + contextlib.ExitStack() as annotation_writers \ : image_description_writer.writeheader() - label_description_writer.writeheader() + + # The label description writer is created lazily, + # so that we don't create the label description file if there are no labels. + label_description_writer = None for item in subset: image_description_writer.writerow({ @@ -361,6 +362,13 @@ def _save_subsets(self): for annotation in item.annotations: if annotation.type is AnnotationType.label: + if label_description_writer is None: + label_description_writer = annotation_writers.enter_context( + self._open_csv_annotation( + label_description_name, + OpenImagesPath.LABEL_DESCRIPTION_FIELDS)) + label_description_writer.writeheader() + label_description_writer.writerow({ 'ImageID': item.id, 'LabelName': label_categories[annotation.label].name,