Add initial support for the Open Images dataset (cvat-ai#291)
* Support reading of labels in Open Images (v4, v5, v6)

* Add tests for the Open Images extractor/importer

* Add Open Images documentation

* Update changelog
Roman Donchenko authored Jun 25, 2021
1 parent d2073b8 commit 5209d42
Showing 18 changed files with 468 additions and 0 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -15,6 +15,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `ItemTransform` class, which describes item-wise dataset `Transform`s (<https://github.com/openvinotoolkit/datumaro/pull/297>)
- `keep-empty` export parameter in VOC format (<https://github.com/openvinotoolkit/datumaro/pull/297>)
- A base class for dataset validation plugins (<https://github.com/openvinotoolkit/datumaro/pull/299>)
- Partial support for the Open Images format;
only reading is supported, and only images and image-level labels can be read
  (<https://github.com/openvinotoolkit/datumaro/pull/291>).

### Changed
- Tensorflow AVX check is made optional in API and is disabled by default (<https://github.com/openvinotoolkit/datumaro/pull/305>)
216 changes: 216 additions & 0 deletions datumaro/plugins/open_images_format.py
@@ -0,0 +1,216 @@
# Copyright (C) 2021 Intel Corporation
#
# SPDX-License-Identifier: MIT

import contextlib
import csv
import fnmatch
import glob
import json
import os
import os.path as osp
import re

from attr import attrs

from datumaro.components.errors import DatasetError, RepeatedItemError, UndefinedLabel
from datumaro.components.extractor import (
    AnnotationType, DatasetItem, Importer, Label, LabelCategories, Extractor,
)
from datumaro.components.validator import Severity
from datumaro.util.image import find_images

# A regex to check whether a subset name can be used as a "normal" path
# component.
# Accepting a subset name that doesn't match this regex could lead
# to accessing data outside of the expected directory, so it's best
# to reject them.
_RE_INVALID_SUBSET = re.compile(r'''
    # empty
    | \.\.? # special path component
    | .*[/\\\0].* # contains special characters
''', re.VERBOSE)

@attrs(auto_attribs=True)
class UnsupportedSubsetNameError(DatasetError):
    subset: str

    def __str__(self):
        return "Item %s has an unsupported subset name %r." % (self.item_id, self.subset)

class OpenImagesPath:
    ANNOTATIONS_DIR = 'annotations'
    FULL_IMAGE_DESCRIPTION_NAME = 'image_ids_and_rotation.csv'
    SUBSET_IMAGE_DESCRIPTION_PATTERNS = (
        '*-images-with-rotation.csv',
        '*-images-with-labels-with-rotation.csv',
    )

class OpenImagesExtractor(Extractor):
    def __init__(self, path):
        if not osp.isdir(path):
            raise FileNotFoundError("Can't read dataset directory '%s'" % path)

        super().__init__()

        self._dataset_dir = path

        self._annotation_files = os.listdir(
            osp.join(path, OpenImagesPath.ANNOTATIONS_DIR))

        self._categories = {}
        self._items = []

        self._load_categories()
        self._load_items()

    def __iter__(self):
        return iter(self._items)

    def categories(self):
        return self._categories

    @contextlib.contextmanager
    def _open_csv_annotation(self, file_name):
        absolute_path = osp.join(self._dataset_dir, OpenImagesPath.ANNOTATIONS_DIR, file_name)

        with open(absolute_path, 'r', encoding='utf-8', newline='') as f:
            yield csv.DictReader(f)

    def _glob_annotations(self, pattern):
        for annotation_file in self._annotation_files:
            if fnmatch.fnmatch(annotation_file, pattern):
                yield annotation_file

    def _load_categories(self):
        label_categories = LabelCategories()

        # In OID v6, the class description file is prefixed with `oidv6-`, whereas
        # in the previous versions, it isn't. We try to find it regardless.
        # We use a wildcard so that if, say, OID v7 is released in the future with
        # a similar layout as v6, it's automatically supported.
        # If the file doesn't exist with either name, we'll fail trying to open
        # `class-descriptions.csv`.

        V5_CLASS_DESCRIPTIONS = 'class-descriptions.csv'

        annotation_name = [
            *self._glob_annotations('oidv*-class-descriptions.csv'),
            V5_CLASS_DESCRIPTIONS,
        ][0]

        with self._open_csv_annotation(annotation_name) as class_description_reader:
            # Prior to OID v6, this file didn't contain a header row.
            if annotation_name == V5_CLASS_DESCRIPTIONS:
                class_description_reader.fieldnames = ('LabelName', 'DisplayName')

            for class_description in class_description_reader:
                label_name = class_description['LabelName']
                label_categories.add(label_name)

        self._categories[AnnotationType.label] = label_categories

        self._load_label_category_parents()

    def _load_label_category_parents(self):
        label_categories = self._categories[AnnotationType.label]

        hierarchy_path = osp.join(
            self._dataset_dir, OpenImagesPath.ANNOTATIONS_DIR, 'bbox_labels_600_hierarchy.json')

        try:
            with open(hierarchy_path, 'rb') as hierarchy_file:
                root_node = json.load(hierarchy_file)
        except FileNotFoundError:
            return

        def set_parents_from_node(node, category):
            for child_node in node.get('Subcategory', []):
                _, child_category = label_categories.find(child_node['LabelName'])

                if category is not None and child_category is not None:
                    child_category.parent = category.name

                set_parents_from_node(child_node, child_category)

        _, root_category = label_categories.find(root_node['LabelName'])
        set_parents_from_node(root_node, root_category)

    def _load_items(self):
        image_paths_by_id = {
            osp.splitext(osp.basename(path))[0]: path
            for path in find_images(
                osp.join(self._dataset_dir, 'images'),
                recursive=True, max_depth=1)
        }

        items_by_id = {}

        def load_from(annotation_name):
            with self._open_csv_annotation(annotation_name) as image_reader:
                for image_description in image_reader:
                    image_id = image_description['ImageID']
                    if image_id in items_by_id:
                        raise RepeatedItemError(item_id=image_id)

                    subset = image_description['Subset']

                    if _RE_INVALID_SUBSET.fullmatch(subset):
                        raise UnsupportedSubsetNameError(item_id=image_id, subset=subset)

                    items_by_id[image_id] = DatasetItem(
                        id=image_id,
                        image=image_paths_by_id.get(image_id),
                        subset=subset,
                    )

        # It's preferable to load the combined image description file,
        # because it contains descriptions for training images without human-annotated labels
        # (the file specific to the training set doesn't).
        # However, if it's missing, we'll try loading subset-specific files instead, so that
        # this extractor can be used on individual subsets of the dataset.
        try:
            load_from(OpenImagesPath.FULL_IMAGE_DESCRIPTION_NAME)
        except FileNotFoundError:
            for pattern in OpenImagesPath.SUBSET_IMAGE_DESCRIPTION_PATTERNS:
                for path in self._glob_annotations(pattern):
                    load_from(path)

        self._items.extend(items_by_id.values())

        self._load_labels(items_by_id)

    def _load_labels(self, items_by_id):
        label_categories = self._categories[AnnotationType.label]

        # TODO: implement reading of machine-annotated labels

        for label_path in self._glob_annotations('*-human-imagelabels.csv'):
            with self._open_csv_annotation(label_path) as label_reader:
                for label_description in label_reader:
                    image_id = label_description['ImageID']
                    item = items_by_id[image_id]

                    confidence = float(label_description['Confidence'])

                    label_name = label_description['LabelName']
                    label_index, _ = label_categories.find(label_name)
                    if label_index is None:
                        raise UndefinedLabel(
                            item_id=item.id, subset=item.subset,
                            label_name=label_name, severity=Severity.error)
                    item.annotations.append(Label(
                        label=label_index, attributes={'score': confidence}))


class OpenImagesImporter(Importer):
    @classmethod
    def find_sources(cls, path):
        for pattern in [
            OpenImagesPath.FULL_IMAGE_DESCRIPTION_NAME,
            *OpenImagesPath.SUBSET_IMAGE_DESCRIPTION_PATTERNS,
        ]:
            if glob.glob(osp.join(glob.escape(path), OpenImagesPath.ANNOTATIONS_DIR, pattern)):
                return [{'url': path, 'format': 'open_images'}]

        return []
135 changes: 135 additions & 0 deletions docs/formats/open_images_user_manual.md
@@ -0,0 +1,135 @@
# Open Images user manual

## Contents

- [Format specification](#format-specification)
- [Load Open Images dataset](#load-open-images-dataset)
- [Export to other formats](#export-to-other-formats)
- [Export to Open Images](#export-to-open-images)
- [Particular use cases](#particular-use-cases)

## Format specification

A description of the Open Images Dataset (OID) format is available
on [its website](https://storage.googleapis.com/openimages/web/download.html).
Datumaro supports versions 4, 5 and 6.

Datumaro currently supports only the human-verified image-level label annotations from this dataset.
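
As a quick illustration, the loaded annotations can also be inspected from Python. The snippet below is a minimal sketch rather than part of the format specification: the dataset path is a placeholder, and it assumes the standard `Dataset.import_from` API together with the `open_images` format name registered by this plugin.

```python
# A minimal sketch: load an extracted OID directory and print the
# human-verified image-level labels of each item.
# 'path/to/dataset' is a placeholder path.
from datumaro.components.dataset import Dataset
from datumaro.components.extractor import AnnotationType

dataset = Dataset.import_from('path/to/dataset', 'open_images')
label_categories = dataset.categories()[AnnotationType.label]

for item in dataset:
    for ann in item.annotations:
        if ann.type == AnnotationType.label:
            # The original Confidence value is stored in the 'score' attribute.
            print(item.id, item.subset,
                label_categories.items[ann.label].name,
                ann.attributes.get('score'))
```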

## Load Open Images dataset

The Open Images dataset is available for free download.

See the [`open-images-dataset` GitHub repository](https://github.com/cvdfoundation/open-images-dataset)
for information on how to download the images.

Datumaro also requires the image description files,
which can be downloaded from the following URLs:

- [complete set](https://storage.googleapis.com/openimages/2018_04/image_ids_and_rotation.csv)
- [train set](https://storage.googleapis.com/openimages/v6/oidv6-train-images-with-labels-with-rotation.csv)
- [validation set](https://storage.googleapis.com/openimages/2018_04/validation/validation-images-with-rotation.csv)
- [test set](https://storage.googleapis.com/openimages/2018_04/test/test-images-with-rotation.csv)

Datumaro expects at least one of the files above to be present.

In addition, the following metadata file must be present:

- [class descriptions](https://storage.googleapis.com/openimages/v6/oidv6-class-descriptions.csv)

You can optionally download the following additional metadata file:

- [class hierarchy](https://storage.googleapis.com/openimages/2018_04/bbox_labels_600_hierarchy.json)

Annotations can be downloaded from the following URLs:

- [train image labels](https://storage.googleapis.com/openimages/v6/oidv6-train-annotations-human-imagelabels.csv)
- [validation image labels](https://storage.googleapis.com/openimages/v5/validation-annotations-human-imagelabels.csv)
- [test image labels](https://storage.googleapis.com/openimages/v5/test-annotations-human-imagelabels.csv)

The annotations are optional.

There are two ways to create a Datumaro project and add OID to it:

``` bash
datum import --format open_images --input-path <path/to/dataset>
# or
datum create
datum add path -f open_images <path/to/dataset>
```

It is possible to specify the project name and project directory; run
`datum create --help` for more information.

The Open Images dataset directory should have the following structure:

```
└─ Dataset/
    ├── annotations/
    │   └── bbox_labels_600_hierarchy.json
    │   └── image_ids_and_rotation.csv
    │   └── oidv6-class-descriptions.csv
    │   └── *-human-imagelabels.csv
    └── images/
        ├── test
        │   ├── <image_name1.jpg>
        │   ├── <image_name2.jpg>
        │   └── ...
        ├── train
        │   ├── <image_name1.jpg>
        │   ├── <image_name2.jpg>
        │   └── ...
        └── validation
            ├── <image_name1.jpg>
            ├── <image_name2.jpg>
            └── ...
```

To use per-subset image description files instead of `image_ids_and_rotation.csv`,
place them in the `annotations` subdirectory.

## Export to other formats

Datumaro can convert OID into any other format [Datumaro supports](../user_manual.md#supported-formats).
To get the expected result, the dataset needs to be converted to a format
that supports image-level labels.
There are a few ways to convert OID to other dataset formats:

``` bash
datum project import -f open_images -i <path/to/open_images>
datum export -f cvat -o <path/to/output/dir>
# or
datum convert -if open_images -i <path/to/open_images> -f cvat -o <path/to/output/dir>
```

Some formats provide extra options for conversion.
These options are passed after the double dash (`--`) on the command line.
To get information about them, run

`datum export -f <FORMAT> -- -h`

## Export to Open Images

Converting datasets to the Open Images format is currently not supported.

## Particular use cases

Datumaro supports filtering, transformation, merging, etc. for all formats,
including Open Images. See the
[user manual](../user_manual.md)
for more information about these operations.

Here is an example of using Datumaro operations to solve
a particular problem with the Open Images dataset:

### Example. How to load the Open Images dataset and convert it to the format used by CVAT

```bash
datum create -o project
datum add path -p project -f open_images ./open-images-dataset/
datum stats -p project
datum export -p project -o dataset -f cvat --overwrite -- --save-images
```

More examples of working with OID from code can be found in
[tests](../../tests/test_open_images_format.py).
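
For comparison, here is a minimal sketch of the same load-and-convert flow through the Datumaro Python API. The paths are placeholders, and it assumes the `cvat` converter accepts a `save_images` option mirroring the `--save-images` flag used above.

```python
# A minimal sketch of the CVAT conversion example above, done from code.
# './open-images-dataset' and './dataset' are placeholder paths.
from datumaro.components.dataset import Dataset

dataset = Dataset.import_from('./open-images-dataset', 'open_images')
dataset.export('./dataset', 'cvat', save_images=True)
```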
@@ -0,0 +1,2 @@
/m/0,Generic label #0
/m/1,Generic label #1
@@ -0,0 +1,2 @@
ImageID,Subset,OriginalURL,OriginalLandingURL,License,AuthorProfileURL,Author,Title,OriginalSize,OriginalMD5,Thumbnail300KURL,Rotation
cc,test,,,,,Intel,Test Image CC,,,,0
@@ -0,0 +1,2 @@
ImageID,Subset,OriginalURL,OriginalLandingURL,License,AuthorProfileURL,Author,Title,OriginalSize,OriginalMD5,Thumbnail300KURL,Rotation
aa,train,,,,,Intel,Test Image AA,,,,0
@@ -0,0 +1,13 @@
{
    "LabelName": "/m/x",
    "Subcategory": [
        {
            "LabelName": "/m/0",
            "Subcategory": [
                {
                    "LabelName": "/m/1"
                }
            ]
        }
    ]
}
@@ -0,0 +1,5 @@
ImageID,Subset,OriginalURL,OriginalLandingURL,License,AuthorProfileURL,Author,Title,OriginalSize,OriginalMD5,Thumbnail300KURL,Rotation
a,train,,,,,Intel,Test Image A,,,,0
b,train,,,,,Intel,Test Image B,,,,0
c,test,,,,,Intel,Test Image C,,,,0
d,validation,,,,,Intel,Test Image D,,,,0
@@ -0,0 +1,5 @@
LabelName,DisplayName
/m/0,Generic label #0
/m/1,Generic label #1
/m/2,Generic label #2
/m/3,Generic label #3