Open Images: add writing support #315

Merged · 17 commits · Jul 5, 2021
5 changes: 3 additions & 2 deletions CHANGELOG.md
@@ -16,8 +16,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
- `keep-empty` export parameter in VOC format (<https://github.com/openvinotoolkit/datumaro/pull/297>)
- A base class for dataset validation plugins (<https://github.com/openvinotoolkit/datumaro/pull/299>)
- Partial support for the Open Images format;
-  only reading is supported, and only images and image-level labels can be read
-  (<https://github.com/openvinotoolkit/datumaro/pull/291>).
+  only images and image-level labels can be read/written
+  (<https://github.com/openvinotoolkit/datumaro/pull/291>,
+  <https://github.com/openvinotoolkit/datumaro/pull/315>).

### Changed
- Tensorflow AVX check is made optional in API and is disabled by default (<https://github.com/openvinotoolkit/datumaro/pull/305>)
190 changes: 175 additions & 15 deletions datumaro/plugins/open_images_format.py
@@ -6,19 +6,23 @@
import csv
import fnmatch
import glob
import itertools
import json
import logging as log
import os
import os.path as osp
import re

from attr import attrs

from datumaro.components.converter import Converter
from datumaro.components.errors import DatasetError, RepeatedItemError, UndefinedLabel
from datumaro.components.extractor import (
AnnotationType, DatasetItem, Importer, Label, LabelCategories, Extractor,
)
from datumaro.components.validator import Severity
from datumaro.util.image import find_images
from datumaro.util.os_util import split_path

# A regex to check whether a subset name can be used as a "normal" path
# component.
@@ -40,11 +44,38 @@ def __str__(self):

class OpenImagesPath:
ANNOTATIONS_DIR = 'annotations'
-    FULL_IMAGE_DESCRIPTION_NAME = 'image_ids_and_rotation.csv'
-    SUBSET_IMAGE_DESCRIPTION_PATTERNS = (
+    IMAGES_DIR = 'images'
+
+    FULL_IMAGE_DESCRIPTION_FILE_NAME = 'image_ids_and_rotation.csv'
+    SUBSET_IMAGE_DESCRIPTION_FILE_PATTERNS = (
'*-images-with-rotation.csv',
'*-images-with-labels-with-rotation.csv',
)
V5_CLASS_DESCRIPTION_FILE_NAME = 'class-descriptions.csv'
HIERARCHY_FILE_NAME = 'bbox_labels_600_hierarchy.json'

IMAGE_DESCRIPTION_FIELDS = (
'ImageID',
'Subset',
'OriginalURL',
'OriginalLandingURL',
'License',
'AuthorProfileURL',
'Author',
'Title',
'OriginalSize',
'OriginalMD5',
'Thumbnail300KURL',
'Rotation',
)

LABEL_DESCRIPTION_FIELDS = (
'ImageID',
'Source',
'LabelName',
'Confidence',
)


class OpenImagesExtractor(Extractor):
def __init__(self, path):
@@ -92,16 +123,14 @@ def _load_categories(self):
# If the file doesn't exist with either name, we'll fail trying to open
# `class-descriptions.csv`.

-        V5_CLASS_DESCRIPTIONS = 'class-descriptions.csv'
-
        annotation_name = [
            *self._glob_annotations('oidv*-class-descriptions.csv'),
-            V5_CLASS_DESCRIPTIONS,
+            OpenImagesPath.V5_CLASS_DESCRIPTION_FILE_NAME,
        ][0]

        with self._open_csv_annotation(annotation_name) as class_description_reader:
            # Prior to OID v6, this file didn't contain a header row.
-            if annotation_name == V5_CLASS_DESCRIPTIONS:
+            if annotation_name == OpenImagesPath.V5_CLASS_DESCRIPTION_FILE_NAME:
                class_description_reader.fieldnames = ('LabelName', 'DisplayName')

for class_description in class_description_reader:
@@ -116,7 +145,7 @@ def _load_label_category_parents(self):
label_categories = self._categories[AnnotationType.label]

hierarchy_path = osp.join(
-            self._dataset_dir, OpenImagesPath.ANNOTATIONS_DIR, 'bbox_labels_600_hierarchy.json')
+            self._dataset_dir, OpenImagesPath.ANNOTATIONS_DIR, OpenImagesPath.HIERARCHY_FILE_NAME)

try:
with open(hierarchy_path, 'rb') as hierarchy_file:
@@ -137,11 +166,16 @@ def set_parents_from_node(node, category):
set_parents_from_node(root_node, root_category)

def _load_items(self):
+        images_dir = osp.join(self._dataset_dir, OpenImagesPath.IMAGES_DIR)
+
        image_paths_by_id = {
-            osp.splitext(osp.basename(path))[0]: path
-            for path in find_images(
-                osp.join(self._dataset_dir, 'images'),
-                recursive=True, max_depth=1)
+            # the first component of `path_parts` is the subset name
+            '/'.join(path_parts[1:]): path
+            for path in find_images(images_dir, recursive=True)
+            for path_parts in [split_path(
+                osp.splitext(osp.relpath(path, images_dir))[0],
+            )]
+            if 1 < len(path_parts)
}

items_by_id = {}
Expand Down Expand Up @@ -170,9 +204,9 @@ def load_from(annotation_name):
# However, if it's missing, we'll try loading subset-specific files instead, so that
# this extractor can be used on individual subsets of the dataset.
try:
-            load_from(OpenImagesPath.FULL_IMAGE_DESCRIPTION_NAME)
+            load_from(OpenImagesPath.FULL_IMAGE_DESCRIPTION_FILE_NAME)
        except FileNotFoundError:
-            for pattern in OpenImagesPath.SUBSET_IMAGE_DESCRIPTION_PATTERNS:
+            for pattern in OpenImagesPath.SUBSET_IMAGE_DESCRIPTION_FILE_PATTERNS:
for path in self._glob_annotations(pattern):
load_from(path)

@@ -207,10 +241,136 @@ class OpenImagesImporter(Importer):
@classmethod
def find_sources(cls, path):
for pattern in [
-            OpenImagesPath.FULL_IMAGE_DESCRIPTION_NAME,
-            *OpenImagesPath.SUBSET_IMAGE_DESCRIPTION_PATTERNS,
+            OpenImagesPath.FULL_IMAGE_DESCRIPTION_FILE_NAME,
+            *OpenImagesPath.SUBSET_IMAGE_DESCRIPTION_FILE_PATTERNS,
]:
if glob.glob(osp.join(glob.escape(path), OpenImagesPath.ANNOTATIONS_DIR, pattern)):
return [{'url': path, 'format': 'open_images'}]

return []
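
For reference, a directory recognized by this importer can then be loaded from code. A minimal sketch, assuming a hypothetical `./oid` directory laid out as described above (the `open_images` format name comes from `find_sources`):

```python
from datumaro.components.dataset import Dataset

# Detection succeeds as long as the annotations/ subdirectory contains
# at least one image description file matched by the patterns above.
dataset = Dataset.import_from('./oid', 'open_images')
```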

class OpenImagesConverter(Converter):
DEFAULT_IMAGE_EXT = '.jpg'

@contextlib.contextmanager
def _open_csv_annotation(self, file_name, field_names):
absolute_path = osp.join(self._save_dir, OpenImagesPath.ANNOTATIONS_DIR, file_name)

with open(absolute_path, 'w', encoding='utf-8', newline='') as f:
yield csv.DictWriter(f, field_names)

def apply(self):
annotations_dir = osp.join(self._save_dir, OpenImagesPath.ANNOTATIONS_DIR)

os.makedirs(annotations_dir, exist_ok=True)

self._save_categories()
self._save_label_category_parents()
self._save_subsets()

def _save_categories(self):
with self._open_csv_annotation(
OpenImagesPath.V5_CLASS_DESCRIPTION_FILE_NAME, ['LabelName', 'DisplayName'],
) as class_description_writer:
# no .writeheader() here, since we're saving it in the V5 format

> **Contributor:** As an enhancement, you can add an optional argument for the format, which will determine the version of the exported dataset. What do you think about it?

> **Author:** I could, but... is there any reason to do so? It could just be unnecessary flexibility.
>
> (Incidentally, the main reason I used the V5 format for this file is so that I wouldn't need to include the dataset name in the file name. It would be weird if the user could import OIDv5, then export it and end up with `oidv6-class-descriptions.csv`, even though the data is still from v5.)

> **Contributor:** Finally, can we import and export in one format?

> **Author:** This is one format; just different versions. We can't use the same version as was used during import, since that information is not preserved anywhere (and the input could be a mishmash of different versions anyway).
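
For context, a made-up illustration of the two flavors of this file (the label IDs and display names here are invented, not taken from the real dataset): the V5 file has no header row, while V6 files carry an `oidv6-` name prefix and start with a header.

```
# class-descriptions.csv (V5: no header row)
/m/0,Cat
/m/1,Dog

# oidv6-class-descriptions.csv (V6: header row present)
LabelName,DisplayName
/m/0,Cat
/m/1,Dog
```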


for category in self._extractor.categories()[AnnotationType.label]:
class_description_writer.writerow({
'LabelName': category.name,
'DisplayName': category.name,
})

def _save_label_category_parents(self):
all_label_names = set()
hierarchy_nodes = {}
orphan_nodes = []

def get_node(name):
return hierarchy_nodes.setdefault(name, {'LabelName': name})

for category in self._extractor.categories()[AnnotationType.label]:
all_label_names.add(category.name)

child_node = get_node(category.name)

if category.parent:
parent_node = get_node(category.parent)
parent_node.setdefault('Subcategory', []).append(child_node)
else:
orphan_nodes.append(child_node)

# The hierarchy has to be rooted in a single node. However, there's
# no guarantee that there exists only one orphan (label without a parent).
# Therefore, we create a fake root node and make it the parent of every
# orphan label.
# This is not a violation of the format, because the original OID does
# the same thing.
root_node = {
# Create an OID-like label name that isn't already used by a real label
'LabelName': next(root_name
for i in itertools.count()
for root_name in [f'/m/{i}']
if root_name not in all_label_names
),
# If an orphan has no children, then it makes no semantic difference
# whether it's listed in the hierarchy file or not. So strip such nodes
# to avoid recording meaningless data.
'Subcategory': [node for node in orphan_nodes if 'Subcategory' in node],
}

hierarchy_path = osp.join(
self._save_dir, OpenImagesPath.ANNOTATIONS_DIR, OpenImagesPath.HIERARCHY_FILE_NAME)

with open(hierarchy_path, 'w', encoding='utf-8') as hierarchy_file:
json.dump(root_node, hierarchy_file, indent=4, ensure_ascii=False)
hierarchy_file.write('\n')

def _save_subsets(self):
label_categories = self._extractor.categories().get(
AnnotationType.label, LabelCategories())

for subset_name, subset in self._extractor.subsets().items():
if _RE_INVALID_SUBSET.fullmatch(subset_name):
                raise UnsupportedSubsetNameError(
                    item_id=next(iter(subset)).id, subset=subset_name)

image_description_name = f'{subset_name}-images-with-rotation.csv'
label_description_name = f'{subset_name}-annotations-human-imagelabels.csv'

with \
self._open_csv_annotation(
image_description_name, OpenImagesPath.IMAGE_DESCRIPTION_FIELDS,
) as image_description_writer, \
contextlib.ExitStack() as annotation_writers \
:
image_description_writer.writeheader()

# The label description writer is created lazily,
# so that we don't create the label description file if there are no labels.
label_description_writer = None

for item in subset:
image_description_writer.writerow({
'ImageID': item.id, 'Subset': subset_name,
})

if self._save_images:
if item.has_image:
self._save_image(item, subdir=osp.join(
OpenImagesPath.IMAGES_DIR, subset_name))
else:
log.debug("Item '%s' has no image", item.id)

for annotation in item.annotations:
if annotation.type is AnnotationType.label:
if label_description_writer is None:
label_description_writer = annotation_writers.enter_context(
self._open_csv_annotation(
label_description_name,
OpenImagesPath.LABEL_DESCRIPTION_FIELDS))
label_description_writer.writeheader()

label_description_writer.writerow({
'ImageID': item.id,
'LabelName': label_categories[annotation.label].name,
'Confidence': str(annotation.attributes.get('score', 1)),
})
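
To make the hierarchy construction above concrete, here is an illustrative example (hypothetical labels, not from a real run): given labels `/m/0`, `/m/1` (a child of `/m/0`), and `/m/2` (an orphan with no children), `_save_label_category_parents` picks `/m/3` as the first unused OID-like root name, attaches `/m/0` under it, and strips the childless orphan `/m/2`, producing roughly:

```json
{
    "LabelName": "/m/3",
    "Subcategory": [
        {
            "LabelName": "/m/0",
            "Subcategory": [
                {
                    "LabelName": "/m/1"
                }
            ]
        }
    ]
}
```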
58 changes: 52 additions & 6 deletions docs/formats/open_images_user_manual.md
@@ -29,8 +29,8 @@ which can be downloaded from the following URLs:

- [complete set](https://storage.googleapis.com/openimages/2018_04/image_ids_and_rotation.csv)
- [train set](https://storage.googleapis.com/openimages/v6/oidv6-train-images-with-labels-with-rotation.csv)
-- [validation set](https://storage.googleapis.com/openimages/2018_04/test/test-images-with-rotation.csv)
-- [test set](https://storage.googleapis.com/openimages/2018_04/validation/validation-images-with-rotation.csv)
+- [validation set](https://storage.googleapis.com/openimages/2018_04/validation/validation-images-with-rotation.csv)
+- [test set](https://storage.googleapis.com/openimages/2018_04/test/test-images-with-rotation.csv)

Datumaro expects at least one of the files above to be present.

@@ -111,7 +111,27 @@ To get information about them, run

## Export to Open Images

-Converting datasets to the Open Images format is currently not supported.
+There are a few ways to convert an existing dataset to the Open Images format:

``` bash
# export dataset into Open Images format from existing project
datum export -p <path/to/project> -f open_images -o <path/to/export/dir> \
    -- --save-images

# convert a dataset in another format to the Open Images format
datum convert -if imagenet -i <path/to/imagenet/dataset> \
-f open_images -o <path/to/export/dir> \
-- --save-images
```

Extra options for export to the Open Images format:

- `--save-images` - save image files when exporting the dataset
(by default, `False`)

- `--image-ext IMAGE_EXT` - save image files with the specified extension
when exporting the dataset (by default, uses the original extension
or `.jpg` if there isn't one)

## Particular use cases

@@ -120,10 +140,10 @@ and for the Open Images format in particular. Follow
[user manual](../user_manual.md)
to get more information about these operations.

-Here is an example of using Datumaro operations to solve
-a particular problem with the Open Images dataset:
+Here are a few examples of using Datumaro operations to solve
+particular problems with the Open Images dataset:

-### Example. How to load the Open Images dataset and convert to the format used by CVAT
+### Example 1. How to load the Open Images dataset and convert to the format used by CVAT

```bash
datum create -o project
@@ -132,5 +152,31 @@ datum stats -p project
datum export -p project -o dataset -f cvat --overwrite -- --save-images
```

### Example 2. How to create a custom OID-like dataset

```python
import numpy as np
from datumaro.components.dataset import Dataset
from datumaro.components.extractor import DatasetItem, Label

dataset = Dataset.from_iterable(
[
DatasetItem(
id='0000000000000001',
image=np.ones((1, 5, 3)),
subset='validation',
annotations=[
Label(0, attributes={'score': 1}),
Label(1, attributes={'score': 0}),
],
),
],
categories=['/m/0', '/m/1'],
)
dataset.export('./dataset', format='open_images')
```
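
With the default export options (so image files are not written), the example above should produce roughly the following layout, using the file names from the converter:

```
dataset/
└── annotations/
    ├── bbox_labels_600_hierarchy.json
    ├── class-descriptions.csv
    ├── validation-images-with-rotation.csv
    └── validation-annotations-human-imagelabels.csv
```

and `validation-annotations-human-imagelabels.csv` should contain something like the following (the `Source` column is left empty, since the converter only fills `ImageID`, `LabelName`, and `Confidence`):

```
ImageID,Source,LabelName,Confidence
0000000000000001,,/m/0,1
0000000000000001,,/m/1,0
```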

More examples of working with OID from code can be found in
[tests](../../tests/test_open_images_format.py).
1 change: 1 addition & 0 deletions tests/requirements.py
@@ -21,6 +21,7 @@ class Requirements:
DATUM_231 = "Readable formats for CJK"
DATUM_244 = "Add Snyk integration"
DATUM_267 = "Add Image zip format"
DATUM_274 = "Support the Open Images dataset"
DATUM_280 = "Support KITTI dataset formats"
DATUM_283 = "Create cli tests for testing convert command for VOC format"
