Skip to content

Commit

Permalink
Add image zip format (#273)
Browse files Browse the repository at this point in the history
* add tests

* add image_zip format

* update changelog

Co-authored-by: Maxim Zhiltsov <[email protected]>
  • Loading branch information
Kirill Sizov and Maxim Zhiltsov authored Jun 9, 2021
1 parent 839ca35 commit 54e21bf
Show file tree
Hide file tree
Showing 8 changed files with 403 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]
### Added
- Support for import/export zip archives with images (<https://github.com/openvinotoolkit/datumaro/pull/273>)
- Subformat importers for VOC and COCO (<https://github.com/openvinotoolkit/datumaro/pull/281>)

### Changed
Expand Down
114 changes: 114 additions & 0 deletions datumaro/plugins/image_zip_format.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
# Copyright (C) 2021 Intel Corporation
#
# SPDX-License-Identifier: MIT

from enum import Enum
import logging as log
import os
import os.path as osp
from zipfile import ZIP_BZIP2, ZIP_DEFLATED, ZIP_LZMA, ZIP_STORED, ZipFile

from datumaro.components.converter import Converter
from datumaro.components.extractor import DatasetItem, Importer, SourceExtractor

from datumaro.util import parse_str_enum_value
from datumaro.util.image import IMAGE_EXTENSIONS, ByteImage, encode_image

class Compression(Enum):
    """Archive compression methods selectable for image zip export.

    Each member wraps the ``zipfile`` module constant of the same name, so
    ``member.value`` can be passed directly as the ``compression`` argument
    of ``zipfile.ZipFile``.
    """

    # Declaration order matters: iterating the enum yields members in this
    # order, and that order is shown in the command-line help text.
    ZIP_STORED = ZIP_STORED
    ZIP_DEFLATED = ZIP_DEFLATED
    ZIP_BZIP2 = ZIP_BZIP2
    ZIP_LZMA = ZIP_LZMA

class ImageZipPath:
    """Default settings shared by the image zip format classes."""

    # File name used for the output archive when none is given.
    DEFAULT_ARCHIVE_NAME = 'default.zip'

    # Compression method used when none is given (store without compressing).
    DEFAULT_COMPRESSION = Compression.ZIP_STORED

class ImageZipExtractor(SourceExtractor):
    """Load the images stored in a zip archive as dataset items.

    Every archive member whose extension (compared case-insensitively) is
    listed in ``IMAGE_EXTENSIONS`` becomes one item. The item id is the
    member name without its extension, so members in sub-directories keep
    their directory prefix in the id. Other members are ignored.
    """

    def __init__(self, url, subset=None):
        """Read all images from the archive at ``url``.

        Args:
            url: path to the archive; must end with ``.zip``.
            subset: subset name forwarded to the base class.
        """
        super().__init__(subset=subset)

        assert url.endswith('.zip'), url

        with ZipFile(url, 'r') as archive:
            for entry in archive.infolist():
                stem, ext = osp.splitext(entry.filename)
                if ext.lower() in IMAGE_EXTENSIONS:
                    # The image bytes are read eagerly, because the archive
                    # is closed as soon as the constructor returns.
                    self._items.append(DatasetItem(
                        id=stem,
                        image=ByteImage(data=archive.read(entry.filename)),
                        subset=self._subset,
                    ))

class ImageZipImporter(Importer):
    """Importer that registers zip archives as ``image_zip`` sources."""

    @classmethod
    def find_sources(cls, path):
        """Return the ``image_zip`` sources found for ``path``.

        The search is delegated to ``_find_sources_recursive`` with the
        ``.zip`` extension.
        """
        sources = cls._find_sources_recursive(path, '.zip', 'image_zip')
        return sources

class ImageZipConverter(Converter):
    """Export the images of a dataset into a single zip archive.

    Only images are written: items without an image are skipped, and no
    annotations are stored in the archive.
    """

    DEFAULT_IMAGE_EXT = '.jpg'

    @staticmethod
    def _get_compression_method(s):
        """Convert a command-line string into a ``Compression`` member.

        The lookup is case-insensitive.

        Raises:
            argparse.ArgumentTypeError: if ``s`` does not name a member of
                ``Compression``. The message lists the valid names so that
                argparse can show a useful usage error.
        """
        try:
            return Compression[s.upper()]
        except KeyError:
            import argparse
            raise argparse.ArgumentTypeError(
                "Unknown compression method '%s'. Available methods: %s" % (
                    s, ', '.join(e.name for e in Compression))
            ) from None

    @classmethod
    def build_cmdline_parser(cls, **kwargs):
        """Extend the base converter parser with ``--name`` and
        ``--compression`` options.
        """
        parser = super().build_cmdline_parser(**kwargs)

        parser.add_argument('--name', type=str,
            default=ImageZipPath.DEFAULT_ARCHIVE_NAME,
            help="Name of output zipfile (default: %(default)s)"
        )

        # The default is given as a string; argparse passes string defaults
        # through ``type``, so the parsed value is a ``Compression`` member.
        compression_help = (
            "Archive compression method.\nAvailable methods: {} "
            "(default: %(default)s)"
        ).format(', '.join(e.name for e in Compression))
        parser.add_argument('--compression', type=cls._get_compression_method,
            default=ImageZipPath.DEFAULT_COMPRESSION.name,
            help=compression_help
        )

        return parser

    def __init__(self, extractor, save_dir, name=None,
            compression=None, **kwargs):
        """Set up the export.

        Args:
            extractor: dataset source to export; forwarded to the base class.
            save_dir: output directory; forwarded to the base class.
            name: file name of the archive inside ``save_dir``. Defaults to
                ``ImageZipPath.DEFAULT_ARCHIVE_NAME``.
            compression: compression method, resolved through
                ``parse_str_enum_value`` against ``Compression``. Defaults
                to ``ImageZipPath.DEFAULT_COMPRESSION``.
            **kwargs: remaining options forwarded to the base class.
        """
        super().__init__(extractor, save_dir, **kwargs)

        if name is None:
            name = ImageZipPath.DEFAULT_ARCHIVE_NAME

        compression = parse_str_enum_value(compression, Compression,
            default=ImageZipPath.DEFAULT_COMPRESSION)

        self._archive_name = name
        # Keep the raw ``zipfile`` constant, which is what ``ZipFile`` expects.
        self._compression = compression.value

    def apply(self):
        """Write the archive.

        Raises:
            FileExistsError: if the target archive already exists. An
                existing archive is never overwritten or appended to.
        """
        os.makedirs(self._save_dir, exist_ok=True)

        archive_path = osp.join(self._save_dir, self._archive_name)

        if osp.exists(archive_path):
            raise FileExistsError("Zip file: %s, already exists, "
                "specify archive name with --name extra argument"
                % archive_path)

        with ZipFile(archive_path, 'w', self._compression) as zf:
            for item in self._extractor:
                if item.has_image:
                    self._archive_image(zf, item)
                else:
                    log.debug("Item '%s' has no image info", item.id)

    def _archive_image(self, zipfile, item):
        """Add the image of ``item`` to the open archive ``zipfile``.

        The image source is chosen in this order: an existing file on disk,
        the raw bytes of a ``ByteImage``, then in-memory image data encoded
        using the extension of the target file name.
        """
        image_name = self._make_image_filename(item)
        if osp.isfile(item.image.path):
            # NOTE(review): the file is stored as is; if the target
            # extension differs from the source one, the content is not
            # re-encoded - confirm this is intended.
            zipfile.write(item.image.path, arcname=image_name)
        elif isinstance(item.image, ByteImage):
            zipfile.writestr(image_name, item.image.get_bytes())
        elif item.image.has_data:
            zipfile.writestr(image_name,
                encode_image(item.image.data, osp.splitext(image_name)[1]))
        else:
            # Previously such items were dropped without any trace.
            log.warning("Item '%s': image has no file or data to archive, "
                "skipping", item.id)
85 changes: 85 additions & 0 deletions docs/formats/image_zip_user_manual.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Image zip user manual

## Contents
- [Format specification](#format-specification)
- [Load image zip dataset](#load-image-zip-dataset)
- [Export to other formats](#export-to-other-formats)
- [Export unannotated dataset to zip archive](#export-unannotated-dataset-to-zip-archive)

## Format specification

- The image zip format allows exporting/importing unannotated datasets
with images to/from a zip archive.

- The image zip format doesn't support any types of annotations
or attributes.

## Load image zip dataset

There are a few ways to load unannotated datasets into your Datumaro project:

- From an existing archive:

```bash
datum import -o project -f image_zip -i ./images.zip
```

- From a directory with zip archives. Datumaro will load images from
all zip files in the directory:

```bash
datum import -o project -f image_zip -i ./foo
```

The directory with zip archives should have the following structure:

```
├── foo/
| ├── archive1.zip/
| | ├── image_1.jpg
| | ├── image_2.png
| | ├── subdir/
| | | ├── image_3.jpg
| | | ├── ...
| | ├── ...
| ├── archive2.zip/
| | ├── image_101.jpg
| | ├── image_102.jpg
| | ├── ...
| ...
```

Images in the archives must have a supported extension;
see the [user manual](../user_manual.md#data-formats) for the list of supported extensions.

## Export to other formats

Datumaro can load dataset images from a zip archive and convert them to
[another supported dataset format](../user_manual.md#supported-formats),
for example:

```bash
datum import -o project -f image_zip -i ./images.zip
datum export -f coco -o ./new_dir -- --save-images
```

## Export unannotated dataset to zip archive

Example: exporting images from a VOC dataset to a zip archive:
```bash
datum import -o project -f voc -i ./VOC2012
datum export -f image_zip -o ./ --overwrite -- --name voc_images.zip \
--compression ZIP_DEFLATED
```

Extra options for export to image_zip format:

- `--save-images` allows exporting the dataset with its images saved
(default: `False`);
- `--image-ext <IMAGE_EXT>` allows specifying the image extension
for the exported dataset (default: use the original or `.jpg`, if none);
- `--name` name of the output zip file (default: `default.zip`);
- `--compression` allows specifying the archive compression method. Available methods:
`ZIP_STORED`, `ZIP_DEFLATED`, `ZIP_BZIP2`, `ZIP_LZMA` (default: `ZIP_STORED`).
See the [zip documentation](https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT)
for more information.
Binary file added tests/assets/image_zip_dataset/1.zip
Binary file not shown.
Binary file added tests/assets/image_zip_dataset/2.zip
Binary file not shown.
93 changes: 93 additions & 0 deletions tests/cli/test_image_zip_format.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import os
import os.path as osp
from unittest import TestCase
from zipfile import ZipFile

import numpy as np
import pytest

from datumaro.cli.__main__ import main
from datumaro.components.dataset import Dataset, DatasetItem
from datumaro.util.test_utils import TestDir, compare_datasets

from ..requirements import Requirements


def run(test, *args, expected_code=0):
    """Call the Datumaro CLI ``main`` with ``args`` and check its return code.

    Args:
        test: the ``TestCase`` instance used to report the assertion.
        *args: command-line arguments passed to ``main`` as a tuple.
        expected_code: return code ``main`` is expected to produce.
    """
    actual_code = main(args)
    # The arguments are used as the failure message to identify the command.
    test.assertEqual(expected_code, actual_code, str(args))

def make_zip_archive(src_path, dst_path):
    """Pack every file found under ``src_path`` into a zip at ``dst_path``.

    Member names in the archive are relative to ``src_path``. The
    destination archive itself is skipped, so it can safely be created
    inside ``src_path``.

    Args:
        src_path: directory that is walked recursively.
        dst_path: location of the archive to create.
    """
    # Only a path-like destination can be matched against walked files.
    if isinstance(dst_path, (str, os.PathLike)):
        dst_key = osp.normcase(osp.abspath(dst_path))
    else:
        dst_key = None

    with ZipFile(dst_path, 'w') as archive:
        for dirpath, _, filenames in os.walk(src_path):
            for name in filenames:
                path = osp.join(dirpath, name)
                if osp.normcase(osp.abspath(path)) == dst_key:
                    # The archive file already exists at this point; do not
                    # add the half-written archive to itself.
                    continue
                archive.write(path, osp.relpath(path, src_path))

class ImageZipIntegrationScenarios(TestCase):
    """CLI scenarios for importing and exporting the ``image_zip`` format."""

    @pytest.mark.reqids(Requirements.DATUM_267)
    def test_can_save_and_load(self):
        """Images zipped from an ``image_dir`` export survive a CLI
        import/export round trip.
        """
        source_dataset = Dataset.from_iterable([
            DatasetItem(id='1', image=np.ones((5, 5, 3))),
            DatasetItem(id='2', image=np.ones((2, 8, 3)))
        ])

        with TestDir() as test_dir:
            source_dataset.export(test_dir, format='image_dir')
            zip_path = osp.join(test_dir, 'images.zip')
            make_zip_archive(test_dir, zip_path)

            run(self, 'create', '-o', test_dir)
            run(self, 'add', 'path', '-p', test_dir, '-f', 'image_zip', zip_path)

            export_path = osp.join(test_dir, 'export.zip')
            run(self, 'export', '-p', test_dir, '-f', 'image_zip',
                '-o', test_dir, '--overwrite', '--',
                '--name', osp.basename(export_path)
            )

            parsed_dataset = Dataset.import_from(export_path, format='image_zip')
            compare_datasets(self, source_dataset, parsed_dataset)

    @pytest.mark.reqids(Requirements.DATUM_267)
    def test_can_export_zip_images_from_coco_dataset(self):
        """Exporting a COCO project as ``image_zip`` archives its images."""
        with TestDir() as test_dir:
            # Locate the repository root as everything before the last
            # 'tests' path component of this file's path.
            coco_dir = osp.join(__file__[:__file__.rfind(osp.join('tests', ''))],
                'tests', 'assets', 'coco_dataset')

            run(self, 'create', '-o', test_dir)
            run(self, 'add', 'path', '-p', test_dir, '-f', 'coco', coco_dir)

            export_path = osp.join(test_dir, 'export.zip')
            run(self, 'export', '-p', test_dir, '-f', 'image_zip',
                '-o', test_dir, '--overwrite', '--',
                '--name', osp.basename(export_path))

            self.assertTrue(osp.isfile(export_path))
            with ZipFile(export_path, 'r') as zf:
                images = {f.filename for f in zf.filelist}
                # assertEqual reports the differing members on failure.
                self.assertEqual(images, {'a.jpg', 'b.jpg'})

    @pytest.mark.reqids(Requirements.DATUM_267)
    def test_can_change_extension_for_images_in_zip(self):
        """``--image-ext`` changes the extension of the archived images."""
        source_dataset = Dataset.from_iterable([
            DatasetItem(id='1', image=np.ones((5, 5, 3))),
            DatasetItem(id='2', image=np.ones((2, 8, 3)))
        ])

        with TestDir() as test_dir:
            source_dataset.export(test_dir, format='image_dir', image_ext='.jpg')
            zip_path = osp.join(test_dir, 'images.zip')
            make_zip_archive(test_dir, zip_path)

            run(self, 'create', '-o', test_dir)
            run(self, 'add', 'path', '-p', test_dir, '-f', 'image_zip', zip_path)

            export_path = osp.join(test_dir, 'export.zip')
            run(self, 'export', '-p', test_dir, '-f', 'image_zip',
                '-o', test_dir, '--overwrite', '--',
                '--name', osp.basename(export_path), '--image-ext', '.png')

            self.assertTrue(osp.isfile(export_path))
            with ZipFile(export_path, 'r') as zf:
                images = {f.filename for f in zf.filelist}
                # assertEqual reports the differing members on failure.
                self.assertEqual(images, {'1.png', '2.png'})
1 change: 1 addition & 0 deletions tests/requirements.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ class Requirements:
# GitHub issues (not bugs)
# https://github.com/openvinotoolkit/datumaro/issues
DATUM_244 = "Add Snyk integration"
DATUM_267 = "Add Image zip format"

# GitHub issues (bugs)
# https://github.com/openvinotoolkit/datumaro/issues
Expand Down
Loading

0 comments on commit 54e21bf

Please sign in to comment.