-
Notifications
You must be signed in to change notification settings - Fork 137
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* add tests * add image_zip format * update changelog Co-authored-by: Maxim Zhiltsov <[email protected]>
- Loading branch information
Kirill Sizov
and
Maxim Zhiltsov
authored
Jun 9, 2021
1 parent
839ca35
commit 54e21bf
Showing
8 changed files
with
403 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
# Copyright (C) 2021 Intel Corporation | ||
# | ||
# SPDX-License-Identifier: MIT | ||
|
||
from enum import Enum | ||
import logging as log | ||
import os | ||
import os.path as osp | ||
from zipfile import ZIP_BZIP2, ZIP_DEFLATED, ZIP_LZMA, ZIP_STORED, ZipFile | ||
|
||
from datumaro.components.converter import Converter | ||
from datumaro.components.extractor import DatasetItem, Importer, SourceExtractor | ||
|
||
from datumaro.util import parse_str_enum_value | ||
from datumaro.util.image import IMAGE_EXTENSIONS, ByteImage, encode_image | ||
|
||
class Compression(Enum):
    """Zip compression methods that can be selected for an image archive.

    Every member carries the value of the identically named constant
    imported from ``zipfile``, so ``member.value`` can be handed to
    ``ZipFile`` as its compression argument.
    """

    ZIP_STORED = ZIP_STORED
    ZIP_DEFLATED = ZIP_DEFLATED
    ZIP_BZIP2 = ZIP_BZIP2
    ZIP_LZMA = ZIP_LZMA
|
||
class ImageZipPath:
    """Default values used when writing an image zip archive."""

    # Archive file name used when no explicit name is given.
    DEFAULT_ARCHIVE_NAME = 'default.zip'

    # Compression method used when none is requested.
    DEFAULT_COMPRESSION = Compression.ZIP_STORED
|
||
class ImageZipExtractor(SourceExtractor):
    """Loads the images stored in a zip archive as dataset items."""

    def __init__(self, url, subset=None):
        """Read every image entry of the archive at ``url``.

        Entries whose extension (compared in lower case) is not listed in
        IMAGE_EXTENSIONS are ignored. Each accepted entry becomes a
        DatasetItem whose id is the entry path inside the archive without
        its extension and whose image wraps the raw entry bytes.

        Args:
            url: path to the archive; must end with '.zip'.
            subset: subset name forwarded to the base class.
        """
        super().__init__(subset=subset)

        assert url.endswith('.zip'), url

        with ZipFile(url, 'r') as archive:
            for entry in archive.infolist():
                entry_name = entry.filename
                item_id, extension = osp.splitext(entry_name)
                if extension.lower() in IMAGE_EXTENSIONS:
                    image = ByteImage(data=archive.read(entry_name))
                    item = DatasetItem(
                        id=item_id, image=image, subset=self._subset)
                    self._items.append(item)
|
||
class ImageZipImporter(Importer):
    """Finds zip archives to be loaded with the 'image_zip' format."""

    @classmethod
    def find_sources(cls, path):
        """Return the sources found under ``path``.

        Delegates to the base-class ``_find_sources_recursive`` helper with
        the '.zip' extension and the 'image_zip' format name.
        """
        archive_ext = '.zip'
        format_name = 'image_zip'
        return cls._find_sources_recursive(path, archive_ext, format_name)
|
||
class ImageZipConverter(Converter):
    """Exports the images of a dataset into a single zip archive."""

    DEFAULT_IMAGE_EXT = '.jpg'

    @staticmethod
    def _get_compression_method(s):
        """Convert a compression method name to a ``Compression`` member.

        Used as the argparse ``type`` callback of ``--compression``; the
        lookup is case-insensitive.

        Raises:
            argparse.ArgumentTypeError: ``s`` is not a known method name.
                The message lists the accepted names so that the command
                line error is actionable.
        """
        try:
            return Compression[s.upper()]
        except KeyError:
            import argparse
            raise argparse.ArgumentTypeError(
                "Unknown compression method '%s', available methods: %s" %
                (s, ', '.join(e.name for e in Compression))) from None

    @classmethod
    def build_cmdline_parser(cls, **kwargs):
        """Extend the base converter parser with ``--name`` and
        ``--compression`` options."""
        parser = super().build_cmdline_parser(**kwargs)

        parser.add_argument('--name', type=str,
            default=ImageZipPath.DEFAULT_ARCHIVE_NAME,
            help="Name of output zipfile (default: %(default)s)"
        )

        # str.format() fills in the method list now; '%(default)s' is left
        # untouched for argparse to expand when it renders the help.
        compression_help = (
            "Archive compression method.\nAvailable methods: {} "
            "(default: %(default)s)"
        ).format(', '.join(e.name for e in Compression))
        parser.add_argument('--compression', type=cls._get_compression_method,
            default=ImageZipPath.DEFAULT_COMPRESSION.name,
            help=compression_help
        )

        return parser

    def __init__(self, extractor, save_dir, name=None,
            compression=None, **kwargs):
        """
        Args:
            extractor: dataset source forwarded to the base converter.
            save_dir: output directory forwarded to the base converter.
            name: archive file name; falls back to
                ImageZipPath.DEFAULT_ARCHIVE_NAME when None.
            compression: compression method, resolved through
                parse_str_enum_value with
                ImageZipPath.DEFAULT_COMPRESSION as the default.
            **kwargs: remaining options forwarded to the base converter.
        """
        super().__init__(extractor, save_dir, **kwargs)

        if name is None:
            name = ImageZipPath.DEFAULT_ARCHIVE_NAME

        compression = parse_str_enum_value(compression, Compression,
            default=ImageZipPath.DEFAULT_COMPRESSION)

        self._archive_name = name
        # ZipFile expects the raw zipfile constant, not the enum member.
        self._compression = compression.value

    def apply(self):
        """Write every item that has an image into the output archive.

        Raises:
            FileExistsError: the target archive already exists.
        """
        os.makedirs(self._save_dir, exist_ok=True)

        archive_path = osp.join(self._save_dir, self._archive_name)

        if osp.exists(archive_path):
            raise FileExistsError('Zip file: %s, already exists, '
                'specify archive name with --name extra argument'
                % archive_path)

        with ZipFile(archive_path, 'w', self._compression) as zf:
            for item in self._extractor:
                if item.has_image:
                    self._archive_image(zf, item)
                else:
                    log.debug("Item '%s' has no image info", item.id)

    def _archive_image(self, zipfile, item):
        """Store the image of ``item`` in the open archive ``zipfile``.

        The source of the bytes is chosen in this order: the image file on
        disk, the raw bytes of a ByteImage, and finally the decoded image
        data encoded with the extension of the target file name. If none of
        these is available, nothing is written.
        """
        image_name = self._make_image_filename(item)
        if osp.isfile(item.image.path):
            zipfile.write(item.image.path, arcname=image_name)
        elif isinstance(item.image, ByteImage):
            zipfile.writestr(image_name, item.image.get_bytes())
        elif item.image.has_data:
            zipfile.writestr(image_name,
                encode_image(item.image.data, osp.splitext(image_name)[1]))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
# Image zip user manual | ||
|
||
## Contents | ||
- [Format specification](#format-specification) | ||
- [Load image zip dataset](#load-image-zip-dataset) | ||
- [Export to other formats](#export-to-other-formats) | ||
- [Export unannotated dataset to zip archive](#export-unannotated-dataset-to-zip-archive) | ||
|
||
## Format specification | ||
|
||
- The image zip format allows exporting/importing unannotated datasets | ||
with images to/from a zip archive. | ||
|
||
- The image zip format doesn't support any types of annotations | ||
and attributes. | ||
|
||
## Load Image zip dataset | ||
|
||
There are a few ways to load unannotated datasets into your Datumaro project: | ||
|
||
- From existing archive: | ||
|
||
```bash | ||
datum import -o project -f image_zip -i ./images.zip | ||
``` | ||
|
||
- From a directory with zip archives. Datumaro will load images from | ||
all zip files in the directory: | ||
|
||
```bash | ||
datum import -o project -f image_zip -i ./foo | ||
``` | ||
|
||
The directory with zip archives should have the following structure: | ||
|
||
``` | ||
├── foo/ | ||
| ├── archive1.zip/ | ||
| | ├── image_1.jpg | ||
| | ├── image_2.png | ||
| | ├── subdir/ | ||
| | | ├── image_3.jpg | ||
| | | ├── ... | ||
| | ├── ... | ||
| ├── archive2.zip/ | ||
| | ├── image_101.jpg | ||
| | ├── image_102.jpg | ||
| | ├── ... | ||
| ... | ||
``` | ||
|
||
Images in the archives must have a supported extension; | ||
see the [user manual](../user_manual.md#data-formats) for the list of supported extensions. | ||
|
||
## Export to other formats | ||
|
||
Datumaro can load dataset images from a zip archive and convert it to | ||
[another supported dataset format](../user_manual.md#supported-formats), | ||
for example: | ||
|
||
```bash | ||
datum import -o project -f image_zip -i ./images.zip | ||
datum export -f coco -o ./new_dir -- --save-images | ||
``` | ||
|
||
## Export unannotated dataset to zip archive | ||
|
||
Example: exporting images from VOC dataset to zip archives: | ||
```bash | ||
datum import -o project -f voc -i ./VOC2012 | ||
datum export -f image_zip -o ./ --overwrite -- --name voc_images.zip \ | ||
--compression ZIP_DEFLATED | ||
``` | ||
|
||
Extra options for export to image_zip format: | ||
|
||
- `--save-images` allows exporting the dataset with its images saved | ||
(default: `False`); | ||
- `--image-ext <IMAGE_EXT>` allows specifying the image extension | ||
for the exported dataset (default: use original or `.jpg`, if none); | ||
- `--name` name of output zipfile (default: `default.zip`); | ||
- `--compression` allows specifying the archive compression method. Available methods: | ||
`ZIP_STORED`, `ZIP_DEFLATED`, `ZIP_BZIP2`, `ZIP_LZMA` (default: `ZIP_STORED`). | ||
See the [zip documentation](https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT) | ||
for more information. |
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
import os | ||
import os.path as osp | ||
from unittest import TestCase | ||
from zipfile import ZipFile | ||
|
||
import numpy as np | ||
import pytest | ||
|
||
from datumaro.cli.__main__ import main | ||
from datumaro.components.dataset import Dataset, DatasetItem | ||
from datumaro.util.test_utils import TestDir, compare_datasets | ||
|
||
from ..requirements import Requirements | ||
|
||
|
||
def run(test, *args, expected_code=0):
    """Call the CLI entry point ``main`` with ``args`` and check its result.

    Args:
        test: the TestCase used to report the assertion.
        *args: command line arguments, passed to ``main`` as a tuple.
        expected_code: return code the invocation must produce.
    """
    actual_code = main(args)
    test.assertEqual(expected_code, actual_code, str(args))
|
||
def make_zip_archive(src_path, dst_path):
    """Pack every file found under ``src_path`` into a new zip archive.

    Entries are stored under their path relative to ``src_path``.

    Args:
        src_path: directory whose files are archived (walked recursively).
        dst_path: path of the archive to create; may be located inside
            ``src_path``.
    """
    def _normalized(path):
        return osp.normcase(osp.abspath(path))

    archive_file = _normalized(dst_path)
    with ZipFile(dst_path, 'w') as archive:
        for dirpath, _, filenames in os.walk(src_path):
            for name in filenames:
                path = osp.join(dirpath, name)
                # The archive file already exists on disk at this point;
                # when it lives inside src_path it must not be added to
                # itself.
                if _normalized(path) == archive_file:
                    continue
                archive.write(path, osp.relpath(path, src_path))
|
||
class ImageZipIntegrationScenarios(TestCase):
    """Command line scenarios for importing and exporting image_zip data."""

    @pytest.mark.reqids(Requirements.DATUM_267)
    def test_can_save_and_load(self):
        """Images imported from a zip survive an export/import round trip."""
        source_dataset = Dataset.from_iterable([
            DatasetItem(id='1', image=np.ones((5, 5, 3))),
            DatasetItem(id='2', image=np.ones((2, 8, 3)))
        ])

        with TestDir() as test_dir:
            source_dataset.export(test_dir, format='image_dir')
            zip_path = osp.join(test_dir, 'images.zip')
            make_zip_archive(test_dir, zip_path)

            run(self, 'create', '-o', test_dir)
            run(self, 'add', 'path', '-p', test_dir, '-f', 'image_zip',
                zip_path)

            export_path = osp.join(test_dir, 'export.zip')
            run(self, 'export', '-p', test_dir, '-f', 'image_zip',
                '-o', test_dir, '--overwrite', '--',
                '--name', osp.basename(export_path)
            )

            parsed_dataset = Dataset.import_from(export_path,
                format='image_zip')
            compare_datasets(self, source_dataset, parsed_dataset)

    @pytest.mark.reqids(Requirements.DATUM_267)
    def test_can_export_zip_images_from_coco_dataset(self):
        """Exporting a COCO project to image_zip stores only its images."""
        with TestDir() as test_dir:
            # Cut this file's path at its last 'tests' component to locate
            # the assets directory.
            coco_dir = osp.join(
                __file__[:__file__.rfind(osp.join('tests', ''))],
                'tests', 'assets', 'coco_dataset')

            run(self, 'create', '-o', test_dir)
            run(self, 'add', 'path', '-p', test_dir, '-f', 'coco', coco_dir)

            export_path = osp.join(test_dir, 'export.zip')
            run(self, 'export', '-p', test_dir, '-f', 'image_zip',
                '-o', test_dir, '--overwrite', '--',
                '--name', osp.basename(export_path))

            self.assertTrue(osp.isfile(export_path))
            with ZipFile(export_path, 'r') as zf:
                images = set(zf.namelist())
            # assertEqual reports the differing names on failure.
            self.assertEqual({'a.jpg', 'b.jpg'}, images)

    @pytest.mark.reqids(Requirements.DATUM_267)
    def test_can_change_extension_for_images_in_zip(self):
        """``--image-ext`` controls the extension of the archived images."""
        source_dataset = Dataset.from_iterable([
            DatasetItem(id='1', image=np.ones((5, 5, 3))),
            DatasetItem(id='2', image=np.ones((2, 8, 3)))
        ])

        with TestDir() as test_dir:
            source_dataset.export(test_dir, format='image_dir',
                image_ext='.jpg')
            zip_path = osp.join(test_dir, 'images.zip')
            make_zip_archive(test_dir, zip_path)

            run(self, 'create', '-o', test_dir)
            run(self, 'add', 'path', '-p', test_dir, '-f', 'image_zip',
                zip_path)

            export_path = osp.join(test_dir, 'export.zip')
            run(self, 'export', '-p', test_dir, '-f', 'image_zip',
                '-o', test_dir, '--overwrite', '--',
                '--name', osp.basename(export_path), '--image-ext', '.png')

            self.assertTrue(osp.isfile(export_path))
            with ZipFile(export_path, 'r') as zf:
                images = set(zf.namelist())
            # assertEqual reports the differing names on failure.
            self.assertEqual({'1.png', '2.png'}, images)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.