Support arbitrary image extensions in formats (base + ImageNet, ImageDir) (cvat-ai#169)

* add image search function
* rename image dir format, add ext support
* add image dir tests
* add exts in imagenet formats
* add more extensions
TOsmanov · Mar 18, 2021 · 745c16a · 745c16a
1 parent ef12d30
commit 745c16a
Show file tree

Hide file tree

Showing 9 changed files with 159 additions and 114 deletions.
diff --git a/datumaro/plugins/image_dir.py b/datumaro/plugins/image_dir_format.py
@@ -9,7 +9,7 @@
 
 from datumaro.components.extractor import DatasetItem, SourceExtractor, Importer
 from datumaro.components.converter import Converter
-from datumaro.util.os_util import walk
+from datumaro.util.image import find_images
 
 
 class ImageDirImporter(Importer):
@@ -20,21 +20,15 @@ def find_sources(cls, path):
         return [{ 'url': path, 'format': 'image_dir' }]
 
 class ImageDirExtractor(SourceExtractor):
-    IMAGE_EXT_FORMATS = {'.jpg', '.jpeg', '.png', '.ppm', '.bmp',
-        '.pgm', '.tif', '.tiff'}
-
-    def __init__(self, url, max_depth=10):
-        super().__init__()
+    def __init__(self, url, subset=None, max_depth=None):
+        super().__init__(subset=subset)
 
         assert osp.isdir(url), url
 
-        for dirpath, _, filenames in walk(url, max_depth=max_depth):
-            for name in filenames:
-                if not osp.splitext(name)[-1] in self.IMAGE_EXT_FORMATS:
-                    continue
-                path = osp.join(dirpath, name)
-                item_id = osp.relpath(osp.splitext(path)[0], url)
-                self._items.append(DatasetItem(id=item_id, image=path))
+        for path in find_images(url, recursive=True, max_depth=max_depth):
+            item_id = osp.relpath(osp.splitext(path)[0], url)
+            self._items.append(DatasetItem(id=item_id, subset=self._subset,
+                image=path))
 
 class ImageDirConverter(Converter):
     DEFAULT_IMAGE_EXT = '.jpg'

diff --git a/datumaro/plugins/imagenet_format.py b/datumaro/plugins/imagenet_format.py
@@ -2,7 +2,6 @@
 #
 # SPDX-License-Identifier: MIT
 
-from glob import glob
 import logging as log
 import os
 import os.path as osp
@@ -11,13 +10,11 @@
     LabelCategories, AnnotationType, SourceExtractor, Importer
 )
 from datumaro.components.converter import Converter
+from datumaro.util.image import find_images
 
 
 class ImagenetPath:
-    DEFAULT_IMAGE_EXT = '.jpg'
-    IMAGE_EXT_FORMATS = {'.jpg', '.jpeg', '.png', '.ppm', '.bmp',
-        '.pgm', '.tif', '.tiff'}
-    IMAGES_DIR_NO_LABEL = 'no_label'
+    IMAGE_DIR_NO_LABEL = 'no_label'
 
 
 class ImagenetExtractor(SourceExtractor):
@@ -30,29 +27,31 @@ def __init__(self, path, subset=None):
 
     def _load_categories(self, path):
         label_cat = LabelCategories()
-        for images_dir in sorted(os.listdir(path)):
-            if images_dir != ImagenetPath.IMAGES_DIR_NO_LABEL:
-                label_cat.add(images_dir)
+        for dirname in sorted(os.listdir(path)):
+            if dirname != ImagenetPath.IMAGE_DIR_NO_LABEL:
+                label_cat.add(dirname)
         return { AnnotationType.label: label_cat }
 
     def _load_items(self, path):
         items = {}
-        for image_path in glob(osp.join(path, '*', '*')):
-            if not osp.isfile(image_path) or \
-                    osp.splitext(image_path)[-1].lower() not in \
-                        ImagenetPath.IMAGE_EXT_FORMATS:
-                continue
+
+        for image_path in find_images(path, recursive=True, max_depth=1):
             label = osp.basename(osp.dirname(image_path))
-            image_name = osp.splitext(osp.basename(image_path))[0][len(label) + 1:]
+            image_name = osp.splitext(osp.basename(image_path))[0]
+            if image_name.startswith(label + '_'):
+                image_name = image_name[len(label) + 1:]
+
             item = items.get(image_name)
             if item is None:
                 item = DatasetItem(id=image_name, subset=self._subset,
                     image=image_path)
+                items[image_name] = item
             annotations = item.annotations
-            if label != ImagenetPath.IMAGES_DIR_NO_LABEL:
+
+            if label != ImagenetPath.IMAGE_DIR_NO_LABEL:
                 label = self._categories[AnnotationType.label].find(label)[0]
                 annotations.append(Label(label=label))
-            items[image_name] = item
+
         return items
 
 
@@ -65,27 +64,27 @@ def find_sources(cls, path):
 
 
 class ImagenetConverter(Converter):
-    DEFAULT_IMAGE_EXT = ImagenetPath.DEFAULT_IMAGE_EXT
+    DEFAULT_IMAGE_EXT = '.jpg'
 
     def apply(self):
         if 1 < len(self._extractor.subsets()):
-            log.warning("ImageNet format supports exporting only a single "
+            log.warning("ImageNet format only supports exporting a single "
                 "subset, subset information will not be used.")
 
         subset_dir = self._save_dir
         extractor = self._extractor
         labels = {}
         for item in self._extractor:
-            image_name = item.id
-            labels[image_name] = [p.label for p in item.annotations
-                if p.type == AnnotationType.label]
-            for label in labels[image_name]:
+            labels = set(p.label for p in item.annotations
+                if p.type == AnnotationType.label)
+
+            for label in labels:
                 label_name = extractor.categories()[AnnotationType.label][label].name
                 self._save_image(item, osp.join(subset_dir, label_name,
                     '%s_%s' %  (label_name, self._make_image_filename(item))))
 
-            if not labels[image_name]:
+            if not labels:
                 self._save_image(item, osp.join(subset_dir,
-                    ImagenetPath.IMAGES_DIR_NO_LABEL,
-                    ImagenetPath.IMAGES_DIR_NO_LABEL + '_'
-                    + self._make_image_filename(item)))
+                    ImagenetPath.IMAGE_DIR_NO_LABEL,
+                    ImagenetPath.IMAGE_DIR_NO_LABEL + '_' + \
+                    self._make_image_filename(item)))
diff --git a/datumaro/plugins/imagenet_txt_format.py b/datumaro/plugins/imagenet_txt_format.py
@@ -3,26 +3,27 @@
 #
 # SPDX-License-Identifier: MIT
 
-from glob import glob
 import os
 import os.path as osp
 
 from datumaro.components.extractor import (DatasetItem, Label,
     LabelCategories, AnnotationType, SourceExtractor, Importer
 )
 from datumaro.components.converter import Converter
+from datumaro.util.image import find_images
 
 
 class ImagenetTxtPath:
-    DEFAULT_IMAGE_EXT = '.jpg'
-    IMAGE_EXT_FORMAT = ['.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif']
     LABELS_FILE = 'synsets.txt'
     IMAGE_DIR = 'images'
 
 class ImagenetTxtExtractor(SourceExtractor):
-    def __init__(self, path, labels=None, image_dir=None):
+    def __init__(self, path, labels=None, image_dir=None, subset=None):
         assert osp.isfile(path), path
-        super().__init__(subset=osp.splitext(osp.basename(path))[0])
+
+        if not subset:
+            subset = osp.splitext(osp.basename(path))[0]
+        super().__init__(subset=subset)
 
         if not image_dir:
             image_dir = ImagenetTxtPath.IMAGE_DIR
@@ -33,8 +34,8 @@ def __init__(self, path, labels=None, image_dir=None):
             labels = self._parse_labels(labels)
         else:
             assert all(isinstance(e, str) for e in labels)
-
         self._categories = self._load_categories(labels)
+
         self._items = list(self._load_items(path).values())
 
     @staticmethod
@@ -47,6 +48,14 @@ def _load_categories(self, labels):
 
     def _load_items(self, path):
         items = {}
+
+        image_dir = self.image_dir
+        if osp.isdir(image_dir):
+            images = { osp.splitext(osp.relpath(p, image_dir))[0]: p
+                for p in find_images(image_dir, recursive=True) }
+        else:
+            images = {}
+
         with open(path, encoding='utf-8') as f:
             for line in f:
                 item = line.split('\"')
@@ -61,20 +70,17 @@ def _load_items(self, path):
                     item = line.split()
                     item_id = item[0]
                     label_ids = [int(id) for id in item[1:]]
+
                 anno = []
                 for label in label_ids:
                     assert 0 <= label and \
                         label < len(self._categories[AnnotationType.label]), \
                         "Image '%s': unknown label id '%s'" % (item_id, label)
                     anno.append(Label(label))
-                image_path = osp.join(self.image_dir, item_id +
-                    ImagenetTxtPath.DEFAULT_IMAGE_EXT)
-                for path in glob(osp.join(self.image_dir, item_id + '*')):
-                    if osp.splitext(path)[1] in ImagenetTxtPath.IMAGE_EXT_FORMAT:
-                        image_path = path
-                        break
+
                 items[item_id] = DatasetItem(id=item_id, subset=self._subset,
-                    image=image_path, annotations=anno)
+                    image=images.get(item_id), annotations=anno)
+
         return items
 
 
@@ -87,7 +93,7 @@ def find_sources(cls, path):
 
 
 class ImagenetTxtConverter(Converter):
-    DEFAULT_IMAGE_EXT = ImagenetTxtPath.DEFAULT_IMAGE_EXT
+    DEFAULT_IMAGE_EXT = '.jpg'
 
     def apply(self):
         subset_dir = self._save_dir
@@ -96,10 +102,11 @@ def apply(self):
         extractor = self._extractor
         for subset_name, subset in self._extractor.subsets().items():
             annotation_file = osp.join(subset_dir, '%s.txt' % subset_name)
+
             labels = {}
             for item in subset:
-                labels[item.id] = [str(p.label) for p in item.annotations
-                    if p.type == AnnotationType.label]
+                labels[item.id] = set(p.label for p in item.annotations
+                    if p.type == AnnotationType.label)
 
                 if self._save_images and item.has_image:
                     self._save_image(item, subdir=ImagenetTxtPath.IMAGE_DIR)
@@ -108,13 +115,15 @@ def apply(self):
             for item_id, item_labels in labels.items():
                 if 1 < len(item_id.split()):
                     item_id = '\"' + item_id + '\"'
-                annotation += '%s %s\n' % (item_id, ' '.join(item_labels))
+                annotation += '%s %s\n' % (
+                    item_id, ' '.join(str(l) for l in item_labels))
 
             with open(annotation_file, 'w', encoding='utf-8') as f:
                 f.write(annotation)
 
         labels_file = osp.join(subset_dir, ImagenetTxtPath.LABELS_FILE)
         with open(labels_file, 'w', encoding='utf-8') as f:
-            f.write('\n'.join(l.name
-                for l in extractor.categories()[AnnotationType.label])
+            f.writelines(l.name + '\n'
+                for l in extractor.categories().get(
+                    AnnotationType.label, LabelCategories())
             )
diff --git a/datumaro/util/image.py b/datumaro/util/image.py
@@ -7,6 +7,7 @@
 
 from enum import Enum
 from io import BytesIO
+from typing import Iterator, Iterable, Union
 import numpy as np
 import os
 import os.path as osp
@@ -21,6 +22,7 @@
     _IMAGE_BACKEND = _IMAGE_BACKENDS.PIL
 
 from datumaro.util.image_cache import ImageCache as _ImageCache
+from datumaro.util.os_util import walk
 
 
 def load_image(path, dtype=np.float32):
@@ -153,6 +155,37 @@ def decode_image(image_bytes, dtype=np.float32):
         assert image.shape[2] in {3, 4}
     return image
 
+IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.jpe', '.jp2',
+    '.png', '.bmp', '.dib', '.tif', '.tiff', '.tga', '.webp', '.pfm',
+    '.sr', '.ras', '.exr', '.hdr', '.pic',
+    '.pbm', '.pgm', '.ppm', '.pxm', '.pnm',
+}
+
+def find_images(dirpath: str, exts: Union[str, Iterable[str]] = None,
+        recursive: bool = False, max_depth: int = None) -> Iterator[str]:
+    if isinstance(exts, str):
+        exts = [exts.lower()]
+    elif exts is None:
+        exts = IMAGE_EXTENSIONS
+    else:
+        exts = list(e.lower() for e in exts)
+
+    def _check_image_ext(filename: str):
+        dotpos = filename.rfind('.')
+        if 0 < dotpos: # exclude '.ext' cases too
+            ext = filename[dotpos:].lower()
+            if ext in exts:
+                return True
+        return False
+
+    for d, _, filenames in walk(dirpath,
+            max_depth=max_depth if recursive else 0):
+        for filename in filenames:
+            if not _check_image_ext(filename):
+                continue
+
+            yield osp.join(d, filename)
+
 
 class lazy_image:
     def __init__(self, path, loader=None, cache=None):

diff --git a/datumaro/util/os_util.py b/datumaro/util/os_util.py
@@ -9,6 +9,8 @@
 import sys
 
 
+DEFAULT_MAX_DEPTH = 10
+
 def check_instruction_set(instruction):
     return instruction == str.strip(
         # Let's ignore a warning from bandit about using shell=True.
@@ -34,6 +36,9 @@ def import_foreign_module(name, path, package=None):
     return module
 
 def walk(path, max_depth=None):
+    if max_depth is None:
+        max_depth = DEFAULT_MAX_DEPTH
+
     baselevel = path.count(osp.sep)
     for dirpath, dirnames, filenames in os.walk(path, topdown=True):
         curlevel = dirpath.count(osp.sep)

diff --git a/datumaro/util/test_utils.py b/datumaro/util/test_utils.py
@@ -144,7 +144,7 @@ def compare_datasets_strict(test, expected, actual):
                 (idx, item_a, item_b))
 
 def test_save_and_load(test, source_dataset, converter, test_dir, importer,
-        target_dataset=None, importer_args=None, compare=None):
+        target_dataset=None, importer_args=None, compare=None, **kwargs):
     converter(source_dataset, test_dir)
 
     if importer_args is None:
@@ -156,4 +156,4 @@ def test_save_and_load(test, source_dataset, converter, test_dir, importer,
 
     if not compare:
         compare = compare_datasets
-    compare(test, expected=target_dataset, actual=parsed_dataset)
+    compare(test, expected=target_dataset, actual=parsed_dataset, **kwargs)
diff --git a/tests/test_image_dir_format.py b/tests/test_image_dir_format.py
@@ -4,7 +4,8 @@
 
 from datumaro.components.project import Dataset
 from datumaro.components.extractor import DatasetItem
-from datumaro.plugins.image_dir import ImageDirConverter
+from datumaro.plugins.image_dir_format import ImageDirConverter
+from datumaro.util.image import Image
 from datumaro.util.test_utils import TestDir, test_save_and_load
 
 
@@ -17,7 +18,7 @@ def test_can_load(self):
 
         with TestDir() as test_dir:
             test_save_and_load(self, dataset, ImageDirConverter.convert,
-                test_dir, importer='image_dir')
+                test_dir, importer='image_dir', require_images=True)
 
     def test_relative_paths(self):
         dataset = Dataset.from_iterable([
@@ -38,3 +39,15 @@ def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self):
         with TestDir() as test_dir:
             test_save_and_load(self, dataset, ImageDirConverter.convert,
                 test_dir, importer='image_dir')
+
+    def test_can_save_and_load_image_with_arbitrary_extension(self):
+        dataset = Dataset.from_iterable([
+            DatasetItem(id='q/1', image=Image(path='q/1.JPEG',
+                data=np.zeros((4, 3, 3)))),
+            DatasetItem(id='a/b/c/2', image=Image(path='a/b/c/2.bmp',
+                data=np.zeros((3, 4, 3)))),
+        ])
+
+        with TestDir() as test_dir:
+            test_save_and_load(self, dataset, ImageDirConverter.convert,
+                test_dir, importer='image_dir', require_images=True)