Dataset meta file for some formats (#569)

* add meta file support in CelebA, CIFAR, MNIST, LFW, YOLO, COCO, SYNTHIA, ADE20K
* update documentation
* update tests
* update changelog
openvinotoolkit · Nov 30, 2021 · 82cdb62 · 82cdb62
1 parent ba452e5
commit 82cdb62
Show file tree

Hide file tree

Showing 115 changed files with 704 additions and 32 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -16,10 +16,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   (<https://github.com/openvinotoolkit/datumaro/pull/547>)
 - Import for SYNTHIA dataset format
   (<https://github.com/openvinotoolkit/datumaro/pull/532>)
-- Support for Accuracy Checker dataset meta files in formats
-  (<https://github.com/openvinotoolkit/datumaro/pull/553>)
 - Support of `score` attribute in KITTI detection
   (<https://github.com/openvinotoolkit/datumaro/pull/571>)
+- Support for Accuracy Checker dataset meta files in formats
+  (<https://github.com/openvinotoolkit/datumaro/pull/553>,
+  <https://github.com/openvinotoolkit/datumaro/pull/569>)
 
 ### Changed
 - The following formats can now be detected unambiguously:

diff --git a/datumaro/plugins/ade20k2017_format.py b/datumaro/plugins/ade20k2017_format.py
@@ -18,6 +18,7 @@
 from datumaro.util.image import (
     IMAGE_EXTENSIONS, find_images, lazy_image, load_image,
 )
+from datumaro.util.meta_file_util import has_meta_file, parse_meta_file
 
 
 class Ade20k2017Path:
@@ -31,7 +32,9 @@ def __init__(self, path):
         if not osp.isdir(path):
             raise FileNotFoundError("Can't read dataset directory '%s'" % path)
 
-        subsets = os.listdir(path)
+        # exclude dataset meta file
+        subsets = [subset for subset in os.listdir(path)
+            if osp.splitext(subset)[-1] != '.json']
         if len(subsets) < 1:
             raise FileNotFoundError("Can't read subsets in directory '%s'" % path)
 
@@ -41,6 +44,10 @@ def __init__(self, path):
         self._items = []
         self._categories  = {}
 
+        if has_meta_file(self._path):
+            self._categories =  { AnnotationType.label: LabelCategories().
+                from_iterable(list(parse_meta_file(self._path).keys())) }
+
         for subset in self._subsets:
             self._load_items(subset)
 

diff --git a/datumaro/plugins/ade20k2020_format.py b/datumaro/plugins/ade20k2020_format.py
@@ -19,6 +19,7 @@
 from datumaro.util.image import (
     IMAGE_EXTENSIONS, find_images, lazy_image, load_image,
 )
+from datumaro.util.meta_file_util import has_meta_file, parse_meta_file
 
 
 class Ade20k2020Path:
@@ -33,7 +34,9 @@ def __init__(self, path):
         if not osp.isdir(path):
             raise FileNotFoundError("Can't read dataset directory '%s'" % path)
 
-        subsets = os.listdir(path)
+        # exclude dataset meta file
+        subsets = [subset for subset in os.listdir(path)
+            if osp.splitext(subset)[-1] != '.json']
         if len(subsets) < 1:
             raise FileNotFoundError("Can't read subsets in directory '%s'" % path)
 
@@ -43,6 +46,10 @@ def __init__(self, path):
         self._items = []
         self._categories  = {}
 
+        if has_meta_file(self._path):
+            self._categories =  { AnnotationType.label: LabelCategories().
+                from_iterable(list(parse_meta_file(self._path).keys())) }
+
         for subset in self._subsets:
             self._load_items(subset)
 

diff --git a/datumaro/plugins/align_celeba_format.py b/datumaro/plugins/align_celeba_format.py
@@ -10,6 +10,7 @@
 from datumaro.components.errors import DatasetImportError
 from datumaro.components.extractor import DatasetItem, Importer, SourceExtractor
 from datumaro.util.image import find_images
+from datumaro.util.meta_file_util import has_meta_file, parse_meta_file
 
 
 class AlignCelebaPath:
@@ -32,6 +33,10 @@ def __init__(self, path):
         self._anno_dir = osp.dirname(path)
 
         self._categories = { AnnotationType.label: LabelCategories() }
+        if has_meta_file(path):
+            self._categories = { AnnotationType.label: LabelCategories().
+                from_iterable(list(parse_meta_file(path).keys())) }
+
         self._items = list(self._load_items(path).values())
 
     def _load_items(self, root_dir):

diff --git a/datumaro/plugins/celeba_format.py b/datumaro/plugins/celeba_format.py
@@ -10,6 +10,7 @@
 from datumaro.components.errors import DatasetImportError
 from datumaro.components.extractor import DatasetItem, Importer, SourceExtractor
 from datumaro.util.image import find_images
+from datumaro.util.meta_file_util import has_meta_file, parse_meta_file
 
 
 class CelebaPath:
@@ -30,6 +31,10 @@ def __init__(self, path):
         super().__init__()
 
         self._categories = { AnnotationType.label: LabelCategories() }
+        if has_meta_file(path):
+            self._categories = { AnnotationType.label: LabelCategories().
+                from_iterable(list(parse_meta_file(path).keys())) }
+
         self._items = list(self._load_items(path).values())
 
     def _load_items(self, root_dir):

diff --git a/datumaro/plugins/cifar_format.py b/datumaro/plugins/cifar_format.py
@@ -17,6 +17,7 @@
 from datumaro.components.dataset import ItemStatus
 from datumaro.components.extractor import DatasetItem, Importer, SourceExtractor
 from datumaro.util import cast
+from datumaro.util.meta_file_util import has_meta_file, parse_meta_file
 
 
 class RestrictedUnpickler(pickle.Unpickler):
@@ -65,6 +66,10 @@ def __init__(self, path, subset=None):
         self._items = list(self._load_items(path).values())
 
     def _load_categories(self, path):
+        if has_meta_file(path):
+            return { AnnotationType.label: LabelCategories().
+                from_iterable(list(parse_meta_file(path).keys())) }
+
         label_cat = LabelCategories()
 
         meta_file = osp.join(path, CifarPath.META_10_FILE)
@@ -175,6 +180,9 @@ class CifarConverter(Converter):
     def apply(self):
         os.makedirs(self._save_dir, exist_ok=True)
 
+        if self._save_dataset_meta:
+            self._save_meta_file(self._save_dir)
+
         label_categories = self._extractor.categories()[AnnotationType.label]
         label_names = []
         coarse_label_names = []

diff --git a/datumaro/plugins/kitti_format/converter.py b/datumaro/plugins/kitti_format/converter.py
@@ -18,6 +18,7 @@
 from datumaro.util.annotation_util import make_label_id_mapping
 from datumaro.util.image import save_image
 from datumaro.util.mask_tools import paint_mask
+from datumaro.util.meta_file_util import is_meta_file, parse_meta_file
 
 from .format import (
     KittiLabelMap, KittiPath, KittiTask, make_kitti_categories, parse_label_map,
@@ -156,8 +157,11 @@ def get_label(self, label_id):
             categories()[AnnotationType.label].items[label_id].name
 
     def save_label_map(self):
-        path = osp.join(self._save_dir, KittiPath.LABELMAP_FILE)
-        write_label_map(path, self._label_map)
+        if self._save_dataset_meta:
+            self._save_meta_file(self._save_dir)
+        else:
+            path = osp.join(self._save_dir, KittiPath.LABELMAP_FILE)
+            write_label_map(path, self._label_map)
 
     def _load_categories(self, label_map_source):
         if label_map_source == LabelmapType.kitti.name:
@@ -188,7 +192,10 @@ def _load_categories(self, label_map_source):
                 sorted(label_map_source.items(), key=lambda e: e[0]))
 
         elif isinstance(label_map_source, str) and osp.isfile(label_map_source):
-            label_map = parse_label_map(label_map_source)
+            if is_meta_file(label_map_source):
+                label_map = parse_meta_file(label_map_source)
+            else:
+                label_map = parse_label_map(label_map_source)
 
         else:
             raise Exception("Wrong labelmap specified, "

diff --git a/datumaro/plugins/kitti_format/extractor.py b/datumaro/plugins/kitti_format/extractor.py
@@ -12,6 +12,7 @@
     AnnotationType, DatasetItem, SourceExtractor,
 )
 from datumaro.util.image import find_images, load_image
+from datumaro.util.meta_file_util import has_meta_file, parse_meta_file
 
 from .format import (
     KittiLabelMap, KittiPath, KittiTask, make_kitti_categories, parse_label_map,
@@ -36,15 +37,23 @@ def _load_categories(self, path):
         if self._task == KittiTask.segmentation:
             return self._load_categories_segmentation(path)
         elif self._task == KittiTask.detection:
+            if has_meta_file(path):
+                return { AnnotationType.label: LabelCategories().
+                    from_iterable(list(parse_meta_file(path).keys())) }
+
             return {AnnotationType.label: LabelCategories()}
 
     def _load_categories_segmentation(self, path):
         label_map = None
-        label_map_path = osp.join(path, KittiPath.LABELMAP_FILE)
-        if osp.isfile(label_map_path):
-            label_map = parse_label_map(label_map_path)
+        if has_meta_file(path):
+            label_map = parse_meta_file(path)
         else:
-            label_map = KittiLabelMap
+            label_map_path = osp.join(path, KittiPath.LABELMAP_FILE)
+            if osp.isfile(label_map_path):
+                label_map = parse_label_map(label_map_path)
+            else:
+                label_map = KittiLabelMap
+
         self._labels = [label for label in label_map]
         return make_kitti_categories(label_map)
 

diff --git a/datumaro/plugins/lfw_format.py b/datumaro/plugins/lfw_format.py
@@ -13,6 +13,7 @@
 from datumaro.components.extractor import DatasetItem, Importer, SourceExtractor
 from datumaro.components.format_detection import FormatDetectionContext
 from datumaro.util.image import find_images
+from datumaro.util.meta_file_util import has_meta_file, parse_meta_file
 
 
 class LfwPath:
@@ -44,6 +45,10 @@ def __init__(self, path, subset=None):
         self._items = list(self._load_items(path).values())
 
     def _load_categories(self, path):
+        if has_meta_file(self._dataset_dir):
+            return { AnnotationType.label: LabelCategories().
+                from_iterable(list(parse_meta_file(self._dataset_dir).keys())) }
+
         label_cat = LabelCategories()
         if osp.isfile(path):
             with open(path, encoding='utf-8') as labels_file:
@@ -188,6 +193,10 @@ class LfwConverter(Converter):
     DEFAULT_IMAGE_EXT = LfwPath.IMAGE_EXT
 
     def apply(self):
+        os.makedirs(self._save_dir, exist_ok=True)
+        if self._save_dataset_meta:
+            self._save_meta_file(self._save_dir)
+
         for subset_name, subset in self._extractor.subsets().items():
             label_categories = self._extractor.categories()[AnnotationType.label]
             labels = {label.name: 0 for label in label_categories}

diff --git a/datumaro/plugins/mnist_csv_format.py b/datumaro/plugins/mnist_csv_format.py
@@ -12,6 +12,7 @@
 )
 from datumaro.components.converter import Converter
 from datumaro.components.extractor import DatasetItem, Importer, SourceExtractor
+from datumaro.util.meta_file_util import has_meta_file, parse_meta_file
 
 
 class MnistCsvPath:
@@ -35,6 +36,10 @@ def __init__(self, path, subset=None):
         self._items = list(self._load_items(path).values())
 
     def _load_categories(self):
+        if has_meta_file(self._dataset_dir):
+            return { AnnotationType.label: LabelCategories().
+                from_iterable(list(parse_meta_file(self._dataset_dir).keys())) }
+
         label_cat = LabelCategories()
 
         labels_file = osp.join(self._dataset_dir, 'labels.txt')
@@ -100,6 +105,9 @@ class MnistCsvConverter(Converter):
 
     def apply(self):
         os.makedirs(self._save_dir, exist_ok=True)
+        if self._save_dataset_meta:
+            self._save_meta_file(self._save_dir)
+
         for subset_name, subset in self._extractor.subsets().items():
             data = []
             item_ids = {}

diff --git a/datumaro/plugins/mnist_format.py b/datumaro/plugins/mnist_format.py
@@ -13,6 +13,7 @@
 )
 from datumaro.components.converter import Converter
 from datumaro.components.extractor import DatasetItem, Importer, SourceExtractor
+from datumaro.util.meta_file_util import has_meta_file, parse_meta_file
 
 
 class MnistPath:
@@ -43,6 +44,10 @@ def __init__(self, path, subset=None):
         self._items = list(self._load_items(path).values())
 
     def _load_categories(self):
+        if has_meta_file(self._dataset_dir):
+            return { AnnotationType.label: LabelCategories().
+                from_iterable(list(parse_meta_file(self._dataset_dir).keys())) }
+
         label_cat = LabelCategories()
 
         labels_file = osp.join(self._dataset_dir, 'labels.txt')
@@ -116,6 +121,9 @@ class MnistConverter(Converter):
 
     def apply(self):
         os.makedirs(self._save_dir, exist_ok=True)
+        if self._save_dataset_meta:
+            self._save_meta_file(self._save_dir)
+
         for subset_name, subset in self._extractor.subsets().items():
             labels = []
             images = np.array([])

diff --git a/datumaro/plugins/synthia_format.py b/datumaro/plugins/synthia_format.py
@@ -14,6 +14,7 @@
 from datumaro.components.format_detection import FormatDetectionContext
 from datumaro.util.image import find_images, load_image
 from datumaro.util.mask_tools import generate_colormap, lazy_mask
+from datumaro.util.meta_file_util import has_meta_file, parse_meta_file
 
 
 class SynthiaPath:
@@ -98,6 +99,8 @@ def __init__(self, path):
         self._items = list(self._load_items(path).values())
 
     def _load_categories(self, path):
+        if has_meta_file(path):
+            return make_categories(parse_meta_file(path))
         label_map_path = osp.join(path, SynthiaPath.LABELMAP_FILE)
         if osp.isfile(label_map_path):
             label_map = parse_label_map(label_map_path)

diff --git a/datumaro/plugins/yolo_format/converter.py b/datumaro/plugins/yolo_format/converter.py
@@ -35,6 +35,9 @@ def apply(self):
 
         os.makedirs(save_dir, exist_ok=True)
 
+        if self._save_dataset_meta:
+            self._save_meta_file(self._save_dir)
+
         label_categories = extractor.categories()[AnnotationType.label]
         label_ids = {label.name: idx
             for idx, label in enumerate(label_categories.items)}

diff --git a/datumaro/plugins/yolo_format/extractor.py b/datumaro/plugins/yolo_format/extractor.py
@@ -15,6 +15,7 @@
 from datumaro.util.image import (
     DEFAULT_IMAGE_META_FILE_NAME, load_image_meta_file,
 )
+from datumaro.util.meta_file_util import has_meta_file, parse_meta_file
 from datumaro.util.os_util import split_path
 
 from .format import YoloPath
@@ -171,6 +172,10 @@ def _parse_annotations(anno_path, image):
 
     @staticmethod
     def _load_categories(names_path):
+        if has_meta_file(osp.dirname(names_path)):
+            return LabelCategories().from_iterable(
+                list(parse_meta_file(osp.dirname(names_path)).keys()))
+
         label_categories = LabelCategories()
 
         with open(names_path, 'r', encoding='utf-8') as f:

diff --git a/site/content/en/docs/formats/ade20k2017.md b/site/content/en/docs/formats/ade20k2017.md
@@ -43,6 +43,7 @@ ADE20K dataset directory should have the following structure:
 <!--lint disable fenced-code-flag-->
 ```
 dataset/
+├── dataset_meta.json # a list of non-format labels (optional)
 ├── subset1/
 │   └── super_label_1/
 │       ├── img1.jpg
@@ -82,6 +83,8 @@ image. Each line in the text file contains:
 Each column is separated by a `#`. See example of dataset
 [here](https://github.com/openvinotoolkit/datumaro/tree/develop/tests/assets/ade20k2017_dataset).
 
+To add custom classes, you can use [`dataset_meta.json`](/docs/user-manual/supported_formats/#dataset-meta-file).
+
 ## Export to other formats
 
 Datumaro can convert an ADE20K dataset into any other format [Datumaro supports](/docs/user-manual/supported_formats/).

diff --git a/site/content/en/docs/formats/ade20k2020.md b/site/content/en/docs/formats/ade20k2020.md
@@ -43,6 +43,7 @@ ADE20K dataset directory should have the following structure:
 <!--lint disable fenced-code-flag-->
 ```
 dataset/
+├── dataset_meta.json # a list of non-format labels (optional)
 ├── subset1/
 │   ├── img1/  # directory with instance masks for img1
 │   |    ├── instance_001_img1.png
@@ -101,6 +102,8 @@ See our [tests asset](https://github.com/openvinotoolkit/datumaro/tree/develop/t
 for example of this file,
 or check [ADE20K toolkit](https://github.com/CSAILVision/ADE20K) for it.
 
+To add custom classes, you can use [`dataset_meta.json`](/docs/user-manual/supported_formats/#dataset-meta-file).
+
 ## Export to other formats
 
 Datumaro can convert an ADE20K dataset into any other format [Datumaro supports](/docs/user-manual/supported_formats/).

diff --git a/site/content/en/docs/formats/align_celeba.md b/site/content/en/docs/formats/align_celeba.md
@@ -48,6 +48,7 @@ Align CelebA dataset directory should have the following structure:
 <!--lint disable fenced-code-flag-->
 ```
 dataset/
+├── dataset_meta.json # a list of non-format labels (optional)
 ├── Anno/
 │   ├── identity_CelebA.txt
 │   ├── list_attr_celeba.txt
@@ -69,6 +70,8 @@ landmarks and subsets respectively (optional).
 The original CelebA dataset stores images in a .7z archive. The archive
 needs to be unpacked before importing.
 
+To add custom classes, you can use [`dataset_meta.json`](/docs/user-manual/supported_formats/#dataset-meta-file).
+
 ## Export to other formats
 
 Datumaro can convert an align CelebA dataset into any other format [Datumaro supports](/docs/user-manual/supported_formats/).