From 849ad7fd44d419d68a8e1def045d557cb686555d Mon Sep 17 00:00:00 2001
From: Anastasia Yasakova <anastasia.yasakova@intel.com>
Date: Tue, 13 Jul 2021 12:44:43 +0300
Subject: [PATCH] Support for CIFAR-100 (#301)

* Add support for CIFAR-100

* Update Changelog

* Update user_manual.md

* Add notes about differences in formats
---
 CHANGELOG.md                      |   1 +
 datumaro/plugins/cifar_format.py  |  85 +++++++++++----
 docs/formats/cifar_user_manual.md | 170 ++++++++++++++++++++++++++++++
 docs/user_manual.md               |   1 +
 tests/test_cifar_format.py        |  42 ++++++++
 5 files changed, 280 insertions(+), 19 deletions(-)
 create mode 100644 docs/formats/cifar_user_manual.md

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 71cb72449409..eea75b8fec07 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,6 +21,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   <https://github.com/openvinotoolkit/datumaro/pull/315>).
 - Support for Supervisely Point Cloud dataset format (<https://github.com/openvinotoolkit/datumaro/pull/245>)
 - Support for KITTI Raw / Velodyne Points dataset format (<https://github.com/openvinotoolkit/datumaro/pull/245>)
+- Support for CIFAR-100 and documentation for CIFAR-10/100 (<https://github.com/openvinotoolkit/datumaro/pull/301>)
 
 ### Changed
 - Tensorflow AVX check is made optional in API and disabled by default (<https://github.com/openvinotoolkit/datumaro/pull/305>)
diff --git a/datumaro/plugins/cifar_format.py b/datumaro/plugins/cifar_format.py
index b4dcdc616e36..cb2ea030dd59 100644
--- a/datumaro/plugins/cifar_format.py
+++ b/datumaro/plugins/cifar_format.py
@@ -2,6 +2,7 @@
 #
 # SPDX-License-Identifier: MIT
 
+from collections import OrderedDict
 import os
 import os.path as osp
 import pickle  # nosec - disable B403:import_pickle check
@@ -18,11 +19,12 @@
 
 class CifarPath:
     BATCHES_META = 'batches.meta'
+    META = 'meta'
     TRAIN_ANNOTATION_FILE = 'data_batch_'
-    IMAGES_DIR = 'images'
+    USELESS_FILE = 'file.txt~'
     IMAGE_SIZE = 32
 
-CifarLabel = ['airplane', 'automobile', 'bird', 'cat',
+Cifar10Label = ['airplane', 'automobile', 'bird', 'cat',
     'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
 
 # Support for Python version CIFAR-10/100
@@ -41,40 +43,61 @@ def __init__(self, path, subset=None):
 
         super().__init__(subset=subset)
 
-        batches_meta_file = osp.join(osp.dirname(path), CifarPath.BATCHES_META)
-        self._categories = self._load_categories(batches_meta_file)
+        self._categories = self._load_categories(osp.dirname(path))
 
         self._items = list(self._load_items(path).values())
 
     def _load_categories(self, path):
         label_cat = LabelCategories()
 
-        if osp.isfile(path):
+        meta_file = osp.join(path, CifarPath.BATCHES_META)
+        if not osp.isfile(meta_file):
+            meta_file = osp.join(path, CifarPath.META)
+        if osp.isfile(meta_file):
+            # CIFAR-10:
             # num_cases_per_batch: 1000
             # label_names: ['airplane', 'automobile', 'bird', 'cat', 'deer',
             #               'dog', 'frog', 'horse', 'ship', 'truck']
             # num_vis: 3072
-            with open(path, 'rb') as labels_file:
+            # CIFAR-100:
+            # fine_label_names: ['apple', 'aquarium_fish', 'baby', ...]
+            # coarse_label_names: ['aquatic_mammals', 'fish', 'flowers', ...]
+            with open(meta_file, 'rb') as labels_file:
                 data = pickle.load(labels_file) # nosec - disable B301:pickle check
-            for label in data['label_names']:
-                label_cat.add(label)
+            labels = data.get('label_names')
+            if labels != None:
+                for label in labels:
+                    label_cat.add(label)
+            else:
+                labels = data.get('fine_label_names')
+                self._coarse_labels = data.get('coarse_label_names', [])
+                if labels != None:
+                    for label in labels:
+                        label_cat.add(label)
         else:
-            for label in CifarLabel:
+            for label in Cifar10Label:
                 label_cat.add(label)
 
         return { AnnotationType.label: label_cat }
 
     def _load_items(self, path):
         items = {}
+        label_cat = self._categories[AnnotationType.label]
 
         # 'batch_label': 'training batch 1 of 5'
         # 'data': ndarray
         # 'filenames': list
-        # 'labels': list
+        # CIFAR-10: 'labels': list
+        # CIFAR-100: 'fine_labels': list
+        #            'coarse_labels': list
+
         with open(path, 'rb') as anno_file:
             annotation_dict = pickle.load(anno_file, encoding='latin1') # nosec - disable B301:pickle check
 
         labels = annotation_dict.get('labels', [])
+        coarse_labels = annotation_dict.get('coarse_labels', [])
+        if len(labels) == 0:
+            labels = annotation_dict.get('fine_labels', [])
         filenames = annotation_dict.get('filenames', [])
         images_data = annotation_dict.get('data')
         size = annotation_dict.get('image_sizes')
@@ -92,6 +115,8 @@ def _load_items(self, path):
             annotations = []
             if label != None:
                 annotations.append(Label(label))
+                if 0 < len(coarse_labels) and coarse_labels[i] != None and label_cat[label].parent == '':
+                    label_cat[label].parent = self._coarse_labels[coarse_labels[i]]
 
             image = None
             if 0 < len(images_data):
@@ -116,7 +141,7 @@ class CifarImporter(Importer):
     def find_sources(cls, path):
         return cls._find_sources_recursive(path, '', 'cifar',
             file_filter=lambda p: osp.basename(p) not in
-                {CifarPath.BATCHES_META, CifarPath.IMAGES_DIR})
+                {CifarPath.BATCHES_META, CifarPath.META, CifarPath.USELESS_FILE})
 
 
 class CifarConverter(Converter):
@@ -127,9 +152,20 @@ def apply(self):
 
         label_categories = self._extractor.categories()[AnnotationType.label]
         label_names = []
+        coarse_label_names = []
         for label in label_categories:
             label_names.append(label.name)
-        labels_dict = { 'label_names': label_names }
+            if label.parent != '' and label.parent not in coarse_label_names:
+                coarse_label_names.append(label.parent)
+        coarse_label_names.sort()
+
+        if 0 < len(coarse_label_names):
+            labels_dict = { 'fine_label_names': label_names,
+                            'coarse_label_names': coarse_label_names }
+            coarse_label_names = OrderedDict({name: i for i, name in enumerate(coarse_label_names)})
+        else:
+            labels_dict = { 'label_names': label_names }
+
         batches_meta_file = osp.join(self._save_dir, CifarPath.BATCHES_META)
         with open(batches_meta_file, 'wb') as labels_file:
             pickle.dump(labels_dict, labels_file)
@@ -137,17 +173,22 @@ def apply(self):
         for subset_name, subset in self._extractor.subsets().items():
             filenames = []
             labels = []
+            coarse_labels = []
             data = []
             image_sizes = {}
             for item in subset:
                 filenames.append(item.id + self._find_image_ext(item))
 
-                anns = [a.label for a in item.annotations
+                anns = [a for a in item.annotations
                     if a.type == AnnotationType.label]
-                label = None
-                if anns:
-                    label = anns[0]
-                labels.append(label)
+                if 0 < len(anns):
+                    labels.append(anns[0].label)
+                    if 0 < len(coarse_label_names):
+                        superclass = label_categories[anns[0].label].parent
+                        coarse_labels.append(coarse_label_names[superclass])
+                else:
+                    labels.append(None)
+                    coarse_labels.append(None)
 
                 if item.has_image and self._save_images:
                     image = item.image
@@ -163,7 +204,11 @@ def apply(self):
 
             annotation_dict = {}
             annotation_dict['filenames'] = filenames
-            annotation_dict['labels'] = labels
+            if 0 < len(labels) and len(labels) == len(coarse_labels):
+                annotation_dict['fine_labels'] = labels
+                annotation_dict['coarse_labels'] = coarse_labels
+            else:
+                annotation_dict['labels'] = labels
             annotation_dict['data'] = np.array(data, dtype=object)
             if len(image_sizes):
                 size = (CifarPath.IMAGE_SIZE, CifarPath.IMAGE_SIZE)
@@ -179,8 +224,10 @@ def apply(self):
                 num = subset_name.split('_')[1]
                 filename = CifarPath.TRAIN_ANNOTATION_FILE + num
                 batch_label = 'training batch %s of 5' % (num, )
-            if subset_name == 'test':
+            elif subset_name == 'test':
                 batch_label = 'testing batch 1 of 1'
+            elif subset_name == 'train':
+                filename = subset_name
             if batch_label:
                 annotation_dict['batch_label'] = batch_label
 
diff --git a/docs/formats/cifar_user_manual.md b/docs/formats/cifar_user_manual.md
new file mode 100644
index 000000000000..0ec72e14e5b9
--- /dev/null
+++ b/docs/formats/cifar_user_manual.md
@@ -0,0 +1,170 @@
+# CIFAR user manual
+
+## Contents
+
+- [Format specification](#format-specification)
+- [Load CIFAR dataset](#load-cifar-dataset)
+- [Export to other formats](#export-to-other-formats)
+- [Export to CIFAR](#export-to-cifar)
+- [Particular use cases](#particular-use-cases)
+
+## Format specification
+
+The CIFAR format specification is available [here](https://www.cs.toronto.edu/~kriz/cifar.html).
+
+CIFAR dataset format supports `Labels` annotations.
+
+Datumaro supports the Python version of CIFAR-10/100.
+
+## Load CIFAR dataset
+
+The CIFAR dataset is available for free download:
+
+- [cifar-10-python.tar.gz](https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz):
+  CIFAR-10 python version
+- [cifar-100-python.tar.gz](https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz):
+  CIFAR-100 python version
+
+There are two ways to create a Datumaro project and add a CIFAR dataset to it:
+
+``` bash
+datum import --format cifar --input-path <path/to/dataset>
+# or
+datum create
+datum add path -f cifar <path/to/dataset>
+```
+
+It is possible to specify the project name and project directory; run
+`datum create --help` for more information.
+
+CIFAR-10 dataset directory should have the following structure:
+
+<!--lint disable fenced-code-flag-->
+```
+└─ Dataset/
+    ├── batches.meta
+    ├── data_batch_1
+    ├── data_batch_2
+    ├── data_batch_3
+    ├── data_batch_4
+    ├── data_batch_5
+    └── test_batch
+```
+
+CIFAR-100 dataset directory should have the following structure:
+
+<!--lint disable fenced-code-flag-->
+```
+└─ Dataset/
+    ├── meta
+    ├── test
+    └── train
+```
+
+CIFAR format only supports 32 x 32 images.
+
+The 100 classes in CIFAR-100 are grouped into 20 superclasses. Each image
+comes with a "fine" label (the class to which it belongs) and a "coarse" label
+(the superclass to which it belongs).
+
+The difference between CIFAR-10 and CIFAR-100 is how labels are stored
+in the meta file (batches.meta or meta) and in the annotation file (train,
+data_batch_1, test_batch, etc.).
+<!--lint disable fenced-code-flag-->
+```
+meta file:
+CIFAR-10: num_cases_per_batch: 1000
+          label_names: ['airplane', 'automobile', 'bird', ...]
+          num_vis: 3072
+CIFAR-100: fine_label_names: ['apple', 'aquarium_fish', 'baby', ...]
+           coarse_label_names: ['aquatic_mammals', 'fish', 'flowers', ...]
+
+annotation file:
+'batch_label': 'training batch 1 of 5'
+'data': ndarray
+'filenames': list
+CIFAR-10: 'labels': list
+CIFAR-100: 'fine_labels': list
+           'coarse_labels': list
+```
+
+## Export to other formats
+
+Datumaro can convert a CIFAR dataset into any other format [Datumaro supports](../user_manual.md#supported-formats).
+To get the expected result, the dataset needs to be converted to a format
+that supports the classification task (e.g. MNIST, ImageNet, PascalVOC,
+etc.). There are a few ways to convert a CIFAR dataset to another format:
+
+``` bash
+datum project import -f cifar -i <path/to/cifar>
+datum export -f imagenet -o <path/to/output/dir>
+# or
+datum convert -if cifar -i <path/to/cifar> -f imagenet -o <path/to/output/dir>
+```
+
+## Export to CIFAR
+
+There are a few ways to convert a dataset to CIFAR format:
+
+``` bash
+# export dataset into CIFAR format from existing project
+datum export -p <path/to/project> -f cifar -o <path/to/export/dir> \
+    -- --save-images
+# converting to CIFAR format from other format
+datum convert -if imagenet -i <path/to/imagenet/dataset> \
+    -f cifar -o <path/to/export/dir> -- --save-images
+```
+
+Extra options for export to CIFAR format:
+
+- `--save-images` allows exporting the dataset with its images saved
+(by default `False`);
+- `--image-ext <IMAGE_EXT>` allows specifying the image extension
+for the exported dataset (by default `.png`).
+
+The format (CIFAR-10 or CIFAR-100) in which the dataset will be
+exported depends on the presence of superclasses in the `LabelCategories`.
+
+## Particular use cases
+
+Datumaro supports filtering, transformation, merging etc. for all formats
+and for the CIFAR format in particular. Follow [user manual](../user_manual.md)
+to get more information about these operations.
+
+Here are a few examples of using Datumaro operations to solve
+particular problems with a CIFAR dataset:
+
+### Example 1. How to create custom CIFAR-like dataset
+
+```python
+import numpy as np
+from datumaro.components.dataset import Dataset
+from datumaro.components.extractor import Label, DatasetItem
+dataset = Dataset.from_iterable([
+    DatasetItem(id=0, image=np.ones((32, 32, 3)),
+        annotations=[Label(3)]
+    ),
+    DatasetItem(id=1, image=np.ones((32, 32, 3)),
+        annotations=[Label(8)]
+    )
+], categories=['airplane', 'automobile', 'bird', 'cat', 'deer',
+               'dog', 'frog', 'horse', 'ship', 'truck'])
+
+dataset.export('./dataset', format='cifar')
+```
+
+### Example 2. How to filter and convert CIFAR dataset to ImageNet
+
+Convert a CIFAR dataset to ImageNet format, keeping only images in which the
+`dog` class is present:
+
+``` bash
+# Download CIFAR-10 dataset:
+# https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
+datum convert --input-format cifar --input-path <path/to/cifar> \
+              --output-format imagenet \
+              --filter '/item[annotation/label="dog"]'
+```
+
+More examples of working with a CIFAR dataset from code can be found in
+[tests_cifar](../../tests/test_cifar_format.py).
diff --git a/docs/user_manual.md b/docs/user_manual.md
index 3990ed5c2585..63ac2fce60d9 100644
--- a/docs/user_manual.md
+++ b/docs/user_manual.md
@@ -123,6 +123,7 @@ List of supported formats:
 - CIFAR-10/100 (`classification` (python version))
   - [Format specification](https://www.cs.toronto.edu/~kriz/cifar.html)
   - [Dataset example](../tests/assets/cifar_dataset)
+  - [Format documentation](./formats/cifar_user_manual.md)
 - MNIST (`classification`)
   - [Format specification](http://yann.lecun.com/exdb/mnist/)
   - [Dataset example](../tests/assets/mnist_dataset)
diff --git a/tests/test_cifar_format.py b/tests/test_cifar_format.py
index 9b38ac0f4d31..48653979564e 100644
--- a/tests/test_cifar_format.py
+++ b/tests/test_cifar_format.py
@@ -128,6 +128,48 @@ def test_can_save_and_load_empty_image(self):
             compare_datasets(self, dataset, parsed_dataset,
                 require_images=True)
 
+    @mark_requirement(Requirements.DATUM_GENERAL_REQ)
+    def test_can_save_and_load_cifar100(self):
+        source_dataset = Dataset.from_iterable([
+            DatasetItem(id='image_2', subset='test',
+                image=np.ones((32, 32, 3)),
+                annotations=[Label(0)]
+            ),
+            DatasetItem(id='image_3', subset='test',
+                image=np.ones((32, 32, 3))
+            ),
+            DatasetItem(id='image_4', subset='test',
+                image=np.ones((32, 32, 3)),
+                annotations=[Label(1)]
+            )
+        ], categories=[['class_0', 'superclass_0'], ['class_1', 'superclass_0']])
+
+        with TestDir() as test_dir:
+            CifarConverter.convert(source_dataset, test_dir, save_images=True)
+            parsed_dataset = Dataset.import_from(test_dir, 'cifar')
+
+            compare_datasets(self, source_dataset, parsed_dataset,
+                require_images=True)
+
+    @mark_requirement(Requirements.DATUM_GENERAL_REQ)
+    def test_can_save_and_load_cifar100_without_saving_images(self):
+        source_dataset = Dataset.from_iterable([
+            DatasetItem(id='a', subset='train_1',
+                annotations=[Label(0)]
+            ),
+            DatasetItem(id='b', subset='train_1',
+                annotations=[Label(1)]
+            ),
+        ], categories=[['class_0', 'superclass_0'], ['class_1', 'superclass_0']])
+
+        with TestDir() as test_dir:
+            CifarConverter.convert(source_dataset, test_dir, save_images=False)
+            parsed_dataset = Dataset.import_from(test_dir, 'cifar')
+
+            compare_datasets(self, source_dataset, parsed_dataset,
+                require_images=True)
+
+
 DUMMY_DATASET_DIR = osp.join(osp.dirname(__file__), 'assets', 'cifar_dataset')
 
 class CifarImporterTest(TestCase):