Replace VOC format support in CVAT with Datumaro (#1167)

* Add image meta reading to voc
* Replace voc support in cvat
* Bump format version
* Materialize lazy transforms in voc export
* Store voc instance id as group id
* Add flat format import
* Add documentation
* Fix format name in doc
cvat-ai · Feb 26, 2020 · cd8ef2a · cd8ef2a
1 parent 9850094
commit cd8ef2a
Show file tree

Hide file tree

Showing 7 changed files with 131 additions and 147 deletions.
diff --git a/cvat/apps/annotation/README.md b/cvat/apps/annotation/README.md
@@ -170,44 +170,58 @@ This is native CVAT annotation format.
 - supported shapes - Rectangles, Polygons, Polylines, Points
 
 ### [Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC/)
+- [Format specification](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/devkit_doc.pdf)
 
 #### Pascal dumper description
-- downloaded file: a zip archive with following structure:
+- downloaded file: a zip archive with the following structure:
   ```bash
-  taskname.zip
-  ├── frame_000001.xml
-  ├── frame_000002.xml
-  ├── frame_000003.xml
-  └── ...
+  taskname.zip/
+  ├── Annotations/
+  │   ├── <image_name1>.xml
+  │   ├── <image_name2>.xml
+  │   └── <image_nameN>.xml
+  ├── ImageSets/
+  │   └── Main/
+  │       └── default.txt
+  └── labelmap.txt
   ```
-  Each annotation `*.xml` file has a name that corresponds to the name of the image file
-  (e.g. `frame_000001.xml` is the annotation for the `frame_000001.jpg` image).
-  Detailed structure specification of the `*.xml` file can be found
-  [here](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/devkit_doc.pdf).
-- supported shapes - Rectangles
-- additional comments: If you plan to use 'truncated' and 'difficult' attributes please add the corresponding
+
+- supported shapes: Rectangles
+- additional comments: If you plan to use the `truncated` and `difficult` attributes, please add the corresponding
   items to the CVAT label attributes:
   `~checkbox=difficult:false ~checkbox=truncated:false`
 
 #### Pascal loader description
--   uploaded file: a zip archive with following structure:
-    ```bash
-    taskname.zip
-    ├── frame_000001.xml
-    ├── frame_000002.xml
-    ├── frame_000003.xml
-    └── ...
-    ```
-    It should be possible to match the CVAT frame(imagename) and image filename from the annotation \*.xml
-    file (the tag filename, e.g. `<filename>2008_004457.jpg</filename>`). There are 2 options:
-    1. full match between image name and filename from annotation *.xml
-       file (in case of a task was created from images or archive of images).
-    1. match by frame number (if CVAT cannot match by name). File name should be in the following format `frame_%6d.jpg`.
-       It will be used when task was created from a video.
+- uploaded file: a zip archive with either the structure described above or the following one:
+  ```bash
+  taskname.zip/
+  ├── <image_name1>.xml
+  ├── <image_name2>.xml
+  ├── <image_nameN>.xml
+  └── labelmap.txt # optional
+  ```
 
--   supported shapes: Rectangles
--   limitations: Support of Pascal VOC object detection format
--   additional comments: the CVAT task should be created with the full label set that may be in the annotation files
+  The `labelmap.txt` file contains the dataset labels. It **must** be included
+  if the dataset labels **differ** from the default VOC labels. The file structure:
+  ```bash
+  # label : color_rgb : 'body' parts : actions
+  background:::
+  aeroplane:::
+  bicycle:::
+  bird:::
+  ```
+
+  CVAT must be able to match each frame (image name) with the file name given in the annotation \*.xml
+  file (the `filename` tag, e.g. `<filename>2008_004457.jpg</filename>`). There are 2 options:
+  1. full match between the image name and the file name from the annotation \*.xml file
+      (used when the task was created from images or an image archive).
+  1. match by frame number (used if CVAT cannot match by name). The file name should
+      be in the following format: `<number>.jpg`.
+      This option should be used when the task was created from a video.
+
+- supported shapes: Rectangles
+- limitations: Support of Pascal VOC object detection format
+- additional comments: the CVAT task should be created with the full label set that may be in the annotation files
 
 #### How to create a task from Pascal VOC dataset
 1.  Download the Pascal Voc dataset (Can be downloaded from the
@@ -222,7 +236,7 @@ This is native CVAT annotation format.
     (See [Creating an annotation task](cvat/apps/documentation/user_guide.md#creating-an-annotation-task)
     guide for details)
 1.  zip the corresponding annotation files
-1.  click `Upload annotation` button, choose `Pascal VOC ZIP 1.0`
+1.  click `Upload annotation` button, choose `Pascal VOC ZIP 1.1`
 and select the *.zip file with annotations from previous step.
 It may take some time.
 

diff --git a/cvat/apps/annotation/pascal_voc.py b/cvat/apps/annotation/pascal_voc.py
@@ -8,109 +8,65 @@
         {
             "display_name": "{name} {format} {version}",
             "format": "ZIP",
-            "version": "1.0",
+            "version": "1.1",
             "handler": "dump"
         },
     ],
     "loaders": [
         {
             "display_name": "{name} {format} {version}",
             "format": "ZIP",
-            "version": "1.0",
+            "version": "1.1",
             "handler": "load"
         },
     ],
 }
 
 def load(file_object, annotations):
-    from pyunpack import Archive
+    from glob import glob
     import os
+    import os.path as osp
+    import shutil
+    from pyunpack import Archive
     from tempfile import TemporaryDirectory
+    from datumaro.plugins.voc_format.importer import VocImporter
+    from cvat.apps.dataset_manager.bindings import import_dm_annotations
 
-    def parse_xml_file(annotation_file):
-        import xml.etree.ElementTree as ET
-        root = ET.parse(annotation_file).getroot()
-        frame_number = annotations.match_frame(root.find('filename').text)
-
-        for obj_tag in root.iter('object'):
-            bbox_tag = obj_tag.find("bndbox")
-            label = obj_tag.find('name').text
-            xmin = float(bbox_tag.find('xmin').text)
-            ymin = float(bbox_tag.find('ymin').text)
-            xmax = float(bbox_tag.find('xmax').text)
-            ymax = float(bbox_tag.find('ymax').text)
-            truncated = obj_tag.find('truncated')
-            truncated = truncated.text if truncated is not None else 0
-            difficult = obj_tag.find('difficult')
-            difficult = difficult.text if difficult is not None else 0
-
-            annotations.add_shape(annotations.LabeledShape(
-                type='rectangle',
-                frame=frame_number,
-                label=label,
-                points=[xmin, ymin, xmax, ymax],
-                occluded=False,
-                attributes=[
-                    annotations.Attribute('truncated', truncated),
-                    annotations.Attribute('difficult', difficult),
-                ],
-            ))
-
-    archive_file = getattr(file_object, 'name')
+    archive_file = file_object if isinstance(file_object, str) else getattr(file_object, "name")
     with TemporaryDirectory() as tmp_dir:
         Archive(archive_file).extractall(tmp_dir)
 
-        for dirpath, _, filenames in os.walk(tmp_dir):
-            for _file in filenames:
-                if '.xml' == os.path.splitext(_file)[1]:
-                    parse_xml_file(os.path.join(dirpath, _file))
+        # support flat archive layout
+        anno_dir = osp.join(tmp_dir, 'Annotations')
+        if not osp.isdir(anno_dir):
+            anno_files = glob(osp.join(tmp_dir, '**', '*.xml'), recursive=True)
+            subsets_dir = osp.join(tmp_dir, 'ImageSets', 'Main')
+            os.makedirs(subsets_dir, exist_ok=True)
+            with open(osp.join(subsets_dir, 'train.txt'), 'w') as subset_file:
+                for f in anno_files:
+                    subset_file.write(osp.splitext(osp.basename(f))[0] + '\n')
 
-def dump(file_object, annotations):
-    from pascal_voc_writer import Writer
-    import os
-    from zipfile import ZipFile
-    from tempfile import TemporaryDirectory
-
-    with TemporaryDirectory() as out_dir:
-        with ZipFile(file_object, 'w') as output_zip:
-            for frame_annotation in annotations.group_by_frame():
-                image_name = frame_annotation.name
-                width = frame_annotation.width
-                height = frame_annotation.height
-
-                writer = Writer(image_name, width, height)
-                writer.template_parameters['path'] = ''
-                writer.template_parameters['folder'] = ''
+            os.makedirs(anno_dir, exist_ok=True)
+            for f in anno_files:
+                shutil.move(f, anno_dir)
 
-                for shape in frame_annotation.labeled_shapes:
-                    if shape.type != "rectangle":
-                        continue
+        dm_project = VocImporter()(tmp_dir)
+        dm_dataset = dm_project.make_dataset()
+        import_dm_annotations(dm_dataset, annotations)
 
-                    label = shape.label
-                    xtl = shape.points[0]
-                    ytl = shape.points[1]
-                    xbr = shape.points[2]
-                    ybr = shape.points[3]
-
-                    difficult = 0
-                    truncated = 0
-                    for attribute in shape.attributes:
-                        if attribute.name == 'truncated' and 'true' == attribute.value.lower():
-                            truncated = 1
-                        elif attribute.name == 'difficult' and 'true' == attribute.value.lower():
-                            difficult = 1
+def dump(file_object, annotations):
+    from cvat.apps.dataset_manager.bindings import CvatAnnotationsExtractor
+    from cvat.apps.dataset_manager.util import make_zip_archive
+    from datumaro.components.project import Environment, Dataset
+    from tempfile import TemporaryDirectory
 
-                    writer.addObject(
-                        name=label,
-                        xmin=xtl,
-                        ymin=ytl,
-                        xmax=xbr,
-                        ymax=ybr,
-                        truncated=truncated,
-                        difficult=difficult,
-                    )
+    env = Environment()
+    id_from_image = env.transforms.get('id_from_image_name')
 
-                anno_name = os.path.basename('{}.{}'.format(os.path.splitext(image_name)[0], 'xml'))
-                anno_file = os.path.join(out_dir, anno_name)
-                writer.save(anno_file)
-                output_zip.write(filename=anno_file, arcname=anno_name)
+    extractor = CvatAnnotationsExtractor('', annotations)
+    extractor = extractor.transform(id_from_image)
+    extractor = Dataset.from_extractors(extractor) # apply lazy transforms
+    converter = env.make_converter('voc_detection')
+    with TemporaryDirectory() as temp_dir:
+        converter(extractor, save_dir=temp_dir)
+        make_zip_archive(temp_dir, file_object)
diff --git a/cvat/apps/annotation/yolo.py b/cvat/apps/annotation/yolo.py
@@ -8,15 +8,15 @@
         {
             "display_name": "{name} {format} {version}",
             "format": "ZIP",
-            "version": "1.0",
+            "version": "1.1",
             "handler": "dump"
         },
     ],
     "loaders": [
         {
             "display_name": "{name} {format} {version}",
             "format": "ZIP",
-            "version": "1.0",
+            "version": "1.1",
             "handler": "load"
         },
     ],

diff --git a/cvat/apps/engine/tests/test_rest_api.py b/cvat/apps/engine/tests/test_rest_api.py
@@ -2650,8 +2650,8 @@ def _get_initial_annotation(annotation_format):
             elif annotation_format == "CVAT XML 1.1 for images":
                 annotations["shapes"] = rectangle_shapes_with_attrs + rectangle_shapes_wo_attrs
 
-            elif annotation_format == "PASCAL VOC ZIP 1.0" or \
-                 annotation_format == "YOLO ZIP 1.0" or \
+            elif annotation_format == "PASCAL VOC ZIP 1.1" or \
+                 annotation_format == "YOLO ZIP 1.1" or \
                  annotation_format == "TFRecord ZIP 1.0":
                  annotations["shapes"] = rectangle_shapes_wo_attrs
 

diff --git a/datumaro/datumaro/plugins/voc_format/converter.py b/datumaro/datumaro/plugins/voc_format/converter.py
@@ -235,7 +235,8 @@ def save_subsets(self):
                         if bbox is not None:
                             _write_xml_bbox(bbox, obj_elem)
 
-                        for part_bbox in filter(lambda x: obj.id == x.group,
+                        for part_bbox in filter(
+                                lambda x: obj.group and obj.group == x.group,
                                 layout_bboxes):
                             part_elem = ET.SubElement(obj_elem, 'part')
                             ET.SubElement(part_elem, 'name').text = \

diff --git a/datumaro/datumaro/plugins/voc_format/extractor.py b/datumaro/datumaro/plugins/voc_format/extractor.py
@@ -4,6 +4,7 @@
 # SPDX-License-Identifier: MIT
 
 from collections import defaultdict
+import logging as log
 import os
 import os.path as osp
 from xml.etree import ElementTree as ET
@@ -13,7 +14,7 @@
     AnnotationType, Label, Mask, Bbox, CompiledMask
 )
 from datumaro.util import dir_items
-from datumaro.util.image import lazy_image
+from datumaro.util.image import lazy_image, Image
 from datumaro.util.mask_tools import lazy_mask, invert_colormap
 
 from .format import (
@@ -52,8 +53,12 @@ def _load_subsets(self, subsets_dir):
                 subset_name = None
             subset = __class__.Subset(subset_name, self)
 
+            subset.items = []
             with open(osp.join(subsets_dir, subset_file_name + '.txt'), 'r') as f:
-                subset.items = [line.split()[0] for line in f]
+                for line in f:
+                    line = line.split()[0].strip()
+                    if line:
+                        subset.items.append(line)
 
             subsets[subset_name] = subset
         return subsets
@@ -84,12 +89,7 @@ def _load_det_annotations(self):
         for ann_item in det_anno_items:
             with open(osp.join(det_anno_dir, ann_item + '.xml'), 'r') as f:
                 ann_file_data = f.read()
-                ann_file_root = ET.fromstring(ann_file_data)
-                item = ann_file_root.find('filename').text
-                if not item:
-                    item = ann_item
-                item = osp.splitext(item)[0]
-                det_annotations[item] = ann_file_data
+                det_annotations[ann_item] = ann_file_data
 
         self._annotations[VocTask.detection] = det_annotations
 
@@ -134,6 +134,19 @@ def __iter__(self):
     def _get(self, item_id, subset_name):
         image = osp.join(self._path, VocPath.IMAGES_DIR,
             item_id + VocPath.IMAGE_EXT)
+        det_annotations = self._annotations.get(VocTask.detection)
+        if det_annotations is not None:
+            det_annotations = det_annotations.get(item_id)
+        if det_annotations is not None:
+            root_elem = ET.fromstring(det_annotations)
+            height = root_elem.find('size/height')
+            if height is not None:
+                height = int(height.text)
+            width = root_elem.find('size/width')
+            if width is not None:
+                width = int(width.text)
+            if height and width:
+                image = Image(path=image, size=(height, width))
 
         annotations = self._get_annotations(item_id)
 
@@ -217,7 +230,7 @@ def _get_annotations(self, item_id):
             for obj_id, object_elem in enumerate(root_elem.findall('object')):
                 obj_id += 1
                 attributes = {}
-                group = None
+                group = obj_id
 
                 obj_label_id = None
                 label_elem = object_elem.find('name')
@@ -262,20 +275,21 @@ def _get_annotations(self, item_id):
                 for action, present in actions.items():
                     attributes[action] = present
 
+                has_parts = False
                 for part_elem in object_elem.findall('part'):
                     part = part_elem.find('name').text
                     part_label_id = self._get_label_id(part)
                     part_bbox = self._parse_bbox(part_elem)
-                    group = obj_id
 
                     if self._task is not VocTask.person_layout:
                         break
                     if part_bbox is None:
                         continue
+                    has_parts = True
                     item_annotations.append(Bbox(*part_bbox, label=part_label_id,
                         group=group))
 
-                if self._task is VocTask.person_layout and not group:
+                if self._task is VocTask.person_layout and not has_parts:
                     continue
                 if self._task is VocTask.action_classification and not actions:
                     continue
@@ -699,7 +713,7 @@ def __init__(self, path):
 
     def _load_categories(self):
         from collections import OrderedDict
-        from datumaro.components.formats.voc import VocAction
+        from .format import VocAction
         label_map = OrderedDict((a.name, [[], [], []]) for a in VocAction)
         self._categories = make_voc_categories(label_map)