From a3116488b857fa39eb0d1ef2f061a3de28f0ebde Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafael=20Kazuo=20Sato=20Simi=C3=A3o?=
Date: Fri, 31 May 2019 18:09:51 -0300
Subject: [PATCH] YOLO adapted version of PASCAL VOC converter.py (#454)

---
 CHANGELOG.md                |   1 +
 utils/README.md             |   9 +-
 utils/yolo/__init__.py      |   0
 utils/yolo/converter.md     |  38 ++++++
 utils/yolo/converter.py     | 264 ++++++++++++++++++++++++++++++++++++
 utils/yolo/requirements.txt |   4 +
 6 files changed, 312 insertions(+), 4 deletions(-)
 create mode 100644 utils/yolo/__init__.py
 create mode 100644 utils/yolo/converter.md
 create mode 100644 utils/yolo/converter.py
 create mode 100644 utils/yolo/requirements.txt

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d40ae678edd9..b50a48747b9e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 ### Added
+- A converter to YOLO format
 - Installation guide
 - Linear interpolation for a single point
 - Video frame filter
diff --git a/utils/README.md b/utils/README.md
index 68735adfb65b..2d856c6c0cf3 100644
--- a/utils/README.md
+++ b/utils/README.md
@@ -4,7 +4,8 @@
 ## Description
 This folder contains some useful utilities for Computer Vision Annotation Tool (CVAT).
 To read about a certain utility please choose a link:
-- [Convert CVAT XML to PASCAL VOC](voc/converter.md)
-- [Convert CVAT XML to MS COCO](coco/converter.md)
-- [Convert CVAT XML to PNG mask](mask/converter.md)
-- [Convert CVAT XML to TFRECORDS](tfrecords/converter.md)
+- [Convert CVAT XML to PASCAL VOC](voc/converter.md)
+- [Convert CVAT XML to MS COCO](coco/converter.md)
+- [Convert CVAT XML to PNG mask](mask/converter.md)
+- [Convert CVAT XML to TFRECORDS](tfrecords/converter.md)
+- [Convert CVAT XML to YOLO](yolo/converter.md)
diff --git a/utils/yolo/__init__.py b/utils/yolo/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/utils/yolo/converter.md b/utils/yolo/converter.md
new file mode 100644
index 000000000000..6ac3ec051766
--- /dev/null
+++ b/utils/yolo/converter.md
@@ -0,0 +1,38 @@
+# Utility for converting CVAT XML annotation file to YOLO format
+
+## Description
+
+Given a CVAT XML annotation file, this script writes the annotations in
+YOLO format into a given directory. This implementation supports both
+interpolation tracks from video and annotated images.
+
+## Installation
+
+Install the necessary packages and create a virtual environment.
+
+```bash
+sudo apt-get update
+sudo apt-get install -y --no-install-recommends python3-pip python3-venv python3-dev
+```
+
+```bash
+python3 -m venv .env
+. .env/bin/activate
+cat requirements.txt | xargs -n 1 -L 1 pip install
+```
+
+## Usage
+
+Run the script inside the virtual environment:
+
+```bash
+python converter.py --cvat-xml <CVAT XML> --image-dir <image directory> --output-dir <output directory>
+```
+
+In case you need to download frames from an annotated video file submitted to CVAT:
+
+```bash
+python converter.py --cvat-xml <CVAT XML> --output-dir <output directory> --username <CVAT username> --password <CVAT password>
+```
+
+Please run `python converter.py --help` for more details.
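+
+## Output format
+
+One `.txt` file is written next to each image, with one line per object:
+`<label_id> <x_center> <y_center> <width> <height>`, where the four box
+values are normalized by the image width and height. As a rough
+illustration (the numbers below are made up, not taken from a real task),
+a 300x300 box with its top-left corner at (200, 150) on a 1280x720 image,
+for the label with index 0, becomes:
+
+```
+0 0.273438 0.416667 0.234375 0.416667
+```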
+""" + +import os +import argparse +import glog as log +from lxml import etree +import requests + + +def parse_args(): + """Parse arguments of command line""" + parser = argparse.ArgumentParser( + description='Convert CVAT XML annotations to YOLO format' + ) + + parser.add_argument( + '--cvat-xml', metavar='FILE', required=True, + help='input file with CVAT annotation in xml format' + ) + + parser.add_argument( + '--image-dir', metavar='DIRECTORY', required=False, + help='directory which contains original images' + ) + + parser.add_argument( + '--output-dir', metavar='DIRECTORY', required=True, + help='directory for output annotations in YOLO format' + ) + + parser.add_argument( + '--username', metavar='USERNAME', required=False, + help='Username from CVAT Login page, required to download images' + ) + + parser.add_argument( + '--password', metavar='PASSWORD', required=False, + help='Password from CVAT Login page, required to download images' + ) + + parser.add_argument( + '--labels', metavar='ILABELS', required=False, + help='Labels (separated by comma) to extract. Example: car,truck,motorcycle' + ) + + return parser.parse_args() + + +def process_cvat_xml(xml_file, image_dir, output_dir,username,password,ilabels): + """ + Transforms a single XML in CVAT format to YOLO TXT files and download images when not in IMAGE_DIR + + :param xml_file: CVAT format XML + :param image_dir: image directory of the dataset + :param output_dir: directory of annotations with YOLO format + :param username: Username used to login CVAT. Required to download images + :param password: Password used to login CVAT. Required to download images + :param ilabels: Comma separated ordered labels + :return: + """ + KNOWN_TAGS = {'box', 'image', 'attribute'} + + if (image_dir is None): + image_dir=os.path.join(output_dir,"data/obj") + os.makedirs(image_dir, exist_ok=True) + + os.makedirs(output_dir, exist_ok=True) + cvat_xml = etree.parse(xml_file) + basename = os.path.splitext( os.path.basename( xml_file ) )[0] + current_labels = {} + traintxt = "" + auto_lbl_count = 0 + + if (ilabels is not None): + vlabels=ilabels.split(',') + for _label in vlabels: + current_labels[_label]=auto_lbl_count + auto_lbl_count+=1 + + tracks= cvat_xml.findall( './/track' ) + + if (tracks is not None) and (len(tracks) > 0): + frames = {} + + for track in tracks: + trackid = int(track.get("id")) + label = track.get("label") + boxes = track.findall( './box' ) + for box in boxes: + frameid = int(box.get('frame')) + outside = int(box.get('outside')) + #occluded = int(box.get('occluded')) #currently unused + #keyframe = int(box.get('keyframe')) #currently unused + xtl = float(box.get('xtl')) + ytl = float(box.get('ytl')) + xbr = float(box.get('xbr')) + ybr = float(box.get('ybr')) + + frame = frames.get( frameid, {} ) + + if outside == 0: + frame[ trackid ] = { 'xtl': xtl, 'ytl': ytl, 'xbr': xbr, 'ybr': ybr, 'label': label } + + frames[ frameid ] = frame + + width = int(cvat_xml.find('.//original_size/width').text) + height = int(cvat_xml.find('.//original_size/height').text) + + taskid = int(cvat_xml.find('.//task/id').text) + + urlsegment = cvat_xml.find(".//segments/segment/url").text + urlbase = urlsegment.split("?")[0] + + httpclient = requests.session() + httpclient.get(urlbase) + + csrftoken = "none" + sessionid = "none" + + # Spit out a list of each object for each frame + for frameid in sorted(frames.keys()): + image_name = "%s_%08d.jpg" % (basename, frameid) + image_path = os.path.join(image_dir, image_name) + if not 
+    tracks = cvat_xml.findall('.//track')
+
+    if tracks is not None and len(tracks) > 0:
+        frames = {}
+
+        for track in tracks:
+            trackid = int(track.get("id"))
+            label = track.get("label")
+            boxes = track.findall('./box')
+            for box in boxes:
+                frameid = int(box.get('frame'))
+                outside = int(box.get('outside'))
+                # occluded = int(box.get('occluded'))  # currently unused
+                # keyframe = int(box.get('keyframe'))  # currently unused
+                xtl = float(box.get('xtl'))
+                ytl = float(box.get('ytl'))
+                xbr = float(box.get('xbr'))
+                ybr = float(box.get('ybr'))
+
+                frame = frames.get(frameid, {})
+
+                if outside == 0:
+                    frame[trackid] = {'xtl': xtl, 'ytl': ytl, 'xbr': xbr, 'ybr': ybr, 'label': label}
+
+                frames[frameid] = frame
+
+        width = int(cvat_xml.find('.//original_size/width').text)
+        height = int(cvat_xml.find('.//original_size/height').text)
+
+        taskid = int(cvat_xml.find('.//task/id').text)
+
+        urlsegment = cvat_xml.find(".//segments/segment/url").text
+        urlbase = urlsegment.split("?")[0]
+
+        httpclient = requests.session()
+        httpclient.get(urlbase)
+
+        csrftoken = "none"
+        sessionid = "none"
+
+        # Spit out a list of each object for each frame
+        for frameid in sorted(frames.keys()):
+            image_name = "%s_%08d.jpg" % (basename, frameid)
+            image_path = os.path.join(image_dir, image_name)
+            if not os.path.exists(image_path):
+                if username is None:
+                    log.warn('{} image cannot be found. Is `{}` image directory correct?\n'.format(image_path, image_dir))
+                else:
+                    log.info('{} image cannot be found. Downloading from task ID {}\n'.format(image_path, taskid))
+
+                    if sessionid == "none":
+                        if "csrftoken" in httpclient.cookies:
+                            csrftoken = httpclient.cookies["csrftoken"]
+                        elif "csrf" in httpclient.cookies:
+                            csrftoken = httpclient.cookies["csrf"]
+
+                        login_data = dict(username=username, password=password,
+                                          csrfmiddlewaretoken=csrftoken, next='/dashboard')
+
+                        urllogin = urlbase + "/auth/login"
+                        httpclient.post(urllogin, data=login_data,
+                                        headers=dict(Referer=urllogin))
+
+                        if "sessionid" in httpclient.cookies:
+                            sessionid = httpclient.cookies["sessionid"]
+
+                    url = urlbase + "/api/v1/tasks/" + str(taskid) + "/frames/" + str(frameid)
+
+                    req = httpclient.get(url, headers=dict(
+                        csrftoken=csrftoken, sessionid=sessionid))
+
+                    with open(image_path, 'wb') as fo:
+                        fo.write(req.content)
+                    print('Image saved as %s\n' % image_path)
+
+            frame = frames[frameid]
+
+            _yoloAnnotationContent = ""
+
+            objids = sorted(frame.keys())
+
+            for objid in objids:
+                box = frame[objid]
+
+                label = box.get('label')
+                xmin = float(box.get('xtl'))
+                ymin = float(box.get('ytl'))
+                xmax = float(box.get('xbr'))
+                ymax = float(box.get('ybr'))
+
+                if label not in current_labels:
+                    current_labels[label] = auto_lbl_count
+                    auto_lbl_count += 1
+
+                labelid = current_labels[label]
+                yolo_x = (xmin + ((xmax - xmin) / 2)) / width
+                yolo_y = (ymin + ((ymax - ymin) / 2)) / height
+                yolo_w = (xmax - xmin) / width
+                yolo_h = (ymax - ymin) / height
+
+                if len(_yoloAnnotationContent) != 0:
+                    _yoloAnnotationContent += "\n"
+
+                _yoloAnnotationContent += str(labelid) + " " + "{:.6f}".format(yolo_x) + " " + "{:.6f}".format(
+                    yolo_y) + " " + "{:.6f}".format(yolo_w) + " " + "{:.6f}".format(yolo_h)
+
+            anno_name = os.path.basename(os.path.splitext(image_name)[0] + '.txt')
+            anno_path = os.path.join(image_dir, anno_name)
+
+            _yoloFile = open(anno_path, "w", newline="\n")
+            _yoloFile.write(_yoloAnnotationContent)
+            _yoloFile.close()
+
+            if len(traintxt) != 0:
+                traintxt += "\n"
+
+            traintxt += image_path
+
+    else:
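+        # For image tasks, each annotated image appears as an <image>
+        # element, roughly like this (illustrative snippet, not taken from
+        # a real task):
+        #
+        #   <image name="frame_000000.jpg" width="1280" height="720">
+        #     <box label="car" xtl="200.0" ytl="150.0" xbr="500.0" ybr="450.0"/>
+        #   </image>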
+        for img_tag in cvat_xml.findall('image'):
+            image_name = img_tag.get('name')
+            width = int(img_tag.get('width'))
+            height = int(img_tag.get('height'))
+            image_path = os.path.join(image_dir, image_name)
+            if not os.path.exists(image_path):
+                log.warn('{} image cannot be found. Is `{}` image directory correct?'.
+                         format(image_path, image_dir))
+
+            unknown_tags = {x.tag for x in img_tag.iter()}.difference(KNOWN_TAGS)
+            if unknown_tags:
+                log.warn('Ignoring tags for image {}: {}'.format(image_path, unknown_tags))
+
+            _yoloAnnotationContent = ""
+
+            for box in img_tag.findall('box'):
+                label = box.get('label')
+                xmin = float(box.get('xtl'))
+                ymin = float(box.get('ytl'))
+                xmax = float(box.get('xbr'))
+                ymax = float(box.get('ybr'))
+
+                if label not in current_labels:
+                    current_labels[label] = auto_lbl_count
+                    auto_lbl_count += 1
+
+                labelid = current_labels[label]
+                yolo_x = (xmin + ((xmax - xmin) / 2)) / width
+                yolo_y = (ymin + ((ymax - ymin) / 2)) / height
+                yolo_w = (xmax - xmin) / width
+                yolo_h = (ymax - ymin) / height
+
+                if len(_yoloAnnotationContent) != 0:
+                    _yoloAnnotationContent += "\n"
+
+                _yoloAnnotationContent += str(labelid) + " " + "{:.6f}".format(yolo_x) + " " + "{:.6f}".format(
+                    yolo_y) + " " + "{:.6f}".format(yolo_w) + " " + "{:.6f}".format(yolo_h)
+
+            anno_name = os.path.basename(os.path.splitext(image_name)[0] + '.txt')
+            anno_path = os.path.join(image_dir, anno_name)
+
+            _yoloFile = open(anno_path, "w", newline="\n")
+            _yoloFile.write(_yoloAnnotationContent)
+            _yoloFile.close()
+
+    traintxt_file = open(os.path.join(output_dir, "train.txt"), "w", newline="\n")
+    traintxt_file.write(traintxt)
+    traintxt_file.close()
+
+
+def main():
+    args = parse_args()
+    process_cvat_xml(args.cvat_xml, args.image_dir, args.output_dir,
+                     args.username, args.password, args.labels)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/utils/yolo/requirements.txt b/utils/yolo/requirements.txt
new file mode 100644
index 000000000000..b76cddb6fc3c
--- /dev/null
+++ b/utils/yolo/requirements.txt
@@ -0,0 +1,4 @@
+argparse>=1.1
+lxml>=3.5.0
+glog>=0.3.1
+requests==2.22.0