From a3116488b857fa39eb0d1ef2f061a3de28f0ebde Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafael=20Kazuo=20Sato=20Simi=C3=A3o?=
Date: Fri, 31 May 2019 18:09:51 -0300
Subject: [PATCH] YOLO adapted version of PASCAL VOC converter.py (#454)

---
 CHANGELOG.md                |   1 +
 utils/README.md             |   9 +-
 utils/yolo/__init__.py      |   0
 utils/yolo/converter.md     |  38 ++++++
 utils/yolo/converter.py     | 264 ++++++++++++++++++++++++++++++++++++
 utils/yolo/requirements.txt |   4 +
 6 files changed, 312 insertions(+), 4 deletions(-)
 create mode 100644 utils/yolo/__init__.py
 create mode 100644 utils/yolo/converter.md
 create mode 100644 utils/yolo/converter.py
 create mode 100644 utils/yolo/requirements.txt

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d40ae678edd9..b50a48747b9e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 ### Added
+- A converter to YOLO format
 - Installation guide
 - Linear interpolation for a single point
 - Video frame filter
diff --git a/utils/README.md b/utils/README.md
index 68735adfb65b..2d856c6c0cf3 100644
--- a/utils/README.md
+++ b/utils/README.md
@@ -4,7 +4,8 @@
 ## Description
 This folder contains some useful utilities for Computer Vision Annotation Tool (CVAT).
 To read about a certain utility please choose a link:
-- [Convert CVAT XML to PASCAL VOC](voc/converter.md)
-- [Convert CVAT XML to MS COCO](coco/converter.md)
-- [Convert CVAT XML to PNG mask](mask/converter.md)
-- [Convert CVAT XML to TFRECORDS](tfrecords/converter.md)
+- [Convert CVAT XML to PASCAL VOC](voc/converter.md)
+- [Convert CVAT XML to MS COCO](coco/converter.md)
+- [Convert CVAT XML to PNG mask](mask/converter.md)
+- [Convert CVAT XML to TFRECORDS](tfrecords/converter.md)
+- [Convert CVAT XML to YOLO](yolo/converter.md)
diff --git a/utils/yolo/__init__.py b/utils/yolo/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/utils/yolo/converter.md b/utils/yolo/converter.md
new file mode 100644
index 000000000000..6ac3ec051766
--- /dev/null
+++ b/utils/yolo/converter.md
@@ -0,0 +1,38 @@
+# Utility for converting CVAT XML annotation file to YOLO format
+
+## Description
+
+Given a CVAT XML annotation file, this script writes the annotations in
+YOLO format into a given directory. This implementation supports both
+interpolation tracks from video and annotated images.
+
+## Installation
+
+Install the necessary packages and create a virtual environment.
+
+```bash
+sudo apt-get update
+sudo apt-get install -y --no-install-recommends python3-pip python3-venv python3-dev
+```
+
+```bash
+python3 -m venv .env
+. .env/bin/activate
+cat requirements.txt | xargs -n 1 -L 1 pip install
+```
+
+## Usage
+
+Run the script inside the virtual environment:
+
+```bash
+python converter.py --cvat-xml <CVAT XML> --image-dir <image directory> --output-dir <output directory>
+```
+
+In case you need to download frames from an annotated video file submitted to CVAT:
+
+```bash
+python converter.py --cvat-xml <CVAT XML> --output-dir <output directory> --username <CVAT username> --password <CVAT password>
+```
+
+Please run `python converter.py --help` for more details.
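+
+## Output format
+
+One `.txt` file is written next to each image, with one line per object:
+`<label_id> <x_center> <y_center> <width> <height>`, where the four box
+values are normalized by the image width and height. As a rough
+illustration (the numbers below are made up, not taken from a real task),
+a 300x300 box with its top-left corner at (200, 150) on a 1280x720 image,
+for the label with index 0, becomes:
+
+```
+0 0.273438 0.416667 0.234375 0.416667
+```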
+""" + +import os +import argparse +import glog as log +from lxml import etree +import requests + + +def parse_args(): + """Parse arguments of command line""" + parser = argparse.ArgumentParser( + description='Convert CVAT XML annotations to YOLO format' + ) + + parser.add_argument( + '--cvat-xml', metavar='FILE', required=True, + help='input file with CVAT annotation in xml format' + ) + + parser.add_argument( + '--image-dir', metavar='DIRECTORY', required=False, + help='directory which contains original images' + ) + + parser.add_argument( + '--output-dir', metavar='DIRECTORY', required=True, + help='directory for output annotations in YOLO format' + ) + + parser.add_argument( + '--username', metavar='USERNAME', required=False, + help='Username from CVAT Login page, required to download images' + ) + + parser.add_argument( + '--password', metavar='PASSWORD', required=False, + help='Password from CVAT Login page, required to download images' + ) + + parser.add_argument( + '--labels', metavar='ILABELS', required=False, + help='Labels (separated by comma) to extract. Example: car,truck,motorcycle' + ) + + return parser.parse_args() + + +def process_cvat_xml(xml_file, image_dir, output_dir,username,password,ilabels): + """ + Transforms a single XML in CVAT format to YOLO TXT files and download images when not in IMAGE_DIR + + :param xml_file: CVAT format XML + :param image_dir: image directory of the dataset + :param output_dir: directory of annotations with YOLO format + :param username: Username used to login CVAT. Required to download images + :param password: Password used to login CVAT. Required to download images + :param ilabels: Comma separated ordered labels + :return: + """ + KNOWN_TAGS = {'box', 'image', 'attribute'} + + if (image_dir is None): + image_dir=os.path.join(output_dir,"data/obj") + os.makedirs(image_dir, exist_ok=True) + + os.makedirs(output_dir, exist_ok=True) + cvat_xml = etree.parse(xml_file) + basename = os.path.splitext( os.path.basename( xml_file ) )[0] + current_labels = {} + traintxt = "" + auto_lbl_count = 0 + + if (ilabels is not None): + vlabels=ilabels.split(',') + for _label in vlabels: + current_labels[_label]=auto_lbl_count + auto_lbl_count+=1 + + tracks= cvat_xml.findall( './/track' ) + + if (tracks is not None) and (len(tracks) > 0): + frames = {} + + for track in tracks: + trackid = int(track.get("id")) + label = track.get("label") + boxes = track.findall( './box' ) + for box in boxes: + frameid = int(box.get('frame')) + outside = int(box.get('outside')) + #occluded = int(box.get('occluded')) #currently unused + #keyframe = int(box.get('keyframe')) #currently unused + xtl = float(box.get('xtl')) + ytl = float(box.get('ytl')) + xbr = float(box.get('xbr')) + ybr = float(box.get('ybr')) + + frame = frames.get( frameid, {} ) + + if outside == 0: + frame[ trackid ] = { 'xtl': xtl, 'ytl': ytl, 'xbr': xbr, 'ybr': ybr, 'label': label } + + frames[ frameid ] = frame + + width = int(cvat_xml.find('.//original_size/width').text) + height = int(cvat_xml.find('.//original_size/height').text) + + taskid = int(cvat_xml.find('.//task/id').text) + + urlsegment = cvat_xml.find(".//segments/segment/url").text + urlbase = urlsegment.split("?")[0] + + httpclient = requests.session() + httpclient.get(urlbase) + + csrftoken = "none" + sessionid = "none" + + # Spit out a list of each object for each frame + for frameid in sorted(frames.keys()): + image_name = "%s_%08d.jpg" % (basename, frameid) + image_path = os.path.join(image_dir, image_name) + if not 
+    tracks = cvat_xml.findall('.//track')
+
+    if tracks is not None and len(tracks) > 0:
+        frames = {}
+
+        for track in tracks:
+            trackid = int(track.get("id"))
+            label = track.get("label")
+            boxes = track.findall('./box')
+            for box in boxes:
+                frameid = int(box.get('frame'))
+                outside = int(box.get('outside'))
+                # occluded = int(box.get('occluded'))  # currently unused
+                # keyframe = int(box.get('keyframe'))  # currently unused
+                xtl = float(box.get('xtl'))
+                ytl = float(box.get('ytl'))
+                xbr = float(box.get('xbr'))
+                ybr = float(box.get('ybr'))
+
+                frame = frames.get(frameid, {})
+
+                if outside == 0:
+                    frame[trackid] = {'xtl': xtl, 'ytl': ytl, 'xbr': xbr, 'ybr': ybr, 'label': label}
+
+                frames[frameid] = frame
+
+        width = int(cvat_xml.find('.//original_size/width').text)
+        height = int(cvat_xml.find('.//original_size/height').text)
+
+        taskid = int(cvat_xml.find('.//task/id').text)
+
+        urlsegment = cvat_xml.find(".//segments/segment/url").text
+        urlbase = urlsegment.split("?")[0]
+
+        httpclient = requests.session()
+        httpclient.get(urlbase)
+
+        csrftoken = "none"
+        sessionid = "none"
+
+        # Spit out a list of each object for each frame
+        for frameid in sorted(frames.keys()):
+            image_name = "%s_%08d.jpg" % (basename, frameid)
+            image_path = os.path.join(image_dir, image_name)
+            if not os.path.exists(image_path):
+                if username is None:
+                    log.warn('{} image cannot be found. Is `{}` image directory correct?\n'.format(image_path, image_dir))
+                else:
+                    log.info('{} image cannot be found. Downloading from task ID {}\n'.format(image_path, taskid))
+
+                    if sessionid == "none":
+                        if "csrftoken" in httpclient.cookies:
+                            csrftoken = httpclient.cookies["csrftoken"]
+                        elif "csrf" in httpclient.cookies:
+                            csrftoken = httpclient.cookies["csrf"]
+
+                        login_data = dict(username=username, password=password,
+                                          csrfmiddlewaretoken=csrftoken, next='/dashboard')
+
+                        urllogin = urlbase + "/auth/login"
+                        httpclient.post(urllogin, data=login_data,
+                                        headers=dict(Referer=urllogin))
+
+                        if "sessionid" in httpclient.cookies:
+                            sessionid = httpclient.cookies["sessionid"]
+
+                    url = urlbase + "/api/v1/tasks/" + str(taskid) + "/frames/" + str(frameid)
+
+                    req = httpclient.get(url, headers=dict(
+                        csrftoken=csrftoken, sessionid=sessionid))
+
+                    with open(image_path, 'wb') as fo:
+                        fo.write(req.content)
+                    print('Image saved as %s\n' % image_path)
+
+            frame = frames[frameid]
+
+            _yoloAnnotationContent = ""
+
+            objids = sorted(frame.keys())
+
+            for objid in objids:
+                box = frame[objid]
+
+                label = box.get('label')
+                xmin = float(box.get('xtl'))
+                ymin = float(box.get('ytl'))
+                xmax = float(box.get('xbr'))
+                ymax = float(box.get('ybr'))
+
+                if label not in current_labels:
+                    current_labels[label] = auto_lbl_count
+                    auto_lbl_count += 1
+
+                labelid = current_labels[label]
+                yolo_x = (xmin + ((xmax - xmin) / 2)) / width
+                yolo_y = (ymin + ((ymax - ymin) / 2)) / height
+                yolo_w = (xmax - xmin) / width
+                yolo_h = (ymax - ymin) / height
+
+                if len(_yoloAnnotationContent) != 0:
+                    _yoloAnnotationContent += "\n"
+
+                _yoloAnnotationContent += str(labelid) + " " + "{:.6f}".format(yolo_x) + " " + "{:.6f}".format(
+                    yolo_y) + " " + "{:.6f}".format(yolo_w) + " " + "{:.6f}".format(yolo_h)
+
+            anno_name = os.path.basename(os.path.splitext(image_name)[0] + '.txt')
+            anno_path = os.path.join(image_dir, anno_name)
+
+            _yoloFile = open(anno_path, "w", newline="\n")
+            _yoloFile.write(_yoloAnnotationContent)
+            _yoloFile.close()
+
+            if len(traintxt) != 0:
+                traintxt += "\n"
+
+            traintxt += image_path
+
+    else:
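+        # For image tasks, each annotated image appears as an <image>
+        # element, roughly like this (illustrative snippet, not taken from
+        # a real task):
+        #
+        #   <image name="frame_000000.jpg" width="1280" height="720">
+        #     <box label="car" xtl="200.0" ytl="150.0" xbr="500.0" ybr="450.0"/>
+        #   </image>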
+        for img_tag in cvat_xml.findall('image'):
+            image_name = img_tag.get('name')
+            width = int(img_tag.get('width'))
+            height = int(img_tag.get('height'))
+            image_path = os.path.join(image_dir, image_name)
+            if not os.path.exists(image_path):
+                log.warn('{} image cannot be found. Is `{}` image directory correct?'.
+                         format(image_path, image_dir))
+
+            unknown_tags = {x.tag for x in img_tag.iter()}.difference(KNOWN_TAGS)
+            if unknown_tags:
+                log.warn('Ignoring tags for image {}: {}'.format(image_path, unknown_tags))
+
+            _yoloAnnotationContent = ""
+
+            for box in img_tag.findall('box'):
+                label = box.get('label')
+                xmin = float(box.get('xtl'))
+                ymin = float(box.get('ytl'))
+                xmax = float(box.get('xbr'))
+                ymax = float(box.get('ybr'))
+
+                if label not in current_labels:
+                    current_labels[label] = auto_lbl_count
+                    auto_lbl_count += 1
+
+                labelid = current_labels[label]
+                yolo_x = (xmin + ((xmax - xmin) / 2)) / width
+                yolo_y = (ymin + ((ymax - ymin) / 2)) / height
+                yolo_w = (xmax - xmin) / width
+                yolo_h = (ymax - ymin) / height
+
+                if len(_yoloAnnotationContent) != 0:
+                    _yoloAnnotationContent += "\n"
+
+                _yoloAnnotationContent += str(labelid) + " " + "{:.6f}".format(yolo_x) + " " + "{:.6f}".format(
+                    yolo_y) + " " + "{:.6f}".format(yolo_w) + " " + "{:.6f}".format(yolo_h)
+
+            anno_name = os.path.basename(os.path.splitext(image_name)[0] + '.txt')
+            anno_path = os.path.join(image_dir, anno_name)
+
+            _yoloFile = open(anno_path, "w", newline="\n")
+            _yoloFile.write(_yoloAnnotationContent)
+            _yoloFile.close()
+
+    traintxt_file = open(os.path.join(output_dir, "train.txt"), "w", newline="\n")
+    traintxt_file.write(traintxt)
+    traintxt_file.close()
+
+
+def main():
+    args = parse_args()
+    process_cvat_xml(args.cvat_xml, args.image_dir, args.output_dir,
+                     args.username, args.password, args.labels)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/utils/yolo/requirements.txt b/utils/yolo/requirements.txt
new file mode 100644
index 000000000000..b76cddb6fc3c
--- /dev/null
+++ b/utils/yolo/requirements.txt
@@ -0,0 +1,4 @@
+argparse>=1.1
+lxml>=3.5.0
+glog>=0.3.1
+requests==2.22.0