From 24bfb1876808807e2cb595b31aea2ae9819e2ec0 Mon Sep 17 00:00:00 2001 From: Xinyu Wang <45810070+xinke-wang@users.noreply.github.com> Date: Tue, 20 Dec 2022 20:15:34 +1030 Subject: [PATCH] [Feature] Add TextOCR to Dataset Preparer (#1543) * add textocr * cfg gen Co-authored-by: gaotongxiao --- dataset_zoo/textocr/metafile.yml | 27 ++++++++++++++ dataset_zoo/textocr/sample_anno.md | 57 +++++++++++++++++++++++++++++ dataset_zoo/textocr/textdet.py | 52 ++++++++++++++++++++++++++ dataset_zoo/textocr/textrecog.py | 5 +++ dataset_zoo/textocr/textspotting.py | 5 +++ 5 files changed, 146 insertions(+) create mode 100644 dataset_zoo/textocr/metafile.yml create mode 100644 dataset_zoo/textocr/sample_anno.md create mode 100644 dataset_zoo/textocr/textdet.py create mode 100644 dataset_zoo/textocr/textrecog.py create mode 100644 dataset_zoo/textocr/textspotting.py diff --git a/dataset_zoo/textocr/metafile.yml b/dataset_zoo/textocr/metafile.yml new file mode 100644 index 000000000..766ec5f01 --- /dev/null +++ b/dataset_zoo/textocr/metafile.yml @@ -0,0 +1,27 @@ +Name: 'Text OCR' +Paper: + Title: 'TextOCR: Towards large-scale end-to-end reasoning for arbitrary-shaped scene text' + URL: https://openaccess.thecvf.com/content/CVPR2021/papers/Singh_TextOCR_Towards_Large-Scale_End-to-End_Reasoning_for_Arbitrary-Shaped_Scene_Text_CVPR_2021_paper.pdf + Venue: CVPR + Year: '2021' + BibTeX: '@inproceedings{singh2021textocr, + title={{TextOCR}: Towards large-scale end-to-end reasoning for arbitrary-shaped scene text}, + author={Singh, Amanpreet and Pang, Guan and Toh, Mandy and Huang, Jing and Galuba, Wojciech and Hassner, Tal}, + journal={The Conference on Computer Vision and Pattern Recognition}, + year={2021}}' +Data: + Website: https://paperswithcode.com/dataset/textocr + Language: + - English + Scene: + - Natural Scene + Granularity: + - Word + Tasks: + - textdet + - textrecog + - textspotting + License: + Type: CC BY 4.0 + Link: https://creativecommons.org/licenses/by/4.0/ + Format: 
.json diff --git a/dataset_zoo/textocr/sample_anno.md b/dataset_zoo/textocr/sample_anno.md new file mode 100644 index 000000000..4659f16a1 --- /dev/null +++ b/dataset_zoo/textocr/sample_anno.md @@ -0,0 +1,57 @@ +**Text Detection/Recognition/Spotting** + +```json +{ + "imgs": { + "OpenImages_ImageID_1": { + "id": "OpenImages_ImageID_1", + "width": "INT, Width of the image", + "height": "INT, Height of the image", + "set": "Split train|val|test", + "filename": "train|test/OpenImages_ImageID_1.jpg" + }, + "OpenImages_ImageID_2": { + "...": "..." + } + }, + "anns": { + "OpenImages_ImageID_1_1": { + "id": "STR, OpenImages_ImageID_1_1, Specifies the nth annotation for an image", + "image_id": "OpenImages_ImageID_1", + "bbox": [ + "FLOAT x1", + "FLOAT y1", + "FLOAT x2", + "FLOAT y2" + ], + "points": [ + "FLOAT x1", + "FLOAT y1", + "FLOAT x2", + "FLOAT y2", + "...", + "FLOAT xN", + "FLOAT yN" + ], + "utf8_string": "text for this annotation", + "area": "FLOAT, area of this box" + }, + "OpenImages_ImageID_1_2": { + "...": "..." + }, + "OpenImages_ImageID_2_1": { + "...": "..." + } + }, + "img2Anns": { + "OpenImages_ImageID_1": [ + "OpenImages_ImageID_1_1", + "OpenImages_ImageID_1_2", + "OpenImages_ImageID_1_3" + ], + "OpenImages_ImageID_N": [ + "..." 
+ ] + } +} +``` diff --git a/dataset_zoo/textocr/textdet.py b/dataset_zoo/textocr/textdet.py new file mode 100644 index 000000000..056489f91 --- /dev/null +++ b/dataset_zoo/textocr/textdet.py @@ -0,0 +1,52 @@ +data_root = 'data/textocr' +cache_path = 'data/cache' + +data_obtainer = dict( + type='NaiveDataObtainer', + cache_path=cache_path, + data_root=data_root, + files=[ + dict( + url='https://dl.fbaipublicfiles.com/textvqa/images/' + 'train_val_images.zip', + save_name='textocr_textdet_train_val_img.zip', + md5='d12dd8098899044e4ae1af34db7ecfef', + split=['train', 'val'], + content=['image'], + mapping=[[ + 'textocr_textdet_train_val_img/train_images', + 'textdet_imgs/train' + ]]), + dict( + url='https://dl.fbaipublicfiles.com/textvqa/data/textocr/' + 'TextOCR_0.1_train.json', + save_name='textocr_textdet_train.json', + md5='0f8ba1beefd2ca4d08a4f82bcbe6cfb4', + split=['train'], + content=['annotation'], + mapping=[['textocr_textdet_train.json', + 'annotations/train.json']]), + dict( + url='https://dl.fbaipublicfiles.com/textvqa/data/textocr/' + 'TextOCR_0.1_val.json', + save_name='textocr_textdet_val.json', + md5='fb151383ea7b3c530cde9ef0d5c08347', + split=['val'], + content=['annotation'], + mapping=[['textocr_textdet_val.json', 'annotations/val.json']]), + ]) + +data_converter = dict( + type='TextDetDataConverter', + splits=['train', 'val'], + data_root=data_root, + gatherer=dict( + type='mono_gather', train_ann='train.json', val_ann='val.json'), + parser=dict( + type='COCOTextDetAnnParser', + variant='textocr', + data_root=data_root + '/textdet_imgs/'), + dumper=dict(type='JsonDumper'), + delete=['annotations', 'textocr_textdet_train_val_img']) + +config_generator = dict(type='TextDetConfigGenerator', data_root=data_root) diff --git a/dataset_zoo/textocr/textrecog.py b/dataset_zoo/textocr/textrecog.py new file mode 100644 index 000000000..212c7e7d1 --- /dev/null +++ b/dataset_zoo/textocr/textrecog.py @@ -0,0 +1,5 @@ +_base_ = ['textdet.py'] + +data_converter = 
dict(type='TextRecogCropConverter') + +config_generator = dict(type='TextRecogConfigGenerator') diff --git a/dataset_zoo/textocr/textspotting.py b/dataset_zoo/textocr/textspotting.py new file mode 100644 index 000000000..88486337b --- /dev/null +++ b/dataset_zoo/textocr/textspotting.py @@ -0,0 +1,5 @@ +_base_ = ['textdet.py'] + +data_converter = dict(type='TextSpottingDataConverter') + +config_generator = dict(type='TextSpottingConfigGenerator')