[Feature] Add Funsd to dataset preparer (#1550)

* add funsd * done * done Co-authored-by: gaotongxiao <[email protected]>
open-mmlab · Dec 20, 2022 · fb78c94 · fb78c94
1 parent 4396e8f
commit fb78c94
Show file tree

Hide file tree

Showing 7 changed files with 190 additions and 1 deletion.
diff --git a/dataset_zoo/funsd/metafile.yml b/dataset_zoo/funsd/metafile.yml
@@ -0,0 +1,27 @@
+Name: 'FUNSD'
+Paper:
+  Title: 'FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents'
+  URL: https://arxiv.org/pdf/1905.13538.pdf
+  Venue: ICDAR
+  Year: '2019'
+  BibTeX: '@inproceedings{jaume2019,
+    title = {FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents},
+    author = {Guillaume Jaume and Hazim Kemal Ekenel and Jean-Philippe Thiran},
+    booktitle = {Accepted to ICDAR-OST},
+    year = {2019}}'
+Data:
+  Website: https://guillaumejaume.github.io/FUNSD/
+  Language:
+    - English
+  Scene:
+    - Document
+  Granularity:
+    - Word
+  Tasks:
+    - textdet
+    - textrecog
+    - textspotting
+  License:
+    Type: FUNSD License
+    Link: https://guillaumejaume.github.io/FUNSD/work/
+  Format: .json
diff --git a/dataset_zoo/funsd/sample_anno.md b/dataset_zoo/funsd/sample_anno.md
@@ -0,0 +1,73 @@
+**Text Detection/Recognition/Spotting**
+
+```json
+{
+  "form": [
+    {
+      "id": 0,
+      "text": "Registration No.",
+      "box": [
+          94,
+          169,
+          191,
+          186
+      ],
+      "linking": [
+          [
+              0,
+              1
+          ]
+      ],
+      "label": "question",
+      "words": [
+          {
+              "text": "Registration",
+              "box": [
+                  94,
+                  169,
+                  168,
+                  186
+              ]
+          },
+          {
+              "text": "No.",
+              "box": [
+                  170,
+                  169,
+                  191,
+                  183
+              ]
+          }
+      ]
+    },
+    {
+      "id": 1,
+      "text": "533",
+      "box": [
+          209,
+          169,
+          236,
+          182
+      ],
+      "label": "answer",
+      "words": [
+          {
+              "box": [
+                  209,
+                  169,
+                  236,
+                  182
+              ],
+              "text": "533"
+          }
+      ],
+      "linking": [
+          [
+              0,
+              1
+          ]
+      ]
+    }
+  ]
+}
+```
diff --git a/dataset_zoo/funsd/textdet.py b/dataset_zoo/funsd/textdet.py
@@ -0,0 +1,38 @@
+data_root = 'data/funsd'  # reused by all three dicts below
+cache_path = 'data/cache'  # only the obtainer uses this
+
+data_obtainer = dict(  # download source and file layout
+    type='NaiveDataObtainer',
+    cache_path=cache_path,
+    data_root=data_root,
+    files=[
+        dict(
+            url='https://guillaumejaume.github.io/FUNSD/dataset.zip',
+            save_name='funsd.zip',
+            md5='e05de47de238aa343bf55d8807d659a9',
+            split=['train', 'test'],
+            content=['image', 'annotation'],
+            mapping=[  # [source, target] pairs; presumably moved after unzip
+                ['funsd/dataset/training_data/images', 'textdet_imgs/train'],
+                ['funsd/dataset/testing_data/images', 'textdet_imgs/test'],
+                [
+                    'funsd/dataset/training_data/annotations',
+                    'annotations/train'
+                ],
+                ['funsd/dataset/testing_data/annotations', 'annotations/test'],
+            ]),
+    ])
+
+data_converter = dict(  # conversion step: gather pairs, parse, dump
+    type='TextDetDataConverter',
+    splits=['train', 'test'],
+    data_root=data_root,
+    gatherer=dict(
+        type='pair_gather',
+        suffixes=['.png'],
+        rule=[r'(\w+)\.png', r'\1.json']),  # maps X.png -> X.json
+    parser=dict(type='FUNSDTextDetAnnParser'),
+    dumper=dict(type='JsonDumper'),
+    delete=['annotations', 'funsd'])  # presumably cleaned up afterwards
+
+config_generator = dict(type='TextDetConfigGenerator', data_root=data_root)
diff --git a/dataset_zoo/funsd/textrecog.py b/dataset_zoo/funsd/textrecog.py
@@ -0,0 +1,5 @@
+_base_ = ['textdet.py']  # base config: textdet.py in this directory
+
+data_converter = dict(type='TextRecogCropConverter')  # only `type` is set
+
+config_generator = dict(type='TextRecogConfigGenerator')
diff --git a/dataset_zoo/funsd/textspotting.py b/dataset_zoo/funsd/textspotting.py
@@ -0,0 +1,5 @@
+_base_ = ['textdet.py']  # base config: textdet.py in this directory
+
+data_converter = dict(type='TextSpottingDataConverter')  # only `type` is set
+
+config_generator = dict(type='TextSpottingConfigGenerator')
diff --git a/mmocr/datasets/preparers/parsers/__init__.py b/mmocr/datasets/preparers/parsers/__init__.py
@@ -1,5 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from .coco_parser import COCOTextDetAnnParser
+from .funsd_parser import FUNSDTextDetAnnParser
 from .icdar_txt_parser import (ICDARTxtTextDetAnnParser,
                                ICDARTxtTextRecogAnnParser)
 from .svt_parser import SVTTextDetAnnParser
@@ -9,5 +10,5 @@
 __all__ = [
     'ICDARTxtTextDetAnnParser', 'ICDARTxtTextRecogAnnParser',
     'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser',
-    'COCOTextDetAnnParser', 'SVTTextDetAnnParser'
+    'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser'
 ]
diff --git a/mmocr/datasets/preparers/parsers/funsd_parser.py b/mmocr/datasets/preparers/parsers/funsd_parser.py
@@ -0,0 +1,40 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+from typing import Tuple
+
+from mmocr.utils import bbox2poly
+from ..data_preparer import DATA_PARSERS
+from .base import BaseParser
+
+
+@DATA_PARSERS.register_module()
+class FUNSDTextDetAnnParser(BaseParser):
+    """FUNSD Text Detection Annotation Parser. See
+    dataset_zoo/funsd/sample_anno.md for annotation example.
+
+    Args:
+        nproc (int): The number of processes to parse the annotation. Defaults
+            to 1.
+    """
+
+    def __init__(self, nproc: int = 1) -> None:
+        super().__init__(nproc=nproc)
+
+    def parse_file(self, file: Tuple, split: str) -> Tuple:
+        """Parse one (image path, JSON path) pair into (image, instances)."""
+        img_file, json_file = file
+        instances = [
+            dict(poly=poly, text=text, ignore=ignore)
+            for poly, text, ignore in self.loader(json_file)
+        ]
+        return img_file, instances
+
+    def loader(self, file_path: str):
+        """Yield (poly, text, ignore) per word; ignore marks empty text."""
+        # Decode as UTF-8 explicitly (not the locale default); close early.
+        with open(file_path, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+        for form in data['form']:
+            for word in form['words']:
+                text = word['text']
+                yield bbox2poly(word['box']).tolist(), text, len(text) == 0