Skip to content

Commit

Permalink
[Feature] Add Funsd to dataset preparer (#1550)
Browse files Browse the repository at this point in the history
* add funsd

* done

* done

Co-authored-by: gaotongxiao <[email protected]>
  • Loading branch information
xinke-wang and gaotongxiao authored Dec 20, 2022
1 parent 4396e8f commit fb78c94
Show file tree
Hide file tree
Showing 7 changed files with 190 additions and 1 deletion.
27 changes: 27 additions & 0 deletions dataset_zoo/funsd/metafile.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
Name: 'FUNSD'
Paper:
Title: 'FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents'
URL: https://arxiv.org/pdf/1905.13538.pdf
Venue: ICDAR
Year: '2019'
BibTeX: '@inproceedings{jaume2019,
title = {FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents},
author = {Guillaume Jaume and Hazim Kemal Ekenel and Jean-Philippe Thiran},
booktitle = {Accepted to ICDAR-OST},
year = {2019}}'
Data:
Website: https://guillaumejaume.github.io/FUNSD/
Language:
- English
Scene:
- Document
Granularity:
- Word
Tasks:
- textdet
- textrecog
- textspotting
License:
Type: FUNSD License
Link: https://guillaumejaume.github.io/FUNSD/work/
Format: .json
73 changes: 73 additions & 0 deletions dataset_zoo/funsd/sample_anno.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
**Text Detection/Recognition/Spotting**

```json
{
"form": [
{
"id": 0,
"text": "Registration No.",
"box": [
94,
169,
191,
186
],
"linking": [
[
0,
1
]
],
"label": "question",
"words": [
{
"text": "Registration",
"box": [
94,
169,
168,
186
]
},
{
"text": "No.",
"box": [
170,
169,
191,
183
]
}
]
},
{
"id": 1,
"text": "533",
"box": [
209,
169,
236,
182
],
"label": "answer",
"words": [
{
"box": [
209,
169,
236,
182
],
"text": "533"
}
],
"linking": [
[
0,
1
]
]
}
]
}
```
38 changes: 38 additions & 0 deletions dataset_zoo/funsd/textdet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Dataset preparer config for FUNSD text detection.
# NOTE: every top-level name in this file is a config entry, so no helper
# variables are introduced beyond the ones already defined here.
data_root = 'data/funsd'
cache_path = 'data/cache'

# Download step: a single zip archive that provides both images and
# annotations for the train and test splits.
data_obtainer = {
    'type': 'NaiveDataObtainer',
    'cache_path': cache_path,
    'data_root': data_root,
    'files': [
        {
            'url': 'https://guillaumejaume.github.io/FUNSD/dataset.zip',
            'save_name': 'funsd.zip',
            'md5': 'e05de47de238aa343bf55d8807d659a9',
            'split': ['train', 'test'],
            'content': ['image', 'annotation'],
            # Each entry is a two-element list of paths; presumably
            # [path inside the extracted archive, destination relative to
            # data_root] -- confirm against NaiveDataObtainer.
            'mapping': [
                ['funsd/dataset/training_data/images', 'textdet_imgs/train'],
                ['funsd/dataset/testing_data/images', 'textdet_imgs/test'],
                [
                    'funsd/dataset/training_data/annotations',
                    'annotations/train',
                ],
                [
                    'funsd/dataset/testing_data/annotations',
                    'annotations/test',
                ],
            ],
        },
    ],
}

# Conversion step: the gatherer rule maps `<name>.png` to `<name>.json`,
# the FUNSD parser reads the annotations, and the result is dumped as JSON.
data_converter = {
    'type': 'TextDetDataConverter',
    'splits': ['train', 'test'],
    'data_root': data_root,
    'gatherer': {
        'type': 'pair_gather',
        'suffixes': ['.png'],
        'rule': [r'(\w+)\.png', r'\1.json'],
    },
    'parser': {'type': 'FUNSDTextDetAnnParser'},
    'dumper': {'type': 'JsonDumper'},
    # Presumably paths removed after conversion -- confirm against the
    # converter implementation.
    'delete': ['annotations', 'funsd'],
}

config_generator = {'type': 'TextDetConfigGenerator', 'data_root': data_root}
5 changes: 5 additions & 0 deletions dataset_zoo/funsd/textrecog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Text recognition config for FUNSD: inherits everything from the text
# detection config and overrides only the converter and generator types.
_base_ = ['textdet.py']

data_converter = {'type': 'TextRecogCropConverter'}

config_generator = {'type': 'TextRecogConfigGenerator'}
5 changes: 5 additions & 0 deletions dataset_zoo/funsd/textspotting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Text spotting config for FUNSD: inherits everything from the text
# detection config and overrides only the converter and generator types.
_base_ = ['textdet.py']

data_converter = {'type': 'TextSpottingDataConverter'}

config_generator = {'type': 'TextSpottingConfigGenerator'}
3 changes: 2 additions & 1 deletion mmocr/datasets/preparers/parsers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .coco_parser import COCOTextDetAnnParser
from .funsd_parser import FUNSDTextDetAnnParser
from .icdar_txt_parser import (ICDARTxtTextDetAnnParser,
ICDARTxtTextRecogAnnParser)
from .svt_parser import SVTTextDetAnnParser
Expand All @@ -9,5 +10,5 @@
__all__ = [
'ICDARTxtTextDetAnnParser', 'ICDARTxtTextRecogAnnParser',
'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser',
'COCOTextDetAnnParser', 'SVTTextDetAnnParser'
'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser'
]
40 changes: 40 additions & 0 deletions mmocr/datasets/preparers/parsers/funsd_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Copyright (c) OpenMMLab. All rights reserved.
import json
from typing import Tuple

from mmocr.utils import bbox2poly
from ..data_preparer import DATA_PARSERS
from .base import BaseParser


@DATA_PARSERS.register_module()
class FUNSDTextDetAnnParser(BaseParser):
    """FUNSD Text Detection Annotation Parser.

    See dataset_zoo/funsd/sample_anno.md for annotation example.

    Args:
        nproc (int): The number of processes to parse the annotation. Defaults
            to 1.
    """

    def __init__(self, nproc: int = 1) -> None:
        super().__init__(nproc=nproc)

    def parse_file(self, file: Tuple, split: str) -> Tuple:
        """Parse single annotation.

        Args:
            file (Tuple): A pair ``(img_file, json_file)``.
            split (str): Dataset split. Not used by this parser.

        Returns:
            Tuple: ``(img_file, instances)``, where ``instances`` is a list
            of dicts with the keys ``poly``, ``text`` and ``ignore``, one per
            word yielded by :meth:`loader`.
        """
        img_file, json_file = file
        instances = [
            dict(poly=poly, text=text, ignore=ignore)
            for poly, text, ignore in self.loader(json_file)
        ]

        return img_file, instances

    def loader(self, file_path: str):
        """Iterate over the word-level annotations of one FUNSD JSON file.

        Args:
            file_path (str): Path to the annotation JSON file.

        Yields:
            tuple: ``(poly, text, ignore)`` for every entry of every form's
            ``words`` list. ``poly`` is ``bbox2poly(word['box'])`` converted
            to a plain list, ``text`` is the word's transcription and
            ``ignore`` is True when the transcription is empty.
        """
        # Read with an explicit encoding: JSON is UTF-8, and relying on the
        # platform default breaks on non-UTF-8 locales (e.g. Windows).
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # The file is fully loaded and closed before anything is yielded, so
        # a partially consumed generator does not keep the handle open.
        for form in data['form']:
            for word in form['words']:
                poly = bbox2poly(word['box']).tolist()
                text = word['text']
                ignore = len(text) == 0
                yield poly, text, ignore

0 comments on commit fb78c94

Please sign in to comment.