-
Notifications
You must be signed in to change notification settings - Fork 5
/
ocrd_eval.schema.yml
217 lines (182 loc) · 6.67 KB
/
ocrd_eval.schema.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
# Root of the schema: a JSON array in which every item is a LabeledUrl that
# additionally carries `metadata` and `evaluation_results`.
$schema: https://json-schema.org/draft/2019-09/schema
$id: https://ocr-d.de/en/spec/ocrd_eval.schema.json
title: A List of Evaluations for OCR-D
description: >
  - All references to URL are JSON-LD-like objects with at least an `@id`
    property referencing the URL and `label` for a human-readable label to be
    used in the UI.
type: array
items:
  required:
    - '@id'
    - label
    - metadata
    - evaluation_results
  # Keys not declared by any subschema in `allOf` below are rejected.
  unevaluatedProperties: false
  allOf:
    - $ref: '#/$defs/LabeledUrl'
    - properties:
        metadata:
          $ref: '#/$defs/EvaluationMetadata'
        evaluation_results:
          $ref: '#/$defs/EvaluationReport'

# Reusable definitions
$defs:
# LabeledUrl: JSON-LD-like reference object, addressed elsewhere in this
# schema as '#/$defs/LabeledUrl'. Only `@id` is mandatory at this level;
# `label` is declared but optional here.
LabeledUrl:
  type: object
  required:
    - '@id'
  properties:
    '@id':
      description: URL of the thing
      type: string
      format: uri
    label:
      description: Description of the thing for UI purposes
      type: string
# EvaluationMetadata: describes how one EvaluationReport was produced, i.e.
# which workflows and workspaces were involved and what document was evaluated.
# Addressed elsewhere in this schema as '#/$defs/EvaluationMetadata'.
EvaluationMetadata:
  type: object
  title: Metadata about one evaluation
  # Closed object: keys not listed under `properties` are rejected.
  additionalProperties: false
  description: >
    EvaluationMetadata contains all the info on how an EvaluationReport came to be.

    There are two OCR-D *workflows* involved:
      - ocr_workflow: The workflow which produced the OCR results to evaluate
      - eval_workflow: The workflow run to evaluate OCR and GT

    There are three OCR-D *workspaces* involved:
      - gt_workspace: The workspace containing the GT
      - ocr_workspace: The workspace containing the OCR results from ocr_workflow
      - eval_workspace: The workspace on which the eval_workflow was run
  required:
    - ocr_workflow
    - ocr_workspace
    - eval_workflow
    - eval_workspace
    - gt_workspace
    - document_metadata
  properties:
    # The five workflow/workspace references are all LabeledUrl objects.
    ocr_workflow:
      allOf:
        - $ref: '#/$defs/LabeledUrl'
      description: The OCR-D workflow that produced the ocr_workspace
    ocr_workspace:
      allOf:
        - $ref: '#/$defs/LabeledUrl'
      description: The workspace containing the OCR
    eval_workflow:
      allOf:
        - $ref: '#/$defs/LabeledUrl'
      description: The OCR-D workflow that produced the eval_workspace
    eval_workspace:
      allOf:
        - $ref: '#/$defs/LabeledUrl'
      description: The workspace containing the evaluation results
    gt_workspace:
      allOf:
        - $ref: '#/$defs/LabeledUrl'
      description: The workspace containing the GT
    workflow_steps:
      type: array
      description: >-
        Human readable description of the individual steps and their
        parameters in the workflow (for UI)
      minItems: 1
      items:
        type: object
        properties:
          id:
            type: string
            description: The name of the processor used for this workflow step
            # Anchored at the start only: any string beginning with `ocrd-`
            # followed by at least one lowercase letter or hyphen matches.
            pattern: '^ocrd-[a-z\-]+'
          params:
            type: object
            description: >-
              A map of parameters and their values applied to the processor
              used for this workflow step
        required:
          - id
          - params
    workflow_model:
      type: string
      description: >-
        Human readable name of the main model used for recognition in the OCR
        workflow (for UI)
    eval_tool:
      type: string
      description: Human readable name and version of evaluation tool used (for UI)
    document_metadata:
      type: object
      title: Bibliographical and typographical metadata about the work to be evaluated
      properties:
        publication_year:
          type: number
          description: Year the document was originally published
        publication_century:
          type: string
          description: Century the document was originally published
          # JSON Schema patterns are not implicitly anchored, so `^`/`$` are
          # needed to require that the whole value is a `YYYY-YYYY` range.
          pattern: '^[12][0-9]{3}-[12][0-9]{3}$'
        publication_decade:
          type: string
          description: Decade the document was originally published
          # Anchored for the same reason; both years must end in 0.
          pattern: '^[12][0-9]{2}0-[12][0-9]{2}0$'
        number_of_pages:
          type: number
          description: >-
            Number of pages in this work (i.e. the number of images in the
            gt_workspace)
        layout:
          type: string
          enum:
            - simple
            - complex
        fonts:
          type: array
          items:
            type: string
            enum:
              - antiqua
              - textura
              - gotico-antiqua
              - rotunda
              - italic
              - bastarda
              - greek
              - schwabacher
              - hebrew
              - fraktur
    # NOTE(review): nesting level reconstructed; `provenance` is assumed to be
    # a sibling of `document_metadata`, not one of its properties -- confirm.
    provenance:
      type: object
      description: Information on which tools in which version were used in determining metrics
      properties:
        parameters:
          type: object
          description: Parameters passed to the evaluation processor
# EvaluationReport: the measured metrics, once for the whole document
# (`document_wide`) and once per page (`by_page`).
# Addressed elsewhere in this schema as '#/$defs/EvaluationReport'.
EvaluationReport:
  type: object
  # Closed object: only `document_wide` and `by_page` are accepted.
  additionalProperties: false
  description: The metrics measured for this document
  properties:
    document_wide:
      type: object
      description: Document-wide metrics
      # The fragment must be a JSON Pointer, i.e. start with `/` after `#`;
      # without the slash the reference does not resolve to `$defs`.
      allOf:
        - $ref: '#/$defs/DocumentEvaluationMetrics'
        - $ref: '#/$defs/CommonEvaluationMetrics'
      # Rejects keys declared by neither referenced schema.
      unevaluatedProperties: false
    by_page:
      type: array
      description: Metrics page-by-page
      items:
        type: object
        allOf:
          - $ref: '#/$defs/CommonEvaluationMetrics'
          - $ref: '#/$defs/PageId'
        # Rejects keys declared by neither referenced schema.
        unevaluatedProperties: false
# PageId: contributes the `page_id` property; the `by_page` items of
# EvaluationReport combine it with CommonEvaluationMetrics.
PageId:
  type: object
  properties:
    page_id:
      description: PAGE ID
      type: string
# CommonEvaluationMetrics: metrics referenced by both the `document_wide`
# object and each `by_page` item of EvaluationReport.
CommonEvaluationMetrics:
  type: object
  properties:
    cer_mean:
      type: number
      description: >-
        Arithmetic mean of the page-wise CER (in document_wide) or regions on
        a page (in by_page)
    wer:
      type: number
      # The description previously said "CER", although the property is `wer`.
      description: >-
        WER calculated over the text of a whole page (in by_page) or combined
        text of all pages (in document_wide)
# DocumentEvaluationMetrics: metrics referenced only by the `document_wide`
# object of EvaluationReport (CER statistics plus timing/throughput figures).
DocumentEvaluationMetrics:
  type: object
  properties:
    cer_median:
      type: number
      description: >-
        Median of the page-wise CER (in document_wide) or regions on a page
        (in by_page)
    cer_range:
      # Exactly two numbers: [minimum, maximum].
      type: array
      minItems: 2
      maxItems: 2
      items:
        type: number
      description: >-
        Minimum and maximum of CER calculated over the text of a whole page
        (in by_page) or combined text of all pages (in document_wide)
    cer_standard_deviation:
      type: number
      description: >-
        Standard deviation of the page-wise CER (in document_wide) or regions
        on a page (in by_page)
    # NOTE(review): the schema does not state the unit of the two time
    # values below -- confirm with the producer of these reports.
    wall_time:
      type: number
      description: Actual time needed for processing workflow
    cpu_time:
      type: number
      description: Cumulative CPU time used for processing workflow
    pages_per_minute:
      type: number
      description: Number of pages processed per minute