Skip to content

Commit

Permalink
Add validator for classification & detection tasks (cvat-ai#160)
Browse files Browse the repository at this point in the history
* Implemented validator (classification & detection), CLI, and unit tests.

* Modified validation error messages, reworked validation reports to use datumaro.components.error, simplified unittest

* Fixed voc parsing

* Removed 'x' and 'y' stats from being computed. Extended valid attributes to include shared attributes.

* Added item subset to validation reports.

Co-authored-by: Maxim Zhiltsov <[email protected]>
  • Loading branch information
seongjun-park-dl and Maxim Zhiltsov authored Mar 19, 2021
1 parent 053e9cd commit cdd5184
Show file tree
Hide file tree
Showing 9 changed files with 1,723 additions and 5 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]
### Added
-
- Dataset validation for classification and detection datasets (<https://github.com/openvinotoolkit/datumaro/pull/160>)

### Changed
-
Expand Down
1 change: 1 addition & 0 deletions datumaro/cli/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def make_parser():
('stats', commands.stats, "Compute project statistics"),
('info', commands.info, "Print project info"),
('explain', commands.explain, "Run Explainable AI algorithm for model"),
('validate', commands.validate, "Validate project")
]

# Argparse doesn't support subparser groups:
Expand Down
2 changes: 1 addition & 1 deletion datumaro/cli/commands/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@
explain,
export, merge, convert, transform, filter,
diff, ediff, stats,
info
info, validate
)
7 changes: 7 additions & 0 deletions datumaro/cli/commands/validate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Copyright (C) 2020-2021 Intel Corporation
#
# SPDX-License-Identifier: MIT

# pylint: disable=unused-import

from ..contexts.project import build_validate_parser as build_parser
41 changes: 41 additions & 0 deletions datumaro/cli/contexts/project/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from datumaro.components.project import \
PROJECT_DEFAULT_CONFIG as DEFAULT_CONFIG
from datumaro.components.project import Environment, Project
from datumaro.components.validator import validate_annotations
from datumaro.util import error_rollback

from ...util import (CliException, MultilineFormatter, add_subparser,
Expand Down Expand Up @@ -791,6 +792,45 @@ def print_extractor_info(extractor, indent=''):

return 0

def build_validate_parser(parser_ctor=argparse.ArgumentParser):
    """Create the argument parser for the 'validate' CLI command."""
    parser = parser_ctor(help="Validate project",
        description="""
        Validates project based on specified task type and stores
        results like statistics, reports and summary in JSON file.
        """,
        formatter_class=MultilineFormatter)

    # The set of task types the validator currently supports
    task_types = Enum('TaskType', ['classification', 'detection'])

    parser.add_argument('task_type',
        choices=[t.name for t in task_types],
        help="Task type for validation")
    parser.add_argument('-s', '--subset', dest='subset_name', default=None,
        help="Subset to validate (default: None)")
    parser.add_argument('-p', '--project', dest='project_dir', default='.',
        help="Directory of the project to validate (default: current dir)")
    parser.set_defaults(command=validate_command)

    return parser

def validate_command(args):
    """Run dataset validation for a project and save the report as JSON.

    Loads the project from args.project_dir, optionally restricts it to
    args.subset_name, validates annotations for args.task_type, and writes
    the results to a freshly generated 'validation_results*.json' file.

    Returns 0 on success, consistent with the other CLI command handlers.
    """
    project = load_project(args.project_dir)
    task_type = args.task_type
    subset_name = args.subset_name
    dst_file_name = 'validation_results'

    dataset = project.make_dataset()
    if subset_name is not None:
        dataset = dataset.get_subset(subset_name)
        dst_file_name += f'-{subset_name}'
    validation_results = validate_annotations(dataset, task_type)

    dst_file = generate_next_file_name(dst_file_name, ext='.json')
    # Lazy %-args: the message is only formatted if this log level is enabled
    log.info("Writing project validation results to '%s'", dst_file)
    with open(dst_file, 'w') as f:
        json.dump(validation_results, f, indent=4, sort_keys=True)

    return 0

def build_parser(parser_ctor=argparse.ArgumentParser):
parser = parser_ctor(
Expand All @@ -814,5 +854,6 @@ def build_parser(parser_ctor=argparse.ArgumentParser):
add_subparser(subparsers, 'transform', build_transform_parser)
add_subparser(subparsers, 'info', build_info_parser)
add_subparser(subparsers, 'stats', build_stats_parser)
add_subparser(subparsers, 'validate', build_validate_parser)

return parser
219 changes: 217 additions & 2 deletions datumaro/components/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class DatasetError(DatumaroError):
@attrs
class RepeatedItemError(DatasetError):
def __str__(self):
return "Item %s is repeated in the source sequence." % (self.item_id)
return "Item %s is repeated in the source sequence." % (self.item_id, )

@attrs
class MismatchingImageInfoError(DatasetError):
Expand Down Expand Up @@ -89,4 +89,219 @@ class FailedAttrVotingError(MergeError):
def __str__(self):
return "Item %s: attribute voting failed " \
"for ann %s, votes %s, sources %s" % \
(self.item_id, self.ann, self.votes, self.sources)
(self.item_id, self.ann, self.votes, self.sources)

@attrs
class DatasetValidationError(DatumaroError):
    """Base class for dataset-level validation anomalies."""

    severity = attrib()  # Severity enum member describing how serious this is

    def to_dict(self):
        """Serialize this anomaly as a plain dict for the JSON report."""
        return dict(
            anomaly_type=type(self).__name__,
            description=str(self),
            severity=self.severity.name,
        )

@attrs
class DatasetItemValidationError(DatasetValidationError):
    """Base class for validation anomalies tied to a specific dataset item."""

    item_id = attrib()  # id of the offending item
    subset = attrib()   # subset the item belongs to

    def to_dict(self):
        """Serialize, extending the base dict with the item's identity."""
        dict_repr = super().to_dict()
        dict_repr.update(item_id=self.item_id, subset=self.subset)
        return dict_repr

@attrs
class MissingLabelCategories(DatasetValidationError):
    """Reported when the dataset lacks label metadata to validate against."""

    def __str__(self):
        return ("Metadata (ex. LabelCategories) should be defined"
                " to validate a dataset.")

@attrs
class MissingLabelAnnotation(DatasetItemValidationError):
    """Reported when an item carries no label annotation at all."""

    def __str__(self):
        return 'Item needs a label, but not found.'

@attrs
class MultiLabelAnnotations(DatasetItemValidationError):
    """Reported when a single-label item has more than one label."""

    def __str__(self):
        return "Item needs a single label but multiple labels are found."

@attrs
class MissingAttribute(DatasetItemValidationError):
    """Reported when an item lacks an attribute expected for its label."""

    label_name = attrib()  # label whose attribute is missing
    attr_name = attrib()   # name of the missing attribute

    def __str__(self):
        return (f"Item needs the attribute '{self.attr_name}' "
                f"for the label '{self.label_name}'.")

@attrs
class UndefinedLabel(DatasetItemValidationError):
    """Reported when an item uses a label absent from the metadata."""

    label_name = attrib()  # the unknown label

    def __str__(self):
        return (f"Item has the label '{self.label_name}' which "
                "is not defined in metadata.")

@attrs
class UndefinedAttribute(DatasetItemValidationError):
    """Reported when an item uses an attribute absent from the metadata."""

    label_name = attrib()  # label the attribute was attached to
    attr_name = attrib()   # the unknown attribute

    def __str__(self):
        return (f"Item has the attribute '{self.attr_name}' for the "
                f"label '{self.label_name}' which is not defined in metadata.")

@attrs
class LabelDefinedButNotFound(DatasetValidationError):
    """Reported when a metadata label never occurs in the dataset."""

    label_name = attrib()  # the unused label

    def __str__(self):
        return (f"The label '{self.label_name}' is defined in "
                "metadata, but not found in the dataset.")

@attrs
class AttributeDefinedButNotFound(DatasetValidationError):
    """Reported when a metadata attribute never occurs in the dataset."""

    label_name = attrib()  # label the attribute is declared for
    attr_name = attrib()   # the unused attribute

    def __str__(self):
        return (f"The attribute '{self.attr_name}' for the label "
                f"'{self.label_name}' is defined in metadata, but not "
                "found in the dataset.")

@attrs
class OnlyOneLabel(DatasetValidationError):
    """Reported when the whole dataset contains a single label."""

    label_name = attrib()  # the sole label

    def __str__(self):
        return f"The dataset has only one label '{self.label_name}'."

@attrs
class OnlyOneAttributeValue(DatasetValidationError):
    """Reported when an attribute takes a single value across the dataset."""

    label_name = attrib()  # label the attribute belongs to
    attr_name = attrib()   # the attribute in question
    value = attrib()       # its only observed value

    def __str__(self):
        return ("The dataset has the only attribute value "
                f"'{self.value}' for the attribute '{self.attr_name}' for the "
                f"label '{self.label_name}'.")

@attrs
class FewSamplesInLabel(DatasetValidationError):
    """Reported when a label has suspiciously few samples."""

    label_name = attrib()  # label with a low sample count
    count = attrib()       # number of samples found

    def __str__(self):
        return (f"The number of samples in the label '{self.label_name}'"
                f" might be too low. Found '{self.count}' samples.")

@attrs
class FewSamplesInAttribute(DatasetValidationError):
    """Reported when an attribute value has suspiciously few samples."""

    label_name = attrib()  # label the attribute belongs to
    attr_name = attrib()   # the attribute in question
    attr_value = attrib()  # the sparse attribute value
    count = attrib()       # number of samples found

    def __str__(self):
        return ("The number of samples for attribute = value "
                f"'{self.attr_name} = {self.attr_value}' for the label "
                f"'{self.label_name}' might be too low. "
                f"Found '{self.count}' samples.")

@attrs
class ImbalancedLabels(DatasetValidationError):
    """Reported when label frequencies are strongly imbalanced."""

    def __str__(self):
        return "There is an imbalance in the label distribution."

@attrs
class ImbalancedAttribute(DatasetValidationError):
    """Reported when attribute-value frequencies are strongly imbalanced."""

    label_name = attrib()  # label the attribute belongs to
    attr_name = attrib()   # the imbalanced attribute

    def __str__(self):
        # NOTE: original had 'self. attr_name' (stray space); the rendered
        # message is identical with the normalized expression below.
        return ("There is an imbalance in the distribution of attribute"
                f" '{self.attr_name}' for the label '{self.label_name}'.")

@attrs
class ImbalancedBboxDistInLabel(DatasetValidationError):
    """Reported when a bbox property is unevenly distributed for a label."""

    label_name = attrib()  # label whose boxes were analyzed
    prop = attrib()        # bbox property name (e.g. width, height)

    def __str__(self):
        return (f"Values of bbox '{self.prop}' are not evenly "
                f"distributed for '{self.label_name}' label.")

@attrs
class ImbalancedBboxDistInAttribute(DatasetValidationError):
    """Reported when a bbox property is unevenly distributed for an
    attribute value of a label."""

    label_name = attrib()  # label the attribute belongs to
    attr_name = attrib()   # attribute name
    attr_value = attrib()  # attribute value the boxes were grouped by
    prop = attrib()        # bbox property name

    def __str__(self):
        return (f"Values of bbox '{self.prop}' are not evenly "
                f"distributed for '{self.attr_name}' = '{self.attr_value}' for "
                f"the '{self.label_name}' label.")

@attrs
class MissingBboxAnnotation(DatasetItemValidationError):
    """Reported when a detection item has no bounding boxes."""

    def __str__(self):
        return ("Item needs one or more bounding box annotations, "
                "but not found.")

@attrs
class NegativeLength(DatasetItemValidationError):
    """Reported when a bbox dimension is zero or negative."""

    ann_id = attrib()  # id of the offending annotation
    prop = attrib()    # dimension name (e.g. width, height)
    val = attrib()     # the non-positive value found

    def __str__(self):
        return (f"Bounding box annotation '{self.ann_id}' in "
                "the item should have a positive value of "
                f"'{self.prop}' but got '{self.val}'.")

@attrs
class InvalidValue(DatasetItemValidationError):
    """Reported when a bbox property is inf or NaN."""

    ann_id = attrib()  # id of the offending annotation
    prop = attrib()    # bbox property name

    def __str__(self):
        return (f"Bounding box annotation '{self.ann_id}' in "
                "the item has an inf or a NaN value of "
                f"bounding box '{self.prop}'.")

@attrs
class FarFromLabelMean(DatasetItemValidationError):
    """Reported when a bbox property is an outlier w.r.t. its label mean."""

    label_name = attrib()  # label whose mean was used
    ann_id = attrib()      # id of the offending annotation
    prop = attrib()        # bbox property name
    mean = attrib()        # per-label mean of the property
    val = attrib()         # the outlying value

    def __str__(self):
        return (f"Bounding box annotation '{self.ann_id}' in "
                f"the item has a value of bounding box '{self.prop}' that "
                "is too far from the label average. (mean of "
                f"'{self.label_name}' label: {self.mean}, got '{self.val}').")

@attrs
class FarFromAttrMean(DatasetItemValidationError):
    """Reported when a bbox property is an outlier w.r.t. the mean of an
    attribute-value group within a label."""

    label_name = attrib()  # label the attribute belongs to
    ann_id = attrib()      # id of the offending annotation
    attr_name = attrib()   # attribute name
    attr_value = attrib()  # attribute value defining the group
    prop = attrib()        # bbox property name
    mean = attrib()        # group mean of the property
    val = attrib()         # the outlying value

    def __str__(self):
        return (f"Bounding box annotation '{self.ann_id}' in the "
                f"item has a value of bounding box '{self.prop}' that "
                "is too far from the attribute average. (mean of "
                f"'{self.attr_name}' = '{self.attr_value}' for the "
                f"'{self.label_name}' label: {self.mean}, got '{self.val}').")
Loading

0 comments on commit cdd5184

Please sign in to comment.