Skip to content

Commit

Permalink
Add validator for classification & detection tasks (cvat-ai#160)
Browse files Browse the repository at this point in the history
* Implemented validator (classification & detection), CLI, and unit tests.

* Modified validation error messages, reworked validation reports to use datumaro.components.error, simplified unittest

* Fixed voc parsing

* Removed 'x' and 'y' stats from being computed. Extended valid attributes to include shared attributes.

* Added item subset to validation reports.

Co-authored-by: Maxim Zhiltsov <[email protected]>
  • Loading branch information
seongjun-park-dl and Maxim Zhiltsov authored Mar 19, 2021
1 parent 053e9cd commit cdd5184
Show file tree
Hide file tree
Showing 9 changed files with 1,723 additions and 5 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]
### Added
-
- Dataset validation for classification and detection datasets (<https://github.com/openvinotoolkit/datumaro/pull/160>)

### Changed
-
Expand Down
1 change: 1 addition & 0 deletions datumaro/cli/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def make_parser():
('stats', commands.stats, "Compute project statistics"),
('info', commands.info, "Print project info"),
('explain', commands.explain, "Run Explainable AI algorithm for model"),
('validate', commands.validate, "Validate project")
]

# Argparse doesn't support subparser groups:
Expand Down
2 changes: 1 addition & 1 deletion datumaro/cli/commands/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@
explain,
export, merge, convert, transform, filter,
diff, ediff, stats,
info
info, validate
)
7 changes: 7 additions & 0 deletions datumaro/cli/commands/validate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Copyright (C) 2020-2021 Intel Corporation
#
# SPDX-License-Identifier: MIT

# pylint: disable=unused-import

from ..contexts.project import build_validate_parser as build_parser
41 changes: 41 additions & 0 deletions datumaro/cli/contexts/project/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from datumaro.components.project import \
PROJECT_DEFAULT_CONFIG as DEFAULT_CONFIG
from datumaro.components.project import Environment, Project
from datumaro.components.validator import validate_annotations
from datumaro.util import error_rollback

from ...util import (CliException, MultilineFormatter, add_subparser,
Expand Down Expand Up @@ -791,6 +792,45 @@ def print_extractor_info(extractor, indent=''):

return 0

def build_validate_parser(parser_ctor=argparse.ArgumentParser):
    """Create the argument parser for the 'validate' CLI command."""
    parser = parser_ctor(help="Validate project",
        description="""
        Validates project based on specified task type and stores
        results like statistics, reports and summary in JSON file.
        """,
        formatter_class=MultilineFormatter)

    # The set of task types the validator currently supports
    task_types = Enum('TaskType', ['classification', 'detection'])

    parser.add_argument('task_type',
        choices=[t.name for t in task_types],
        help="Task type for validation")
    parser.add_argument('-s', '--subset', dest='subset_name', default=None,
        help="Subset to validate (default: None)")
    parser.add_argument('-p', '--project', dest='project_dir', default='.',
        help="Directory of the project to validate (default: current dir)")
    parser.set_defaults(command=validate_command)

    return parser

def validate_command(args):
    """Run dataset validation for a project and save the report as JSON.

    Loads the project from args.project_dir, optionally restricts it to
    args.subset_name, validates annotations for args.task_type, and writes
    the results to a freshly generated 'validation_results*.json' file.

    Returns 0 on success, consistent with the other CLI command handlers.
    """
    project = load_project(args.project_dir)
    task_type = args.task_type
    subset_name = args.subset_name
    dst_file_name = 'validation_results'

    dataset = project.make_dataset()
    if subset_name is not None:
        dataset = dataset.get_subset(subset_name)
        dst_file_name += f'-{subset_name}'
    validation_results = validate_annotations(dataset, task_type)

    dst_file = generate_next_file_name(dst_file_name, ext='.json')
    # Lazy %-args: the message is only formatted if this log level is enabled
    log.info("Writing project validation results to '%s'", dst_file)
    with open(dst_file, 'w') as f:
        json.dump(validation_results, f, indent=4, sort_keys=True)

    return 0

def build_parser(parser_ctor=argparse.ArgumentParser):
parser = parser_ctor(
Expand All @@ -814,5 +854,6 @@ def build_parser(parser_ctor=argparse.ArgumentParser):
add_subparser(subparsers, 'transform', build_transform_parser)
add_subparser(subparsers, 'info', build_info_parser)
add_subparser(subparsers, 'stats', build_stats_parser)
add_subparser(subparsers, 'validate', build_validate_parser)

return parser
219 changes: 217 additions & 2 deletions datumaro/components/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class DatasetError(DatumaroError):
@attrs
class RepeatedItemError(DatasetError):
def __str__(self):
return "Item %s is repeated in the source sequence." % (self.item_id)
return "Item %s is repeated in the source sequence." % (self.item_id, )

@attrs
class MismatchingImageInfoError(DatasetError):
Expand Down Expand Up @@ -89,4 +89,219 @@ class FailedAttrVotingError(MergeError):
def __str__(self):
return "Item %s: attribute voting failed " \
"for ann %s, votes %s, sources %s" % \
(self.item_id, self.ann, self.votes, self.sources)
(self.item_id, self.ann, self.votes, self.sources)

@attrs
class DatasetValidationError(DatumaroError):
    """Base class for dataset-level validation anomalies."""

    severity = attrib()  # Severity enum member describing how serious this is

    def to_dict(self):
        """Serialize this anomaly as a plain dict for the JSON report."""
        return dict(
            anomaly_type=type(self).__name__,
            description=str(self),
            severity=self.severity.name,
        )

@attrs
class DatasetItemValidationError(DatasetValidationError):
    """Base class for validation anomalies tied to a specific dataset item."""

    item_id = attrib()  # id of the offending item
    subset = attrib()   # subset the item belongs to

    def to_dict(self):
        """Serialize, extending the base dict with the item's identity."""
        dict_repr = super().to_dict()
        dict_repr.update(item_id=self.item_id, subset=self.subset)
        return dict_repr

@attrs
class MissingLabelCategories(DatasetValidationError):
    """Reported when the dataset lacks label metadata to validate against."""

    def __str__(self):
        return ("Metadata (ex. LabelCategories) should be defined"
                " to validate a dataset.")

@attrs
class MissingLabelAnnotation(DatasetItemValidationError):
    """Reported when an item carries no label annotation at all."""

    def __str__(self):
        return 'Item needs a label, but not found.'

@attrs
class MultiLabelAnnotations(DatasetItemValidationError):
    """Reported when a single-label item has more than one label."""

    def __str__(self):
        return "Item needs a single label but multiple labels are found."

@attrs
class MissingAttribute(DatasetItemValidationError):
    """Reported when an item lacks an attribute expected for its label."""

    label_name = attrib()  # label whose attribute is missing
    attr_name = attrib()   # name of the missing attribute

    def __str__(self):
        return (f"Item needs the attribute '{self.attr_name}' "
                f"for the label '{self.label_name}'.")

@attrs
class UndefinedLabel(DatasetItemValidationError):
    """Reported when an item uses a label absent from the metadata."""

    label_name = attrib()  # the unknown label

    def __str__(self):
        return (f"Item has the label '{self.label_name}' which "
                "is not defined in metadata.")

@attrs
class UndefinedAttribute(DatasetItemValidationError):
    """Reported when an item uses an attribute absent from the metadata."""

    label_name = attrib()  # label the attribute was attached to
    attr_name = attrib()   # the unknown attribute

    def __str__(self):
        return (f"Item has the attribute '{self.attr_name}' for the "
                f"label '{self.label_name}' which is not defined in metadata.")

@attrs
class LabelDefinedButNotFound(DatasetValidationError):
    """Reported when a metadata label never occurs in the dataset."""

    label_name = attrib()  # the unused label

    def __str__(self):
        return (f"The label '{self.label_name}' is defined in "
                "metadata, but not found in the dataset.")

@attrs
class AttributeDefinedButNotFound(DatasetValidationError):
    """Reported when a metadata attribute never occurs in the dataset."""

    label_name = attrib()  # label the attribute is declared for
    attr_name = attrib()   # the unused attribute

    def __str__(self):
        return (f"The attribute '{self.attr_name}' for the label "
                f"'{self.label_name}' is defined in metadata, but not "
                "found in the dataset.")

@attrs
class OnlyOneLabel(DatasetValidationError):
    """Reported when the whole dataset contains a single label."""

    label_name = attrib()  # the sole label

    def __str__(self):
        return f"The dataset has only one label '{self.label_name}'."

@attrs
class OnlyOneAttributeValue(DatasetValidationError):
    """Reported when an attribute takes a single value across the dataset."""

    label_name = attrib()  # label the attribute belongs to
    attr_name = attrib()   # the attribute in question
    value = attrib()       # its only observed value

    def __str__(self):
        return ("The dataset has the only attribute value "
                f"'{self.value}' for the attribute '{self.attr_name}' for the "
                f"label '{self.label_name}'.")

@attrs
class FewSamplesInLabel(DatasetValidationError):
    """Reported when a label has suspiciously few samples."""

    label_name = attrib()  # label with a low sample count
    count = attrib()       # number of samples found

    def __str__(self):
        return (f"The number of samples in the label '{self.label_name}'"
                f" might be too low. Found '{self.count}' samples.")

@attrs
class FewSamplesInAttribute(DatasetValidationError):
    """Reported when an attribute value has suspiciously few samples."""

    label_name = attrib()  # label the attribute belongs to
    attr_name = attrib()   # the attribute in question
    attr_value = attrib()  # the sparse attribute value
    count = attrib()       # number of samples found

    def __str__(self):
        return ("The number of samples for attribute = value "
                f"'{self.attr_name} = {self.attr_value}' for the label "
                f"'{self.label_name}' might be too low. "
                f"Found '{self.count}' samples.")

@attrs
class ImbalancedLabels(DatasetValidationError):
    """Reported when label frequencies are strongly imbalanced."""

    def __str__(self):
        return "There is an imbalance in the label distribution."

@attrs
class ImbalancedAttribute(DatasetValidationError):
    """Reported when attribute-value frequencies are strongly imbalanced."""

    label_name = attrib()  # label the attribute belongs to
    attr_name = attrib()   # the imbalanced attribute

    def __str__(self):
        # NOTE: original had 'self. attr_name' (stray space); the rendered
        # message is identical with the normalized expression below.
        return ("There is an imbalance in the distribution of attribute"
                f" '{self.attr_name}' for the label '{self.label_name}'.")

@attrs
class ImbalancedBboxDistInLabel(DatasetValidationError):
    """Reported when a bbox property is unevenly distributed for a label."""

    label_name = attrib()  # label whose boxes were analyzed
    prop = attrib()        # bbox property name (e.g. width, height)

    def __str__(self):
        return (f"Values of bbox '{self.prop}' are not evenly "
                f"distributed for '{self.label_name}' label.")

@attrs
class ImbalancedBboxDistInAttribute(DatasetValidationError):
    """Reported when a bbox property is unevenly distributed for an
    attribute value of a label."""

    label_name = attrib()  # label the attribute belongs to
    attr_name = attrib()   # attribute name
    attr_value = attrib()  # attribute value the boxes were grouped by
    prop = attrib()        # bbox property name

    def __str__(self):
        return (f"Values of bbox '{self.prop}' are not evenly "
                f"distributed for '{self.attr_name}' = '{self.attr_value}' for "
                f"the '{self.label_name}' label.")

@attrs
class MissingBboxAnnotation(DatasetItemValidationError):
    """Reported when a detection item has no bounding boxes."""

    def __str__(self):
        return ("Item needs one or more bounding box annotations, "
                "but not found.")

@attrs
class NegativeLength(DatasetItemValidationError):
    """Reported when a bbox dimension is zero or negative."""

    ann_id = attrib()  # id of the offending annotation
    prop = attrib()    # dimension name (e.g. width, height)
    val = attrib()     # the non-positive value found

    def __str__(self):
        return (f"Bounding box annotation '{self.ann_id}' in "
                "the item should have a positive value of "
                f"'{self.prop}' but got '{self.val}'.")

@attrs
class InvalidValue(DatasetItemValidationError):
    """Reported when a bbox property is inf or NaN."""

    ann_id = attrib()  # id of the offending annotation
    prop = attrib()    # bbox property name

    def __str__(self):
        return (f"Bounding box annotation '{self.ann_id}' in "
                "the item has an inf or a NaN value of "
                f"bounding box '{self.prop}'.")

@attrs
class FarFromLabelMean(DatasetItemValidationError):
    """Reported when a bbox property is an outlier w.r.t. its label mean."""

    label_name = attrib()  # label whose mean was used
    ann_id = attrib()      # id of the offending annotation
    prop = attrib()        # bbox property name
    mean = attrib()        # per-label mean of the property
    val = attrib()         # the outlying value

    def __str__(self):
        return (f"Bounding box annotation '{self.ann_id}' in "
                f"the item has a value of bounding box '{self.prop}' that "
                "is too far from the label average. (mean of "
                f"'{self.label_name}' label: {self.mean}, got '{self.val}').")

@attrs
class FarFromAttrMean(DatasetItemValidationError):
    """Reported when a bbox property is an outlier w.r.t. the mean of an
    attribute-value group within a label."""

    label_name = attrib()  # label the attribute belongs to
    ann_id = attrib()      # id of the offending annotation
    attr_name = attrib()   # attribute name
    attr_value = attrib()  # attribute value defining the group
    prop = attrib()        # bbox property name
    mean = attrib()        # group mean of the property
    val = attrib()         # the outlying value

    def __str__(self):
        return (f"Bounding box annotation '{self.ann_id}' in the "
                f"item has a value of bounding box '{self.prop}' that "
                "is too far from the attribute average. (mean of "
                f"'{self.attr_name}' = '{self.attr_value}' for the "
                f"'{self.label_name}' label: {self.mean}, got '{self.val}').")
Loading

0 comments on commit cdd5184

Please sign in to comment.