From d8cb65db8fb3646ea45127560ec3bd8118ac0cb1 Mon Sep 17 00:00:00 2001
From: Jihyeon Yi <jihyeon.yi@intel.com>
Date: Mon, 10 May 2021 16:45:51 +0900
Subject: [PATCH] add documentation for validator (#233)

* add documentation for validator
---
 README.md           |   1 +
 docs/user_manual.md | 150 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 151 insertions(+)
diff --git a/README.md b/README.md
index 2c3a2f70ba28..00697b4bbe23 100644
--- a/README.md
+++ b/README.md
@@ -169,6 +169,7 @@ CVAT annotations                             ---> Publication, statistics etc.
   - Simple checking for errors
   - Comparison with model infernece
   - Merging and comparison of multiple datasets
+  - Annotation validation based on the task type (classification, etc.)
 - Dataset comparison
 - Dataset statistics (image mean and std, annotation statistics)
 - Model integration
diff --git a/docs/user_manual.md b/docs/user_manual.md
index df9bb331242f..cee40321dcb1 100644
--- a/docs/user_manual.md
+++ b/docs/user_manual.md
@@ -20,6 +20,7 @@
   - [Compare projects](#compare-projects)
   - [Obtaining project info](#get-project-info)
   - [Obtaining project statistics](#get-project-statistics)
+  - [Validate project annotations](#validate-project-annotations)
   - [Register model](#register-model)
   - [Run inference](#run-model)
   - [Run inference explanation](#explain-inference)
@@ -878,6 +879,155 @@ datum stats -p test_project
 
 </details>
 
+
+### Validate project annotations
+
+This command inspects annotations with respect to the task type
+and stores the result in a JSON file.
+
+The task types supported are `classification`, `detection`, and `segmentation`.
+
+The validation result contains:
+- annotation statistics based on the task type
+- validation reports, such as
+    - items not having annotations
+    - items having undefined annotations
+    - imbalanced distribution in class/attributes
+    - too small or large values
+- summary
+
+Usage:
+
+``` bash
+datum validate --help
+
+datum validate -p <project dir> <task_type>
+```
+
+Validation Result:
+
+<details>
+
+``` python
+{
+    'statistics': {
+        ## common statistics
+        'label_distribution': {
+            'defined_labels': <dict>,   # <label:str>: <count:int>
+            'undefined_labels': <dict>
+            # <label:str>: {
+            #     'count': <int>,
+            #     'items_with_undefined_label': [<item_key>, ]
+            # }
+        },
+        'attribute_distribution': {
+            'defined_attributes': <dict>,
+            # <label:str>: {
+            #     <attribute:str>: {
+            #         'distribution': {<attr_value:str>: <count:int>, },
+            #         'items_missing_attribute': [<item_key>, ]
+            #     }
+            # }
+            'undefined_attributes': <dict>
+            # <label:str>: {
+            #     <attribute:str>: {
+            #         'distribution': {<attr_value:str>: <count:int>, },
+            #         'items_with_undefined_attr': [<item_key>, ]
+            #     }
+            # }
+        },
+        'total_ann_count': <int>,
+        'items_missing_annotation': <list>, # [<item_key>, ]
+
+        ## statistics for classification task
+        'items_with_multiple_labels': <list>, # [<item_key>, ]
+
+        ## statistics for detection task
+        'items_with_invalid_value': <dict>,
+        # '<item_key>': {<ann_id:int>: [ <property:str>, ], }
+        # - properties: 'x', 'y', 'width', 'height',
+        #               'area(wxh)', 'ratio(w/h)', 'short', 'long'
+        # - 'short' is min(w,h) and 'long' is max(w,h).
+        'items_with_negative_length': <dict>,
+        # '<item_key>': { <ann_id:int>: { <'width'|'height'>: <value>, }, }
+        'bbox_distribution_in_label': <dict>, # <label:str>: <bbox_template>
+        'bbox_distribution_in_attribute': <dict>,
+        # <label:str>: {<attribute:str>: { <attr_value>: <bbox_template>, }, }
+        'bbox_distribution_in_dataset_item': <dict>,
+        # '<item_key>': <bbox count:int>
+
+        ## statistics for segmentation task
+        'items_with_invalid_value': <dict>,
+        # '<item_key>': {<ann_id:int>: [ <property:str>, ], }
+        # - properties: 'area', 'width', 'height'
+        'mask_distribution_in_label': <dict>, # <label:str>: <mask_template>
+        'mask_distribution_in_attribute': <dict>,
+        # <label:str>: {
+        #     <attribute:str>: { <attr_value>: <mask_template>, }
+        # }
+        'mask_distribution_in_dataset_item': <dict>,
+        # '<item_key>': <mask/polygon count: int>
+    },
+    'validation_reports': <list>, # [ <validation_error_format>, ]
+    # validation_error_format = {
+    #     'anomaly_type': <str>,  # see datumaro/components/errors.py
+    #     'description': <str>,   # see datumaro/components/errors.py
+    #     'severity': <str>, # 'warning' or 'error'
+    #     'item_id': <str>,  # optional, when it is related to a DatasetItem
+    #     'subset': <str>,   # optional, when it is related to a DatasetItem
+    # }
+    'summary': {
+        'errors': <count: int>,
+        'warnings': <count: int>
+    }
+}
+
+```
+
+`item_key` is defined as:
+``` python
+item_key = (<DatasetItem.id:str>, <DatasetItem.subset:str>)
+```
+
+`bbox_template` and `mask_template` are defined as:
+
+``` python
+bbox_template = {
+    'width': <numerical_stat_template>,
+    'height': <numerical_stat_template>,
+    'area(wxh)': <numerical_stat_template>,
+    'ratio(w/h)': <numerical_stat_template>,
+    'short': <numerical_stat_template>, # short = min(w, h)
+    'long': <numerical_stat_template>   # long = max(w, h)
+}
+mask_template = {
+    'area': <numerical_stat_template>,
+    'width': <numerical_stat_template>,
+    'height': <numerical_stat_template>
+}
+```
+
+`numerical_stat_template` is defined as:
+
+``` python
+numerical_stat_template = {
+    'items_far_from_mean': <dict>,
+    # {'<item_key>': {<ann_id:int>: <value:float>, }, }
+    'mean': <float>,
+    'stdev': <float>,
+    'min': <float>,
+    'max': <float>,
+    'median': <float>,
+    'histogram': {
+        'bins': <list>,   # [<float>, ]
+        'counts': <list>, # [<int>, ]
+    }
+}
+```
+
+</details>
+
+
 ### Register model
 
 Supported models: