Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Datumaro] Add dataset statistics #1668

Merged
merged 6 commits into from
Aug 7, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- Siammask tracker as DL serverless function (<https://github.com/opencv/cvat/pull/1988>)
- [Datumaro] Added model info and source info commands (<https://github.com/opencv/cvat/pull/1973>)
- [Datumaro] Dataset statistics (<https://github.com/opencv/cvat/pull/1668>)

### Changed
- Shape coordinates are rounded to 2 digits in dumped annotations (<https://github.com/opencv/cvat/pull/1970>)
Expand Down
27 changes: 11 additions & 16 deletions datumaro/datumaro/cli/contexts/project/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
from datumaro.components.dataset_filter import DatasetItemEncoder
from datumaro.components.extractor import AnnotationType
from datumaro.components.cli_plugin import CliPlugin
from datumaro.components.operations import mean_std
from datumaro.components.operations import \
compute_image_statistics, compute_ann_statistics
from .diff import DiffVisualizer
from ...util import add_subparser, CliException, MultilineFormatter, \
make_file_name
Expand Down Expand Up @@ -648,22 +649,16 @@ def build_stats_parser(parser_ctor=argparse.ArgumentParser):

def stats_command(args):
project = load_project(args.project_dir)
dataset = project.make_dataset()

def print_extractor_info(extractor, indent=''):
mean, std = mean_std(dataset)
print("%sImage mean:" % indent, ', '.join('%.3f' % n for n in mean))
print("%sImage std:" % indent, ', '.join('%.3f' % n for n in std))

print("Dataset: ")
print_extractor_info(dataset)

if 1 < len(dataset.subsets()):
print("Subsets: ")
for subset_name in dataset.subsets():
subset = dataset.get_subset(subset_name)
print(" %s:" % subset_name)
print_extractor_info(subset, " " * 4)
dataset = project.make_dataset()
stats = {}
stats.update(compute_image_statistics(dataset))
stats.update(compute_ann_statistics(dataset))

dst_file = generate_next_file_name('statistics', ext='.json')
log.info("Writing project statistics to '%s'" % dst_file)
with open(dst_file, 'w') as f:
json.dump(stats, f, indent=4, sort_keys=True)

def build_info_parser(parser_ctor=argparse.ArgumentParser):
parser = parser_ctor(help="Get project info",
Expand Down
155 changes: 155 additions & 0 deletions datumaro/datumaro/components/operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,14 @@
#
# SPDX-License-Identifier: MIT

import logging as log
from copy import deepcopy

import cv2
import numpy as np

from datumaro.components.extractor import AnnotationType


def mean_std(dataset):
"""
Expand All @@ -14,6 +19,8 @@ def mean_std(dataset):
# Use an online algorithm to:
# - handle different image sizes
# - avoid cancellation problem
if len(dataset) == 0:
return [0, 0, 0], [0, 0, 0]

stats = np.empty((len(dataset), 2, 3), dtype=np.double)
counts = np.empty(len(dataset), dtype=np.uint32)
Expand Down Expand Up @@ -80,3 +87,151 @@ def compute_stats(stats, counts, mean_accessor, variance_accessor):
*__class__.compute_stats(stats[:h], counts[:h], m, v),
*__class__.compute_stats(stats[h:], counts[h:], m, v)
)

def compute_image_statistics(dataset):
    """
    Collect image-level statistics for a dataset and for its subsets.

    Args:
        dataset: a sized, iterable collection of items that also provides
            subsets() and get_subset(name). Every item must expose 'id',
            'has_image' and, when it has an image, 'image.has_data'.

    Returns:
        dict: {
            'dataset': {'images count', 'image mean', 'image std'},
            'subsets': {subset name: same structure as 'dataset'},
        }
        'image mean' and 'image std' are lists of floats, or the string
        'n/a' when at least one item of the collection has no image data.
        'subsets' is filled only if at least one subset has a non-empty
        name.
    """

    def _extractor_stats(extractor):
        # mean_std() is only called when every item has image data, so
        # a single item without an image disables it for the collection.
        available = True
        for item in extractor:
            if not (item.has_image and item.image.has_data):
                available = False
                log.warning(
                    "Item %s has no image. Image stats won't be computed",
                    item.id)
                break

        extractor_stats = {
            'images count': len(extractor),
        }

        if available:
            mean, std = mean_std(extractor)
            # The channel order returned by mean_std() is reversed here
            # (presumably BGR -> RGB; TODO confirm against mean_std()).
            extractor_stats['image mean'] = [float(n) for n in mean[::-1]]
            extractor_stats['image std'] = [float(n) for n in std[::-1]]
        else:
            extractor_stats['image mean'] = 'n/a'
            extractor_stats['image std'] = 'n/a'
        return extractor_stats

    stats = {
        'dataset': _extractor_stats(dataset),
        'subsets': {},
    }

    # A dataset whose only subset is the unnamed (default) one gets no
    # per-subset breakdown: it would just repeat the 'dataset' entry.
    subsets = dataset.subsets() or ()
    if any(subsets):
        for subset_name in subsets:
            stats['subsets'][subset_name] = _extractor_stats(
                dataset.get_subset(subset_name))

    return stats

def compute_ann_statistics(dataset):
    """
    Collect annotation statistics for a dataset.

    Args:
        dataset: a sized, iterable collection of items providing
            categories(). Every item must expose 'id' and 'annotations';
            every annotation must expose 'type' and 'attributes', and
            masks, polygons and bboxes must provide get_area().

    Returns:
        dict with the keys:
        - 'images count': number of items
        - 'annotations count': number of annotations of all types
        - 'unannotated images count', 'unannotated images': the number
          and the ids of items without annotations
        - 'annotations by type': annotation type name -> {'count'}
        - 'annotations':
            - 'labels': statistics over annotations that have a label:
              'count', 'distribution' (label name -> [count, fraction])
              and 'attributes' (attribute name -> 'count', 'values count',
              'values present', 'distribution' (value -> [count,
              fraction])). The attributes 'occluded', 'visibility',
              'score', 'id' and 'track_id' are not reported.
            - 'segments': statistics over labelled masks, polygons and
              bboxes: 'avg. area', 'area distribution' (a histogram with
              numpy's default binning) and 'pixel distribution'
              (label name -> [area, fraction]).
        All fractions are 0 when the corresponding total is 0.
    """
    labels = dataset.categories().get(AnnotationType.label)
    # A dataset may define no label categories (e.g. captions only);
    # the per-label tables are then left empty instead of failing here.
    label_names = [cat.name for cat in labels.items] \
        if labels is not None else []

    def get_label(ann):
        return labels.items[ann.label].name if ann.label is not None else None

    def new_attr_stat():
        # A fresh record per attribute name, built only when needed
        return {
            'count': 0,
            'values count': 0,
            'values present': set(),
            'distribution': {}, # value -> [count, fraction of total]
        }

    # These attributes are excluded from the attribute statistics
    ignored_attrs = {'occluded', 'visibility', 'score', 'id', 'track_id'}
    segment_types = {AnnotationType.mask,
        AnnotationType.polygon, AnnotationType.bbox}

    stats = {
        'images count': len(dataset),
        'annotations count': 0,
        'unannotated images count': 0,
        'unannotated images': [],
        'annotations by type': { t.name: {
            'count': 0,
        } for t in AnnotationType },
        'annotations': {},
    }
    by_type = stats['annotations by type']

    label_stat = {
        'count': 0,
        'distribution': { name: [0, 0] for name in label_names
        }, # label -> [count, fraction of total]

        'attributes': {},
    }
    stats['annotations']['labels'] = label_stat
    segm_stat = {
        'avg. area': 0,
        'area distribution': [], # a histogram with 10 bins
        # (min, min+10%), ..., (min+90%, max) -> (count, fraction of total)

        'pixel distribution': { name: [0, 0] for name in label_names
        }, # label -> [area, fraction of total]
    }
    stats['annotations']['segments'] = segm_stat
    segm_areas = []
    pixel_dist = segm_stat['pixel distribution']

    for item in dataset:
        if len(item.annotations) == 0:
            stats['unannotated images'].append(item.id)
            continue

        for ann in item.annotations:
            by_type[ann.type.name]['count'] += 1

            # Annotations without a label only count towards their type
            if getattr(ann, 'label', None) is None:
                continue

            label_name = get_label(ann)

            if ann.type in segment_types:
                area = ann.get_area()
                segm_areas.append(area)
                pixel_dist[label_name][0] += int(area)

            label_stat['count'] += 1
            label_stat['distribution'][label_name][0] += 1

            for name, value in ann.attributes.items():
                if name.lower() in ignored_attrs:
                    continue
                attrs_stat = label_stat['attributes'].get(name)
                if attrs_stat is None:
                    attrs_stat = new_attr_stat()
                    label_stat['attributes'][name] = attrs_stat
                attrs_stat['count'] += 1
                attrs_stat['values present'].add(str(value))
                attrs_stat['distribution'] \
                    .setdefault(str(value), [0, 0])[0] += 1

    stats['annotations count'] = sum(t['count'] for t in by_type.values())
    stats['unannotated images count'] = len(stats['unannotated images'])

    # With no labelled annotations every count is 0 as well, so a
    # divisor of 1 gives a fraction of 0 instead of ZeroDivisionError.
    labelled_count = label_stat['count'] or 1
    for label_info in label_stat['distribution'].values():
        label_info[1] = label_info[0] / labelled_count

    for label_attr in label_stat['attributes'].values():
        label_attr['values count'] = len(label_attr['values present'])
        # Sets are replaced by sorted lists for a deterministic output
        label_attr['values present'] = sorted(label_attr['values present'])
        # An attribute record only exists after at least one occurrence,
        # so 'count' is never 0 here.
        for attr_info in label_attr['distribution'].values():
            attr_info[1] = attr_info[0] / label_attr['count']

    # numpy.sum might be faster, but could overflow with large datasets.
    # Python's int transparently switches to arbitrary precision.
    total_pixels = sum(int(a) for a in segm_areas)

    segm_stat['avg. area'] = total_pixels / (len(segm_areas) or 1.0)

    # The total is 0 when there are no segments or all of them have
    # zero area; all the per-label areas are 0 then, too.
    pixel_divisor = total_pixels or 1
    for label_info in pixel_dist.values():
        label_info[1] = label_info[0] / pixel_divisor

    if len(segm_areas) != 0:
        hist, bins = np.histogram(segm_areas)
        segm_stat['area distribution'] = [{
            'min': float(bin_min), 'max': float(bin_max),
            'count': int(c), 'percent': int(c) / len(segm_areas)
        } for c, (bin_min, bin_max) in zip(hist, zip(bins[:-1], bins[1:]))]

    return stats
109 changes: 106 additions & 3 deletions datumaro/tests/test_ops.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import numpy as np

from datumaro.components.extractor import Extractor, DatasetItem
from datumaro.components.operations import mean_std
from datumaro.components.extractor import (Extractor, DatasetItem, Label,
Mask, Bbox, Points, Caption)
from datumaro.components.project import Dataset
from datumaro.components.operations import mean_std, compute_ann_statistics

from unittest import TestCase

Expand All @@ -28,4 +30,105 @@ def __iter__(self):
for em, am in zip(expected_mean, actual_mean):
self.assertAlmostEqual(em, am, places=0)
for estd, astd in zip(expected_std, actual_std):
self.assertAlmostEqual(estd, astd, places=0)
self.assertAlmostEqual(estd, astd, places=0)

def test_stats(self):
    # Two annotated items (the first one also has a mask) and one item
    # without annotations; 4 label categories.
    mask_image = np.array([
        [0, 0, 1, 1, 1],
        [0, 0, 1, 1, 1],
        [0, 0, 1, 1, 1],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
    ])
    first_item = DatasetItem(id=1, image=np.ones((5, 5, 3)), annotations=[
        Caption('hello'),
        Caption('world'),
        Label(2, attributes={ 'x': 1, 'y': '2', }),
        Bbox(1, 2, 2, 2, label=2, attributes={ 'score': 0.5, }),
        Bbox(5, 6, 2, 2, attributes={
            'x': 1, 'y': '3', 'occluded': True,
        }),
        Points([1, 2, 2, 0, 1, 1], label=0),
        Mask(label=3, image=mask_image),
    ])
    second_item = DatasetItem(id=2, image=np.ones((2, 4, 3)), annotations=[
        Label(2, attributes={ 'x': 2, 'y': '2', }),
        Bbox(1, 2, 2, 2, label=3, attributes={ 'score': 0.5, }),
        Bbox(5, 6, 2, 2, attributes={
            'x': 2, 'y': '3', 'occluded': False,
        }),
    ])
    empty_item = DatasetItem(id=3)
    dataset = Dataset.from_iterable(
        [first_item, second_item, empty_item],
        categories=['label_0', 'label_1', 'label_2', 'label_3'])

    expected_by_type = {
        'label': { 'count': 2, },
        'polygon': { 'count': 0, },
        'polyline': { 'count': 0, },
        'bbox': { 'count': 4, },
        'mask': { 'count': 1, },
        'points': { 'count': 1, },
        'caption': { 'count': 2, },
    }

    # Must not include "special" attributes like "occluded".
    # Annotations with no label are skipped, hence the counts of 2.
    expected_attributes = {
        'x': {
            'count': 2,
            'values count': 2,
            'values present': ['1', '2'],
            'distribution': {
                '1': [1, 0.5],
                '2': [1, 0.5],
            },
        },
        'y': {
            'count': 2,
            'values count': 1,
            'values present': ['2'],
            'distribution': {
                '2': [2, 1.0],
            },
        },
    }

    expected_labels = {
        'count': 6,
        'distribution': {
            'label_0': [1, 1 / 6],
            'label_1': [0, 0.0],
            'label_2': [3, 3 / 6],
            'label_3': [2, 2 / 6],
        },
        'attributes': expected_attributes,
    }

    # Segment areas are 4, 4 and 9: ten bins of width 0.5 over [4, 9],
    # the first one holding both boxes and the last one the mask.
    bin_counts = [2, 0, 0, 0, 0, 0, 0, 0, 0, 1]
    expected_segments = {
        'avg. area': 17 / 3,
        'area distribution': [
            {
                'min': 4.0 + 0.5 * idx, 'max': 4.5 + 0.5 * idx,
                'count': count, 'percent': count / 3,
            }
            for idx, count in enumerate(bin_counts)
        ],
        'pixel distribution': {
            'label_0': [0, 0.0],
            'label_1': [0, 0.0],
            'label_2': [4, 4 / 17],
            'label_3': [13, 13 / 17],
        },
    }

    expected = {
        'images count': 3,
        'annotations count': 10,
        'unannotated images count': 1,
        'unannotated images': ['3'],
        'annotations by type': expected_by_type,
        'annotations': {
            'labels': expected_labels,
            'segments': expected_segments,
        },
    }

    actual = compute_ann_statistics(dataset)

    self.assertEqual(expected, actual)