diff --git a/datumaro/plugins/splitter.py b/datumaro/plugins/splitter.py index d5473a05c4..cbdf5b0917 100644 --- a/datumaro/plugins/splitter.py +++ b/datumaro/plugins/splitter.py @@ -10,20 +10,20 @@ DEFAULT_SUBSET_NAME) from datumaro.components.cli_plugin import CliPlugin -class TaskSpecificSplitter(Transform): +class TaskSpecificSplitter(Transform): _parts = [] def __init__(self, dataset:Dataset, seed): super().__init__(dataset) - - np.random.seed(seed) - + + np.random.seed(seed) + def _get_required(self, ratio): min_value = np.max(ratio) for i in ratio: if i < min_value and i > 1e-7: - min_value = i - required = int (np.around(1.0) / min_value) + min_value = i + required = int (np.around(1.0) / min_value) return required def _normalize_ratio(ratio): @@ -34,23 +34,23 @@ def _normalize_ratio(ratio): ratio = np.array(ratio) ratio /= np.sum(ratio) - return ratio + return ratio def _get_sections(self, dataset_size, ratio): n_splits = [int(np.around(dataset_size * r)) for r in ratio[:-1]] n_splits.append(dataset_size - np.sum(n_splits)) - # if there are splits with zero samples even if ratio is not 0, + # if there are splits with zero samples even if ratio is not 0, # borrow one from the split who has one or more. for ii in range(len(n_splits)): if n_splits[ii] == 0 and ratio[ii] > 1e-7: midx = np.argmax(n_splits) if n_splits[midx] > 0: n_splits[ii] += 1 - n_splits[midx] -= 1 - sections = np.add.accumulate(n_splits[:-1]) + n_splits[midx] -= 1 + sections = np.add.accumulate(n_splits[:-1]) return sections - + def _group_by_attributes(self, items): ''' Args: @@ -68,16 +68,16 @@ def _group_by_attributes(self, items): return by_attributes def _split_indice(self, indice, group_name, ratio, required): - filtered_size = len(indice) + filtered_size = len(indice) if required > filtered_size: log.warning("There's not enough samples for filtered group, \ '{}'}'".format(group_name)) - sections = self._get_sections(filtered_size, ratio) - splits = np.array_split(indice, sections) - assert len(ratio)==len(splits) - return splits + sections = self._get_sections(filtered_size, ratio) + splits = np.array_split(indice, sections) + assert len(ratio)==len(splits) + return splits - def _find_split(self, index): + def _find_split(self, index): for subset_indices, subset in self._parts: if index in subset_indices: return subset @@ -90,29 +90,29 @@ def __iter__(self): class SplitforClassification(TaskSpecificSplitter, CliPlugin): """ Splits dataset into train/val/test set in class-wise manner. 
|n - |n + |n Notes:|n - - Single label is expected for each DatasetItem.|n - - If there are not enough images in some class or attributes group, - the split ratio can't be guaranteed.|n - - The split ratio will be normalized so that the sum is 1.|n + - Single label is expected for each DatasetItem.|n + - If there are not enough images in some class or attributes group, + the split ratio can't be guaranteed.|n |n Example:|n |s|s%(prog)s --train .5 --val .2 --test .3 - """ + """ @classmethod def build_cmdline_parser(cls, **kwargs): - parser = super().build_cmdline_parser(**kwargs) - parser.add_argument('-t', '--train', type=float, - help="Ratio for train set") + parser = super().build_cmdline_parser(**kwargs) + parser.add_argument('-t', '--train', type=float, + help="Ratio for train set") parser.add_argument('-v', '--val', type=float, - help="Ratio for validation set") + help="Ratio for validation set") parser.add_argument('-e', '--test',type=float, help="Ratio for test set") parser.add_argument('--seed', type=int, help="Random seed") return parser - def __init__(self, dataset:Dataset, train=0.0, val=0.0, test=0.0, seed=None): + def __init__(self, dataset:Dataset, + train=0.0, val=0.0, test=0.0, seed=None): super().__init__(dataset, seed) subsets = ['train', 'val', 'test'] @@ -123,19 +123,17 @@ def __init__(self, dataset:Dataset, train=0.0, val=0.0, test=0.0, seed=None): "Ratios are expected to be in the range [0, 1], but got %s" % splits total_ratio = np.sum(sratio) - assert total_ratio > 1e-7, \ - "Sum of ratios is expected to be positive, got %s, which is %s" % \ - (splits, total_ratio) - - sratio /= total_ratio # normalize - required = self._get_required(sratio) + if not abs(total_ratio - 1.0) <= 1e-7: + raise Exception( + "Sum of ratios is expected to be 1, got %s, which is %s" % + (splits, total_ratio)) - ## support only single label for a DatasetItem + ## support only single label for a DatasetItem ## 1. group by label by_labels = dict() for idx, item in enumerate(self._extractor): labels = [] - for ann in item.annotations: + for ann in item.annotations: if ann.type == AnnotationType.label: labels.append(ann) assert len(labels) == 1, \ @@ -148,13 +146,14 @@ def __init__(self, dataset:Dataset, train=0.0, val=0.0, test=0.0, seed=None): if label not in by_labels: by_labels[label] = [] by_labels[label].append((idx, ann)) - + self._subsets = set(subsets) # output subset names by_splits = dict() for subset in subsets: by_splits[subset] = [] - - ## 2. group by attributes + + ## 2. group by attributes + required = self._get_required(sratio) for label, items in by_labels.items(): np.random.shuffle(items) by_attributes = self._group_by_attributes(items) @@ -164,7 +163,7 @@ def __init__(self, dataset:Dataset, train=0.0, val=0.0, test=0.0, seed=None): for subset, split in zip(subsets, splits): if len(split) > 0: by_splits[subset].extend(split) - + parts = [] for subset in self._subsets: parts.append((set(by_splits[subset]), subset)) @@ -181,39 +180,35 @@ class SplitforMatchingReID(TaskSpecificSplitter, CliPlugin): Then, tags 'test' into 'gallery'/'query' in class-wise random manner.|n Then, splits 'train+val' into 'train'/'val' sets in the same way.|n Therefore, the final subsets would be 'train', 'val', 'test'. 
|n - And 'gallery', 'query' are tagged using anntoation group.|n - |n + And 'gallery', 'query' are tagged using anntoation group.|n + You can get the 'gallery' and 'query' subsets using 'get_subset_by_group'.|n Notes:|n - - Single label is expected for each DatasetItem.|n - - Each label is expected to have "PID" attribute. |n - - If there is "Auxiliray" attribute in labels of test set, - it would be overwritten by "gallery" or "query". - - The train/val/test ratio and gallery/query ratio are normalized - so that the sum is 1, respectively.|n + - Single label is expected for each DatasetItem.|n + - Each label is expected to have "PID" attribute. |n |n Example:|n |s|s%(prog)s --train .5 --val .2 --test .3 --gallery .5 --query .5 - """ + """ _group_map = dict() @classmethod def build_cmdline_parser(cls, **kwargs): - parser = super().build_cmdline_parser(**kwargs) + parser = super().build_cmdline_parser(**kwargs) parser.add_argument('-t', '--train', type=float, - help="Ratio for train set") - parser.add_argument('-v', '--val', type=float, - help="Ratio for validation set") - parser.add_argument('-e', '--test',type=float, + help="Ratio for train set") + parser.add_argument('-v', '--val', type=float, + help="Ratio for validation set") + parser.add_argument('-e', '--test',type=float, help="Ratio for test set") - parser.add_argument('-g', '--gallery', type=float, - help="Ratio for gallery in test set") - parser.add_argument('-q', '--query',type=float, + parser.add_argument('-g', '--gallery', type=float, + help="Ratio for gallery in test set") + parser.add_argument('-q', '--query',type=float, help="Ratio for query in test set") parser.add_argument('--seed', type=int, help="Random seed") return parser - def __init__(self, dataset:Dataset, train=0.0, val=0.0, test=0.0, - gallery=0.0, query=0.0, seed=None): + def __init__(self, dataset:Dataset, train=0.0, val=0.0, test=0.0,\ + gallery=0.0, query=0.0, attr_pid="PID", seed=None): super().__init__(dataset, seed) id_subsets = ['train', 'val', 'test'] @@ -222,10 +217,10 @@ def __init__(self, dataset:Dataset, train=0.0, val=0.0, test=0.0, assert all(0.0 <= r and r <= 1.0 for _, r in id_splits), "Ratios \ are expected to be in the range [0, 1], but got %s" % id_splits total_ratio = np.sum(id_ratio) - assert total_ratio > 1e-7, \ - "Sum of ratios is expected to be positive, got %s, which is %s" % \ - (id_splits, total_ratio) - id_ratio /= total_ratio # normalize + if not abs(total_ratio - 1.0) <= 1e-7: + raise Exception( + "Sum of ratios is expected to be 1, got %s, which is %s" % + (id_splits, total_ratio)) test_subsets = ['gallery', 'query'] test_ratio = np.array([gallery, query]) @@ -233,28 +228,28 @@ def __init__(self, dataset:Dataset, train=0.0, val=0.0, test=0.0, assert all(0.0 <= r and r <= 1.0 for _, r in test_splits), \ "Ratios are expected to be in the range [0, 1], but got %s"\ % test_splits - - groups = set() - ## group by PID + + groups = set() + ## group by PID(attr_pid) by_pid = dict() for idx, item in enumerate(self._extractor): labels = [] - for ann in item.annotations: + for ann in item.annotations: if ann.type == AnnotationType.label: labels.append(ann) assert len(labels) == 1, \ "Expected exact one label for a DatasetItem" ann = labels[0] - attributes = dict(ann.attributes.items()) - assert "PID" in attributes, \ - "'PID' is expected as attribute" - person_id = attributes['PID'] + attributes = dict(ann.attributes.items()) + assert attr_pid in attributes, \ + "'{}' is expected as attribute name".format(attr_pid) + person_id = 
attributes[attr_pid] if person_id not in by_pid: by_pid[person_id] = [] by_pid[person_id].append((idx, ann)) groups.add(ann.group) - max_group_id = max(groups) + max_group_id = max(groups) self._group_map["gallery"] = max_group_id + 1 self._group_map["query"] = max_group_id + 2 @@ -262,7 +257,7 @@ def __init__(self, dataset:Dataset, train=0.0, val=0.0, test=0.0, if len(by_pid) < required: log.warning("There's not enough IDs, which is {}, \ so train/val/test ratio can't be guaranteed." % len(by_pid)) - + self._subsets = set(id_subsets) # output subset names by_splits = dict() for subset in self._subsets: @@ -281,22 +276,24 @@ def __init__(self, dataset:Dataset, train=0.0, val=0.0, test=0.0, trainval = { pid: by_pid[pid] for pid in splits[1] } ## follow the ratio of datasetitems as possible. - ## naive heuristic: exchange the best item one by one. - expected_count = int(len(self._extractor) * split_ratio[0]) + ## naive heuristic: exchange the best item one by one. + expected_count = int(len(self._extractor) * split_ratio[0]) testset_total = int(np.sum([len(v) for v in testset.values()])) - if testset_total != expected_count: + if testset_total != expected_count: diffs = dict() for id_test, items_test in testset.items(): count_test = len(items_test) for id_trval, items_trval in trainval.items(): count_trval = len(items_trval) diff = count_trval - count_test + if diff==0: + continue # exchange has no effect if diff not in diffs: diffs[diff] = [(id_test, id_trval)] else: diffs[diff].append((id_test, id_trval)) - exchanges = [] + exchanges = [] while True: target_diff = expected_count - testset_total # find nearest diff. @@ -308,7 +305,7 @@ def __init__(self, dataset:Dataset, train=0.0, val=0.0, test=0.0, choice = np.random.choice(range(len(diffs[nearest]))) pid_test, pid_trval = diffs[nearest][choice] testset_total += nearest - new_diffs = dict() + new_diffs = dict() for diff, person_ids in diffs.items(): new_list = [] for id1, id2 in person_ids: @@ -318,7 +315,7 @@ def __init__(self, dataset:Dataset, train=0.0, val=0.0, test=0.0, if len(new_list)>0: new_diffs[diff] = new_list diffs = new_diffs - exchanges.append((pid_test, pid_trval)) + exchanges.append((pid_test, pid_trval)) # exchange for pid_test, pid_trval in exchanges: testset[pid_trval] = trainval.pop(pid_trval) @@ -326,39 +323,38 @@ def __init__(self, dataset:Dataset, train=0.0, val=0.0, test=0.0, else: testset = dict() trainval = by_pid - + ## 2. split 'test' into 'gallery' and 'query' if len(testset)>0: - for person_id, items in testset.items(): + for person_id, items in testset.items(): indice = [idx for idx, _ in items] - by_splits['test'].extend(indice) + by_splits['test'].extend(indice) total_ratio = np.sum(test_ratio) - if total_ratio < 1e-7: - log.warning( - "Sum of ratios is expected to be positive,\ - got %s, which is %s" % (test_splits, total_ratio)) - else: - test_ratio /= total_ratio # normalize - required = self._get_required(test_ratio) - for person_id, items in testset.items(): - np.random.shuffle(items) - by_attributes = self._group_by_attributes(items) - for attributes, indice in by_attributes.items(): - gname = 'person_id: {}, attributes: {}'.format( - person_id, attributes) - splits = self._split_indice(indice, gname, - test_ratio, required) - - # tag using group - for idx, item in enumerate(self._extractor): - for subset, split in zip(test_subsets, splits): - if idx in split: - group_id = self._group_map[subset] - item.annotations[0].group = group_id - break - - ## 3. 
split 'trainval' into 'train' and 'val' + if not abs(total_ratio - 1.0) <= 1e-7: + raise Exception( + "Sum of ratios is expected to be 1, got %s, which is %s" % + (test_splits, total_ratio)) + + required = self._get_required(test_ratio) + for person_id, items in testset.items(): + np.random.shuffle(items) + by_attributes = self._group_by_attributes(items) + for attributes, indice in by_attributes.items(): + gname = 'person_id: {}, attributes: {}'.format( + person_id, attributes) + splits = self._split_indice(indice, gname, + test_ratio, required) + + # tag using group + for idx, item in enumerate(self._extractor): + for subset, split in zip(test_subsets, splits): + if idx in split: + group_id = self._group_map[subset] + item.annotations[0].group = group_id + break + + ## 3. split 'trainval' into 'train' and 'val' trainval_subsets = ["train", "val"] trainval_ratio = np.array([train, val]) total_ratio = np.sum(trainval_ratio) @@ -376,11 +372,11 @@ def __init__(self, dataset:Dataset, train=0.0, val=0.0, test=0.0, for attributes, indice in by_attributes.items(): gname = 'person_id: {}, attributes: {}'.format( person_id, attributes) - splits = self._split_indice(indice, gname, + splits = self._split_indice(indice, gname, trainval_ratio, required) for subset, split in zip(trainval_subsets, splits): if len(split) > 0: - by_splits[subset].extend(split) + by_splits[subset].extend(split) parts = [] for subset in self._subsets: @@ -398,4 +394,139 @@ def get_subset_by_group(self, group:str): subset = self.select(lambda item: item.annotations[0].group == group_id) return subset - \ No newline at end of file +class SplitforDetection(TaskSpecificSplitter, CliPlugin): + """ + Splits dataset into train/val/test set for detection task.|n + For detection dataset, each image can have multiple bbox annotations.|n + Since one DatasetItem can't be included in multiple subsets at the same time, + the dataset can't be divided according to the bbox annotations.|n + Thus, we split dataset based on DatasetItem + while preserving label distribution as much as possible.|n + |n + Notes:|n + - Each DatasetItem is expected to have one or more Bbox annotations.|n + - Label annotations are ignored. We only focus on the Bbox annotations.|n + |n + Example:|n + |s|s%(prog)s --train .5 --val .2 --test .3 + """ + @classmethod + def build_cmdline_parser(cls, **kwargs): + parser = super().build_cmdline_parser(**kwargs) + parser.add_argument('-t', '--train', type=float, + help="Ratio for train set") + parser.add_argument('-v', '--val', type=float, + help="Ratio for validation set") + parser.add_argument('-e', '--test',type=float, + help="Ratio for test set") + parser.add_argument('--seed', type=int, help="Random seed") + return parser + + def __init__(self, dataset:Dataset, + train=0.0, val=0.0, test=0.0, seed=None): + super().__init__(dataset, seed) + + subsets = ['train', 'val', 'test'] + sratio = np.array([train, val, test]) + splits = list(zip(subsets, sratio)) + + assert all(0.0 <= r and r <= 1.0 for _, r in splits), \ + "Ratios are expected to be in the range [0, 1], but got %s" % splits + + total_ratio = np.sum(sratio) + if not abs(total_ratio - 1.0) <= 1e-7: + raise Exception( + "Sum of ratios is expected to be 1, got %s, which is %s" % + (splits, total_ratio)) + + ## 1.
group by bbox label + by_labels = dict() + for idx, item in enumerate(self._extractor): + for ann in item.annotations: + if ann.type == AnnotationType.bbox: + if not hasattr(ann, 'label') or ann.label is None: + label = None + else: + label = str(ann.label) + if label not in by_labels: + by_labels[label] = [(idx, ann)] + else: + by_labels[label].append((idx, ann)) + + ## 2. group by attributes + by_combinations = dict() + for label, items in by_labels.items(): + by_attributes = self._group_by_attributes(items) + for attributes, indice in by_attributes.items(): + gname = 'label: {}, attributes: {}'.format(label, attributes) + by_combinations[gname] = indice + + ## total number of GT samples per label-attr combination + NC = {k: len(v) for k, v in by_combinations.items()} + + ## 3-1. initially count per-image GT samples + scores_all = {} + init_scores = {} + for idx, item in enumerate(self._extractor): + counts = { k: v.count(idx) for k, v in by_combinations.items() } + scores_all[idx] = counts + init_scores[idx] = np.sum( [v / NC[k] for k, v in counts.items()] ) + + self._subsets = set(subsets) # output subset names + by_splits = dict() + for sname in subsets: + by_splits[sname] = [] + + total = len(self._extractor) + target_size = dict() + NC_all = [] # expected number of GT samples per split + for sname, ratio in zip(subsets, sratio): + target_size[sname] = total * ratio + NC_all.append((sname, {k: v * ratio for k, v in NC.items()})) + + ### + # helper functions to keep the number of GT samples in each split from exceeding the expected number + def compute_penalty(counts, NC): + p = 0 + for k, v in counts.items(): + p += max(0, (v / NC[k]) - 1.0) + return p + def update_nc(counts, NC): + for k, v in counts.items(): + NC[k] = max(0, NC[k] - v) + if NC[k] == 0: + NC[k] = -1 + return NC + ### + + # 3-2.
assign each DatasetItem to a split, one by one + for idx, _ in sorted(init_scores.items(), \ + key=lambda item: item[1], reverse=True): + counts = scores_all[idx] + + # shuffling split order to add randomness + # when two or more splits have the same penalty value + np.random.shuffle(NC_all) + + pp = [] + for sname, nc in NC_all: + if len(by_splits[sname]) >= target_size[sname]: + # the split has enough images, stop adding more images to this split + pp.append(1e+08) + else: + # compute penalty based on number of GT samples added in the split + pp.append(compute_penalty(counts, nc)) + + # we push an image to a split with the minimum penalty + midx = np.argmin(pp) + + sname, nc = NC_all[midx] + by_splits[sname].append(idx) + update_nc(counts, nc) + + parts = [] + for subset in self._subsets: + parts.append((set(by_splits[subset]), subset)) + self._parts = parts + + self._length = 'parent' \ No newline at end of file diff --git a/tests/test_splitter.py b/tests/test_splitter.py index f4312ed732..f0fe695a49 100644 --- a/tests/test_splitter.py +++ b/tests/test_splitter.py @@ -3,9 +3,10 @@ import random from unittest import TestCase + from datumaro.components.project import Dataset -from datumaro.components.extractor import (DatasetItem, - Label, LabelCategories, AnnotationType) +from datumaro.components.extractor import (DatasetItem, + Label, LabelCategories, AnnotationType, Bbox) import datumaro.plugins.splitter as splitter from datumaro.components.operations import compute_ann_statistics @@ -15,17 +16,17 @@ def _generate_dataset(self, config): # counts = {(0,0):20, (0,1):20, (0,2):30, (1,0):20, (1,1):10, (1,2):20} # attr1 = ['attr1', 'attr2'] # attr2 = ['attr1', 'attr3'] - # config = { "label1": { "attrs": attr1, "counts": counts }, - # "label2": { "attrs": attr2, "counts": counts }} - def _get_subset(): + # config = { "label1": { "attrs": attr1, "counts": counts }, + # "label2": { "attrs": attr2, "counts": counts }} + def _get_subset(): return np.random.choice(['', 'a', 'b'], p = [0.5, 0.3, 0.2]) - iterable = [] + iterable = [] label_cat = LabelCategories() - idx = 0 - for label_id, label in enumerate(config.keys()): + idx = 0 + for label_id, label in enumerate(config.keys()): anames = config[label]['attrs'] - counts = config[label]['counts'] + counts = config[label]['counts'] label_cat.add(label, attributes=anames) if isinstance(counts, dict): for attrs, count in counts.items(): @@ -55,35 +56,35 @@ def _get_subset(): return dataset def test_split_for_classification_multi_class_no_attr(self): - config = { "label1": { "attrs": None, "counts": 10 }, + config = { "label1": { "attrs": None, "counts": 10 }, "label2": { "attrs": None, "counts": 20 }, "label3": { "attrs": None, "counts": 30 }} source_dataset = self._generate_dataset(config) - - actual = splitter.SplitforClassification(source_dataset, - train=0.7, test=0.3) + + actual = splitter.SplitforClassification(source_dataset, + train=0.7, test=0.3) self.assertEqual(42, len(actual.get_subset('train'))) self.assertEqual(18, len(actual.get_subset('test'))) # check stats for train - stat_train = compute_ann_statistics(actual.get_subset('train')) + stat_train = compute_ann_statistics(actual.get_subset('train')) dist_train = stat_train["annotations"]["labels"]["distribution"] self.assertEqual(7, dist_train["label1"][0]) self.assertEqual(14, dist_train["label2"][0]) self.assertEqual(21, dist_train["label3"][0]) - + # check stats for test - stat_test = compute_ann_statistics(actual.get_subset('test')) + stat_test = 
compute_ann_statistics(actual.get_subset('test')) dist_test = stat_test["annotations"]["labels"]["distribution"] self.assertEqual(3, dist_test["label1"][0]) self.assertEqual(6, dist_test["label2"][0]) self.assertEqual(9, dist_test["label3"][0]) - def test_split_for_classification_single_class_single_attr(self): - counts = {0:10, 1:20, 2:30} + def test_split_for_classification_single_class_single_attr(self): + counts = {0:10, 1:20, 2:30} config = { "label": { "attrs": ['attr'], "counts": counts }} - source_dataset = self._generate_dataset(config) - + source_dataset = self._generate_dataset(config) + actual = splitter.SplitforClassification(source_dataset, \ train=0.7, test=0.3) @@ -91,56 +92,56 @@ def test_split_for_classification_single_class_single_attr(self): self.assertEqual(18, len(actual.get_subset('test'))) # check stats for train - stat_train = compute_ann_statistics(actual.get_subset('train')) + stat_train = compute_ann_statistics(actual.get_subset('train')) attr_train = stat_train["annotations"]["labels"]["attributes"] self.assertEqual(7, attr_train['attr']["distribution"]["0"][0]) self.assertEqual(14, attr_train['attr']["distribution"]["1"][0]) self.assertEqual(21, attr_train['attr']["distribution"]["2"][0]) - + # check stats for test - stat_test = compute_ann_statistics(actual.get_subset('test')) + stat_test = compute_ann_statistics(actual.get_subset('test')) attr_test = stat_test["annotations"]["labels"]["attributes"] self.assertEqual(3, attr_test['attr']["distribution"]["0"][0]) self.assertEqual(6, attr_test['attr']["distribution"]["1"][0]) - self.assertEqual(9, attr_test['attr']["distribution"]["2"][0]) - + self.assertEqual(9, attr_test['attr']["distribution"]["2"][0]) + def test_split_for_classification_single_class_multi_attr(self): counts = {(0,0):20, (0,1):20, (0,2):30, (1,0):20, (1,1):10, (1,2):20} attrs = ['attr1', 'attr2'] config = { "label": { "attrs": attrs, "counts": counts }} - source_dataset = self._generate_dataset(config) - + source_dataset = self._generate_dataset(config) + actual = splitter.SplitforClassification(source_dataset, train=0.7, test=0.3) self.assertEqual(84, len(actual.get_subset('train'))) self.assertEqual(36, len(actual.get_subset('test'))) - + # check stats for train - stat_train = compute_ann_statistics(actual.get_subset('train')) - attr_train = stat_train["annotations"]["labels"]["attributes"] + stat_train = compute_ann_statistics(actual.get_subset('train')) + attr_train = stat_train["annotations"]["labels"]["attributes"] self.assertEqual(49, attr_train["attr1"]["distribution"]["0"][0]) self.assertEqual(35, attr_train["attr1"]["distribution"]["1"][0]) - self.assertEqual(28, attr_train["attr2"]["distribution"]["0"][0]) - self.assertEqual(21, attr_train["attr2"]["distribution"]["1"][0]) + self.assertEqual(28, attr_train["attr2"]["distribution"]["0"][0]) + self.assertEqual(21, attr_train["attr2"]["distribution"]["1"][0]) self.assertEqual(35, attr_train["attr2"]["distribution"]["2"][0]) - + # check stats for test - stat_test = compute_ann_statistics(actual.get_subset('test')) + stat_test = compute_ann_statistics(actual.get_subset('test')) attr_test = stat_test["annotations"]["labels"]["attributes"] self.assertEqual(21, attr_test["attr1"]["distribution"]["0"][0]) self.assertEqual(15, attr_test["attr1"]["distribution"]["1"][0]) - self.assertEqual(12, attr_test["attr2"]["distribution"]["0"][0]) - self.assertEqual( 9, attr_test["attr2"]["distribution"]["1"][0]) + self.assertEqual(12, attr_test["attr2"]["distribution"]["0"][0]) + self.assertEqual( 9, 
attr_test["attr2"]["distribution"]["1"][0]) self.assertEqual(15, attr_test["attr2"]["distribution"]["2"][0]) - - def test_split_for_classification_multi_label_with_attr(self): + + def test_split_for_classification_multi_label_with_attr(self): counts = {(0,0):20, (0,1):20, (0,2):30, (1,0):20, (1,1):10, (1,2):20} attr1 = ['attr1', 'attr2'] attr2 = ['attr1', 'attr3'] - config = { "label1": { "attrs": attr1, "counts": counts }, + config = { "label1": { "attrs": attr1, "counts": counts }, "label2": { "attrs": attr2, "counts": counts }} - source_dataset = self._generate_dataset(config) + source_dataset = self._generate_dataset(config) actual = splitter.SplitforClassification(source_dataset, train=0.7, test=0.3) @@ -149,35 +150,35 @@ def test_split_for_classification_multi_label_with_attr(self): self.assertEqual(72, len(actual.get_subset('test'))) # check stats for train - stat_train = compute_ann_statistics(actual.get_subset('train')) + stat_train = compute_ann_statistics(actual.get_subset('train')) dist_train = stat_train["annotations"]["labels"]["distribution"] self.assertEqual(84, dist_train["label1"][0]) - self.assertEqual(84, dist_train["label2"][0]) - attr_train = stat_train["annotations"]["labels"]["attributes"] + self.assertEqual(84, dist_train["label2"][0]) + attr_train = stat_train["annotations"]["labels"]["attributes"] self.assertEqual(49*2, attr_train["attr1"]["distribution"]["0"][0]) self.assertEqual(35*2, attr_train["attr1"]["distribution"]["1"][0]) - self.assertEqual(28, attr_train["attr2"]["distribution"]["0"][0]) - self.assertEqual(21, attr_train["attr2"]["distribution"]["1"][0]) + self.assertEqual(28, attr_train["attr2"]["distribution"]["0"][0]) + self.assertEqual(21, attr_train["attr2"]["distribution"]["1"][0]) self.assertEqual(35, attr_train["attr2"]["distribution"]["2"][0]) - self.assertEqual(28, attr_train["attr3"]["distribution"]["0"][0]) - self.assertEqual(21, attr_train["attr3"]["distribution"]["1"][0]) + self.assertEqual(28, attr_train["attr3"]["distribution"]["0"][0]) + self.assertEqual(21, attr_train["attr3"]["distribution"]["1"][0]) self.assertEqual(35, attr_train["attr3"]["distribution"]["2"][0]) - + # check stats for test - stat_test = compute_ann_statistics(actual.get_subset('test')) + stat_test = compute_ann_statistics(actual.get_subset('test')) dist_test = stat_test["annotations"]["labels"]["distribution"] self.assertEqual(36, dist_test["label1"][0]) - self.assertEqual(36, dist_test["label2"][0]) - attr_test = stat_test["annotations"]["labels"]["attributes"] + self.assertEqual(36, dist_test["label2"][0]) + attr_test = stat_test["annotations"]["labels"]["attributes"] self.assertEqual(21*2, attr_test["attr1"]["distribution"]["0"][0]) self.assertEqual(15*2, attr_test["attr1"]["distribution"]["1"][0]) - self.assertEqual(12, attr_test["attr2"]["distribution"]["0"][0]) - self.assertEqual( 9, attr_test["attr2"]["distribution"]["1"][0]) + self.assertEqual(12, attr_test["attr2"]["distribution"]["0"][0]) + self.assertEqual( 9, attr_test["attr2"]["distribution"]["1"][0]) self.assertEqual(15, attr_test["attr2"]["distribution"]["2"][0]) - self.assertEqual(12, attr_test["attr3"]["distribution"]["0"][0]) - self.assertEqual( 9, attr_test["attr3"]["distribution"]["1"][0]) + self.assertEqual(12, attr_test["attr3"]["distribution"]["0"][0]) + self.assertEqual( 9, attr_test["attr3"]["distribution"]["1"][0]) self.assertEqual(15, attr_test["attr3"]["distribution"]["2"][0]) - + def test_split_for_classification_gives_error_on_multi_label(self): iterable = [ DatasetItem( @@ -190,51 
+191,51 @@ def test_split_for_classification_gives_error_on_multi_label(self): annotations=[Label(0), Label(2)], subset="" ) - ] + ] categories = { AnnotationType.label: LabelCategories.from_iterable( 'label_' + str(label) for label in range(3) - )} + )} source_dataset = Dataset.from_iterable(iterable, categories) with self.assertRaises(Exception): - actual = splitter.SplitforClassification(source_dataset, + actual = splitter.SplitforClassification(source_dataset, train=0.7, test=0.3) def test_split_for_classification_gives_error_on_wrong_ratios(self): source_dataset = Dataset.from_iterable([DatasetItem(id=1)]) with self.assertRaises(Exception): - splitter.SplitforClassification(source_dataset, + splitter.SplitforClassification(source_dataset, train=-0.5, test=1.5) - + def test_split_for_matching_reid(self): counts = { i:(i%3+1)*7 for i in range(10) } config = { "person": { "attrs": ['PID'], "counts": counts }} source_dataset = self._generate_dataset(config) - + actual = splitter.SplitforMatchingReID(source_dataset, \ - train=0.5, val=0.2, test=0.3, query=0.4, gallery=0.3) - + train=0.5, val=0.2, test=0.3, query=0.4/0.7, gallery=0.3/0.7) + stats = dict() for sname in ['train', 'val', 'test']: subset = actual.get_subset(sname) stat_subset = compute_ann_statistics(subset)["annotations"] stat_attr = stat_subset["labels"]["attributes"]["PID"] stats[sname] = stat_attr - + for sname in ['gallery', 'query']: subset = actual.get_subset_by_group(sname) stat_subset = compute_ann_statistics(subset)["annotations"] stat_attr = stat_subset["labels"]["attributes"]["PID"] stats[sname] = stat_attr - + self.assertEqual(65, stats['train']['count']) # depends on heuristic self.assertEqual(26, stats['val']['count']) # depends on heuristic self.assertEqual(42, stats['test']['count']) # depends on heuristic - + train_ids = stats['train']['values present'] self.assertEqual(7, len(train_ids)) self.assertEqual(train_ids, stats['val']['values present']) - + trainval = stats['train']['count'] + stats['val']['count'] self.assertEqual(int(trainval * 0.5 / 0.7), stats['train']['count']) self.assertEqual(int(trainval * 0.2 / 0.7), stats['val']['count']) @@ -245,12 +246,12 @@ def test_split_for_matching_reid(self): total = counts[int(pid)] self.assertEqual(int(total * 0.5 / 0.7), dist_train[pid][0]) self.assertEqual(int(total * 0.2 / 0.7), dist_val[pid][0]) - + test_ids = stats['test']['values present'] self.assertEqual(3, len(test_ids)) self.assertEqual(test_ids, stats['gallery']['values present']) self.assertEqual(test_ids, stats['query']['values present']) - + dist_test = stats['test']['distribution'] dist_gallery = stats['gallery']['distribution'] dist_query = stats['query']['distribution'] @@ -259,4 +260,130 @@ def test_split_for_matching_reid(self): self.assertEqual(total, dist_test[pid][0]) self.assertEqual(int(total * 0.3 / 0.7), dist_gallery[pid][0]) self.assertEqual(int(total * 0.4 / 0.7), dist_query[pid][0]) - \ No newline at end of file + + def _generate_detection_dataset(self, style, with_attr=False, nimages=10): + def _get_subset(): + return np.random.choice(['', 'a', 'b'], p = [0.5, 0.3, 0.2]) + + label_cat = LabelCategories() + for i in range(6): + label = "label{}".format(i+1) + if with_attr is True: + attributes = {"attr0", "attr{}".format(i+1)} + else: + attributes = {} + label_cat.add(label, attributes = attributes) + categories = { AnnotationType.label: label_cat } + + iterable = [] + def append_bbox_coco(annotations, **kwargs): + annotations.append(Bbox(1,1,2,2, label=kwargs['label_id'], + 
id=kwargs['ann_id'], + attributes=kwargs['attributes'], + group=kwargs['label_id'])) + annotations.append(Label(kwargs['label_id'], + attributes=kwargs['attributes'])) + def append_bbox_voc(annotations, **kwargs): + annotations.append(Bbox(1,1,2,2, label=kwargs['label_id'], + id=kwargs['ann_id']+1, + attributes=kwargs['attributes'], + group=kwargs['label_id'])) # obj + annotations.append(Label(kwargs['label_id'], + attributes=kwargs['attributes'])) + annotations.append(Bbox(1,1,2,2, label=kwargs['label_id']+3, + group=kwargs['label_id'])) # part + annotations.append(Label(kwargs['label_id']+3, + attributes=kwargs['attributes'])) + def append_bbox_yolo(annotations, **kwargs): + annotations.append(Bbox(1,1,2,2, label=kwargs['label_id'])) + annotations.append(Label(kwargs['label_id'], + attributes=kwargs['attributes'])) + def append_bbox_cvat(annotations, **kwargs): + annotations.append(Bbox(1,1,2,2, label=kwargs['label_id'], + id=kwargs['ann_id'], + attributes=kwargs['attributes'], + group=kwargs['label_id'], + z_order=kwargs['ann_id'])) + annotations.append(Label(kwargs['label_id'], + attributes=kwargs['attributes'])) + def append_bbox_labelme(annotations, **kwargs): + annotations.append(Bbox(1,1,2,2, label=kwargs['label_id'], + attributes=kwargs['attributes'], + id=kwargs['ann_id'])) + annotations.append(Label(kwargs['label_id'], + attributes=kwargs['attributes'])) + def append_bbox_mot(annotations, **kwargs): + annotations.append(Bbox(1,1,2,2, label=kwargs['label_id'], + attributes=kwargs['attributes'])) + annotations.append(Label(kwargs['label_id'], + attributes=kwargs['attributes'])) + def append_bbox_widerface(annotations, **kwargs): + annotations.append(Bbox(1,1,2,2, attributes=kwargs['attributes'])) + annotations.append(Label(0, attributes=kwargs['attributes'])) + + if style == "coco": + append_bbox = append_bbox_coco + elif style == "voc": + append_bbox = append_bbox_voc + elif style in ["yolo", "tf_detection"]: + append_bbox = append_bbox_yolo + elif style in ["datumaro", "cvat"]: + append_bbox = append_bbox_cvat + elif style == "labelme": + append_bbox = append_bbox_labelme + elif style == "mot": + append_bbox = append_bbox_mot + elif style == "widerface": + append_bbox = append_bbox_widerface + + attr_val = 0 + totals = np.zeros(3) + for img_id in range(nimages): + cnts = np.random.randint(0, 5, 3) + totals += cnts + annotations = [] + for label_id, count in enumerate(cnts): + attributes = {} + if with_attr: + attr_val += 1 + attributes["attr0"] = attr_val % 3 + attributes["attr{}".format(label_id+1)] = attr_val % 2 + for ann_id in range(count): + append_bbox(annotations, label_id=label_id, \ + ann_id=ann_id, attributes=attributes) + item = DatasetItem( + id=str(img_id), + annotations=annotations, + subset=_get_subset(), + attributes={"id":img_id} + ) + iterable.append(item) + + dataset = Dataset.from_iterable(iterable, categories) + return dataset, totals + + def test_split_for_detection(self): + styles = [ + "coco", "voc", "yolo", "cvat", "labelme", "mot", "widerface" + ] + params = [] + for style in styles: + for with_attr in [False, True]: + params.append((style, with_attr, 10, 5, 3, 2)) + params.append((style, with_attr, 10, 7, 0, 3)) + + for style, with_attr, nimages, train, val, test in params: + with self.subTest(style=style, with_attr=with_attr, nimage=nimages, + train=train, val=val, test=test): + source, _ = self._generate_detection_dataset( + style, with_attr, nimages) + total = np.sum([train, val, test]) + actual = splitter.SplitforDetection(source, \ + 
train=train/total, val=val/total, test=test/total) + subsets = dict() + for sname in ['train', 'val', 'test']: + subset = actual.get_subset(sname) + subsets[sname] = subset + self.assertEqual(train, len(subsets['train'])) + self.assertEqual(val, len(subsets['val'])) + self.assertEqual(test, len(subsets['test'])) \ No newline at end of file
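For reference, a minimal usage sketch of the new splitters, mirroring the calls exercised by the tests above. Here 'source_dataset' stands for a datumaro Dataset built elsewhere (e.g. with Dataset.from_iterable), the ratio values are the ones used in the tests, and the variable names are illustrative rather than part of the patch.

import datumaro.plugins.splitter as splitter

# Classification: class-wise split; train/val/test ratios must sum to 1.
clf = splitter.SplitforClassification(source_dataset, train=0.7, test=0.3)
train_set = clf.get_subset('train')
test_set = clf.get_subset('test')

# Re-identification: train/val/test sum to 1, and gallery/query sum to 1;
# gallery/query are tagged via annotation groups and fetched separately.
reid = splitter.SplitforMatchingReID(source_dataset,
    train=0.5, val=0.2, test=0.3, gallery=0.3 / 0.7, query=0.4 / 0.7)
gallery_set = reid.get_subset_by_group('gallery')
query_set = reid.get_subset_by_group('query')

# Detection: DatasetItems are assigned greedily so that each subset's
# per-label bbox distribution follows the requested ratios.
det = splitter.SplitforDetection(source_dataset, train=0.5, val=0.2, test=0.3)
val_set = det.get_subset('val')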