From dbfadc0adfa9f44bfb540e3abdc4ad59e8c3d8ca Mon Sep 17 00:00:00 2001 From: Emily Chun Date: Mon, 10 May 2021 16:42:31 +0900 Subject: [PATCH] update detection splitter algorithm from # of samples to # of instances (#235) --- datumaro/plugins/splitter.py | 118 ++++++++++++++++++----------------- docs/user_manual.md | 10 ++- tests/test_splitter.py | 49 ++++++++------- 3 files changed, 91 insertions(+), 86 deletions(-) diff --git a/datumaro/plugins/splitter.py b/datumaro/plugins/splitter.py index abc391ab19db..786828b3c58f 100644 --- a/datumaro/plugins/splitter.py +++ b/datumaro/plugins/splitter.py @@ -4,6 +4,7 @@ import logging as log import numpy as np +import copy from math import gcd from enum import Enum @@ -295,7 +296,7 @@ def _get_sections(dataset_size, ratio): def _group_by_attr(items): """ Args: - items: list of (idx, ann). ann is the annotation from Label object. + items: list of (idx_img, ann). ann is the annotation from Label object. Returns: by_attributes: dict of { combination-of-attrs : list of index } """ @@ -315,17 +316,17 @@ def _is_float(value): # group by attributes by_attributes = dict() - for idx, ann in items: + for idx_img, ann in items: # ignore numeric attributes filtered = {} - for k, v in ann.attributes.items(): - if _is_float(v): + for attr, value in ann.attributes.items(): + if _is_float(value): continue - filtered[k] = v + filtered[attr] = value attributes = tuple(sorted(filtered.items())) if attributes not in by_attributes: by_attributes[attributes] = [] - by_attributes[attributes].append(idx) + by_attributes[attributes].append(idx_img) return by_attributes @@ -344,9 +345,9 @@ def _split_indice(indice): for _, items in datasets.items(): np.random.shuffle(items) by_attributes = self._group_by_attr(items) - attr_names = list(by_attributes.keys()) - np.random.shuffle(attr_names) # add randomness - for attr in attr_names: + attr_combinations = list(by_attributes.keys()) + np.random.shuffle(attr_combinations) # add randomness + for attr in attr_combinations: indice = by_attributes[attr] quo = len(indice) // required if quo > 0: @@ -719,17 +720,19 @@ def __init__(self, dataset, splits, task, seed=None): def _group_by_labels(self, dataset): by_labels = dict() unlabeled = [] + for idx, item in enumerate(dataset): - bbox_anns = [a for a in item.annotations if a.type in self.annotation_type] - if len(bbox_anns) == 0: + instance_anns = [a for a in item.annotations if a.type in self.annotation_type] + if len(instance_anns) == 0: unlabeled.append(idx) continue - for ann in bbox_anns: - label = getattr(ann, "label", None) + for instance_ann in instance_anns: + label = getattr(instance_ann, "label", None) if label not in by_labels: - by_labels[label] = [(idx, ann)] + by_labels[label] = [(idx, instance_ann)] else: - by_labels[label].append((idx, ann)) + by_labels[label].append((idx, instance_ann)) + return by_labels, unlabeled def _split_dataset(self): @@ -746,79 +749,80 @@ def _split_dataset(self): for _, items in by_labels.items(): by_attributes = self._group_by_attr(items) # merge groups which have too small samples. 
- attr_names = list(by_attributes.keys()) - np.random.shuffle(attr_names) # add randomless + attr_combinations = list(by_attributes.keys()) + np.random.shuffle(attr_combinations) # add randomless cluster = [] - minumum = max(required, len(items) * 0.1) # temp solution - for attr in attr_names: + min_cluster = max(required, len(items) * 0.01) # temp solution + for attr in attr_combinations: indice = by_attributes[attr] - if len(indice) >= minumum: + if len(indice) >= min_cluster: by_combinations.append(indice) else: cluster.extend(indice) - if len(cluster) >= minumum: + if len(cluster) >= min_cluster: by_combinations.append(cluster) cluster = [] + if len(cluster) > 0: by_combinations.append(cluster) cluster = [] total = len(self._extractor) - # total number of GT samples per label-attr combinations n_combs = [len(v) for v in by_combinations] # 3-1. initially count per-image GT samples counts_all = {} - for idx in range(total): - if idx not in unlabeled: - counts_all[idx] = dict() + for idx_img in range(total): + if idx_img not in unlabeled: + counts_all[idx_img] = dict() for idx_comb, indice in enumerate(by_combinations): - for idx in indice: - if idx_comb not in counts_all[idx]: - counts_all[idx] = {idx_comb: 1} + for idx_img in indice: + if idx_comb not in counts_all[idx_img]: + counts_all[idx_img][idx_comb] = 1 else: - counts_all[idx][idx_comb] += 1 - - init_scores = {} - for idx, counts in counts_all.items(): - norm_sum = 0.0 - for idx_comb, count in counts.items(): - norm_sum += count / n_combs[idx_comb] - init_scores[idx] = norm_sum + counts_all[idx_img][idx_comb] += 1 by_splits = dict() for sname in self._subsets: by_splits[sname] = [] - target_size = dict() - expected = [] # expected numbers of per split GT samples + target_ins = [] # target instance numbers to be split for sname, ratio in zip(subsets, sratio): - target_size[sname] = (total - len(unlabeled)) * ratio - expected.append([sname, np.array(n_combs) * ratio]) + target_ins.append([sname, np.array(n_combs) * ratio]) + + init_scores = {} + for idx_img, distributions in counts_all.items(): + norm_sum = 0.0 + for idx_comb, dis in distributions.items(): + norm_sum += dis / n_combs[idx_comb] + init_scores[idx_img] = norm_sum - # functions for keep the # of annotations not exceed the expected num + by_scores = dict() + for idx_img, score in init_scores.items(): + if score not in by_scores: + by_scores[score] = [idx_img] + else: + by_scores[score].append(idx_img) + + # functions for keep the # of annotations not exceed the target_ins num def compute_penalty(counts, n_combs): p = 0 for idx_comb, v in counts.items(): - p += max(0, (v / n_combs[idx_comb]) - 1.0) + if n_combs[idx_comb] <= 0: + p += 1 + else: + p += max(0, (v / n_combs[idx_comb]) - 1.0) + return p def update_nc(counts, n_combs): for idx_comb, v in counts.items(): - n_combs[idx_comb] = max(0, n_combs[idx_comb] - v) - if n_combs[idx_comb] == 0: - n_combs[idx_comb] = -1 - - by_scores = dict() - for idx, score in init_scores.items(): - if score not in by_scores: - by_scores[score] = [idx] - else: - by_scores[score].append(idx) + n_combs[idx_comb] = n_combs[idx_comb] - v # 3-2. 
assign each DatasetItem to a split, one by one
+        actual_ins = copy.deepcopy(target_ins)
         for score in sorted(by_scores.keys(), reverse=True):
             indice = by_scores[score]
             np.random.shuffle(indice)  # add randomness for the same score
@@ -827,12 +831,12 @@ def update_nc(counts, n_combs):
                 counts = counts_all[idx]
                 # shuffling split order to add randomness
                 # when two or more splits have the same penalty value
-                np.random.shuffle(expected)
+                np.random.shuffle(actual_ins)
 
                 pp = []
-                for sname, nc in expected:
-                    if target_size[sname] <= len(by_splits[sname]):
-                        # the split has enough images,
+                for sname, nc in actual_ins:
+                    if np.sum(nc) <= 0:
+                        # the split has enough instances,
                         # stop adding more images to this split
                         pp.append(1e08)
                     else:
@@ -842,7 +846,7 @@ def update_nc(counts, n_combs):
 
                 # we push an image to a split with the minimum penalty
                 midx = np.argmin(pp)
-                sname, nc = expected[midx]
+                sname, nc = actual_ins[midx]
                 by_splits[sname].append(idx)
                 update_nc(counts, nc)
 
diff --git a/docs/user_manual.md b/docs/user_manual.md
index a1602be2ec4e..df9bb331242f 100644
--- a/docs/user_manual.md
+++ b/docs/user_manual.md
@@ -1037,8 +1037,8 @@ Example: split a dataset randomly to `train` and `test` subsets, ratio is 2:1
 datum transform -t random_split -- --subset train:.67 --subset test:.33
 ```
 
-Example: split a dataset in task-specific manner. Supported tasks are
-classification, detection, re-identification and segmentation.
+Example: split a dataset in a task-specific manner. The tasks supported are
+classification, detection, segmentation and re-identification.
 
 ``` bash
 datum transform -t split -- \
@@ -1081,9 +1081,7 @@ datum transform -t rename -- -e '|pattern|replacement|'
 datum transform -t rename -- -e '|frame_(\d+)|\\1|'
 ```
 
-Example: Sampling dataset items, subset `train` is divided into `sampled`(sampled_subset) and `unsampled`
-- `train` has 100 data, and 20 samples are selected. There are `sampled`(20 samples) and 80 `unsampled`(80 datas) subsets.
-- Remove `train` subset (if sampled_subset=`train` or unsampled_name=`train`, still remain)
+Example: sample as many dataset items as the target number of samples, using a sampling method chosen by the user, and divide them into `sampled` and `unsampled` subsets
 - There are five methods of sampling the m option.
- `topk`: Return the k with high uncertainty data - `lowk`: Return the k with low uncertainty data @@ -1101,7 +1099,7 @@ datum transform -t sampler -- \ -k 20 ``` -Example : Control number of outputs to 100 after NDR +Example : control number of outputs to 100 after NDR - There are two methods in NDR e option - `random`: sample from removed data randomly - `similarity`: sample from removed data with ascending diff --git a/tests/test_splitter.py b/tests/test_splitter.py index 4c233f0eb22e..a6b778e19271 100644 --- a/tests/test_splitter.py +++ b/tests/test_splitter.py @@ -79,7 +79,7 @@ def test_split_for_classification_multi_class_no_attr(self): task = splitter.SplitTask.classification.name splits = [("train", 0.7), ("test", 0.3)] - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=100) self.assertEqual(42, len(actual.get_subset("train"))) self.assertEqual(18, len(actual.get_subset("test"))) @@ -105,7 +105,7 @@ def test_split_for_classification_single_class_single_attr(self): task = splitter.SplitTask.classification.name splits = [("train", 0.7), ("test", 0.3)] - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=100) self.assertEqual(42, len(actual.get_subset("train"))) self.assertEqual(18, len(actual.get_subset("test"))) @@ -140,7 +140,7 @@ def test_split_for_classification_single_class_multi_attr(self): with self.subTest("zero remainder"): splits = [("train", 0.7), ("test", 0.3)] - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=100) self.assertEqual(84, len(actual.get_subset("train"))) self.assertEqual(36, len(actual.get_subset("test"))) @@ -165,7 +165,7 @@ def test_split_for_classification_single_class_multi_attr(self): with self.subTest("non-zero remainder"): splits = [("train", 0.95), ("test", 0.05)] - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=100) self.assertEqual(114, len(actual.get_subset("train"))) self.assertEqual(6, len(actual.get_subset("test"))) @@ -189,7 +189,7 @@ def test_split_for_classification_multi_label_with_attr(self): task = splitter.SplitTask.classification.name splits = [("train", 0.7), ("test", 0.3)] - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=100) train = actual.get_subset("train") test = actual.get_subset("test") @@ -243,7 +243,7 @@ def test_split_for_classification_zero_ratio(self): splits = [("train", 0.1), ("val", 0.9), ("test", 0.0)] task = splitter.SplitTask.classification.name - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=100) self.assertEqual(1, len(actual.get_subset("train"))) self.assertEqual(4, len(actual.get_subset("val"))) @@ -255,7 +255,7 @@ def test_split_for_classification_unlabeled(self): source = Dataset.from_iterable(iterable, categories=["a", "b"]) splits = [("train", 0.7), ("test", 0.3)] task = splitter.SplitTask.classification.name - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=100) self.assertEqual(7, len(actual.get_subset("train"))) self.assertEqual(3, len(actual.get_subset("test"))) @@ -266,7 +266,7 @@ def test_split_for_classification_unlabeled(self): source = Dataset.from_iterable(iterable, categories=["a", "b"]) splits = [("train", 0.7), ("test", 0.3)] task = splitter.SplitTask.classification.name - actual = splitter.Split(source, task, splits) + actual = 
splitter.Split(source, task, splits, seed=100) self.assertEqual(7, len(actual.get_subset("train"))) self.assertEqual(3, len(actual.get_subset("test"))) @@ -405,7 +405,7 @@ def test_split_for_reidentification_rebalance(self): task = splitter.SplitTask.reid.name splits = [("train", 0.5), ("val", 0.2), ("test", 0.3)] query = 0.4 / 0.7 - actual = splitter.Split(source, task, splits, query) + actual = splitter.Split(source, task, splits, query, seed=100) self.assertEqual(350, len(actual.get_subset("train"))) self.assertEqual(140, len(actual.get_subset("val"))) @@ -420,7 +420,7 @@ def test_split_for_reidentification_unlabeled(self): iterable = [DatasetItem(i, annotations=[]) for i in range(10)] source = Dataset.from_iterable(iterable, categories=["a", "b"]) splits = [("train", 0.6), ("test", 0.4)] - actual = splitter.Split(source, task, splits, query) + actual = splitter.Split(source, task, splits, query, seed=100) self.assertEqual(10, len(actual.get_subset("not-supported"))) with self.subTest("multi label"): @@ -428,7 +428,7 @@ def test_split_for_reidentification_unlabeled(self): iterable = [DatasetItem(i, annotations=anns) for i in range(10)] source = Dataset.from_iterable(iterable, categories=["a", "b"]) splits = [("train", 0.6), ("test", 0.4)] - actual = splitter.Split(source, task, splits, query) + actual = splitter.Split(source, task, splits, query, seed=100) self.assertEqual(10, len(actual.get_subset("not-supported"))) @@ -827,7 +827,7 @@ def test_split_for_detection(self): test=test, task=task, ): - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=100) self.assertEqual(train, len(actual.get_subset("train"))) self.assertEqual(val, len(actual.get_subset("val"))) @@ -858,7 +858,7 @@ def test_split_for_detection_with_unlabeled(self): splits = [("train", 0.5), ("val", 0.2), ("test", 0.3)] task = splitter.SplitTask.detection.name - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=100) self.assertEqual(10, len(actual.get_subset("train"))) self.assertEqual(4, len(actual.get_subset("val"))) self.assertEqual(6, len(actual.get_subset("test"))) @@ -898,7 +898,7 @@ def test_no_subset_name_and_count_restriction(self): config = {"label1": {"attrs": None, "counts": 10}} task = splitter.SplitTask.classification.name source = self._generate_dataset(config) - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=100) self.assertEqual(5, len(actual.get_subset("_train"))) self.assertEqual(1, len(actual.get_subset("valid"))) self.assertEqual(1, len(actual.get_subset("valid2"))) @@ -912,10 +912,10 @@ def test_no_subset_name_and_count_restriction(self): nimages=10, ) task = splitter.SplitTask.detection.name - actual = splitter.Split(source, task, splits) - self.assertEqual(5, len(actual.get_subset("_train"))) + actual = splitter.Split(source, task, splits, seed=21) + self.assertEqual(4, len(actual.get_subset("_train"))) self.assertEqual(1, len(actual.get_subset("valid"))) - self.assertEqual(1, len(actual.get_subset("valid2"))) + self.assertEqual(2, len(actual.get_subset("valid2"))) self.assertEqual(2, len(actual.get_subset("test*"))) self.assertEqual(1, len(actual.get_subset("test2"))) @@ -926,7 +926,7 @@ def test_no_subset_name_and_count_restriction(self): nimages=10, ) task = splitter.SplitTask.detection.name - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=100) self.assertEqual(5, len(actual.get_subset("_train"))) 
self.assertEqual(1, len(actual.get_subset("valid"))) self.assertEqual(1, len(actual.get_subset("valid2"))) @@ -938,7 +938,7 @@ def test_no_subset_name_and_count_restriction(self): with_attr=True, nimages=10, ) - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=100) self.assertEqual(5, len(actual.get_subset("_train"))) self.assertEqual(1, len(actual.get_subset("valid"))) self.assertEqual(1, len(actual.get_subset("valid2"))) @@ -977,7 +977,7 @@ def test_split_for_segmentation(self): test=test, task=task, ): - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=100) self.assertEqual(train, len(actual.get_subset("train"))) self.assertEqual(val, len(actual.get_subset("val"))) @@ -1008,6 +1008,7 @@ def test_split_for_segmentation(self): params.append((dtype, with_attr, 10, 5, 3, 2)) params.append((dtype, with_attr, 10, 7, 0, 3)) + expected = [] for dtype, with_attr, nimages, train, val, test in params: source, _ = self._generate_detection_segmentation_dataset( annotation_type=self._get_append_polygon(dtype), @@ -1029,7 +1030,9 @@ def test_split_for_segmentation(self): test=test, task=task, ): - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=21) + + expected.append([dtype, with_attr, len(actual.get_subset("train")), len(actual.get_subset("val")), len(actual.get_subset("test"))]) self.assertEqual(train, len(actual.get_subset("train"))) self.assertEqual(val, len(actual.get_subset("val"))) @@ -1064,7 +1067,7 @@ def test_split_for_segmentation_with_unlabeled(self): splits = [("train", 0.5), ("val", 0.2), ("test", 0.3)] task = splitter.SplitTask.segmentation.name - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=100) self.assertEqual(10, len(actual.get_subset("train"))) self.assertEqual(4, len(actual.get_subset("val"))) self.assertEqual(6, len(actual.get_subset("test"))) @@ -1080,7 +1083,7 @@ def test_split_for_segmentation_with_unlabeled(self): splits = [("train", 0.5), ("val", 0.2), ("test", 0.3)] task = splitter.SplitTask.segmentation.name - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=100) self.assertEqual(10, len(actual.get_subset("train"))) self.assertEqual(4, len(actual.get_subset("val"))) self.assertEqual(6, len(actual.get_subset("test")))
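
Note on the algorithm: the splitter.py hunks above replace the image-count bookkeeping (`target_size`, `expected`) with per-instance budgets (`target_ins`, `actual_ins`). Each split receives a target number of annotation instances per label/attribute combination, images are visited in descending score order, each image is pushed to the split with the smallest penalty, and that split's budget is then reduced by the image's instance counts. The sketch below is a self-contained illustration of that idea with simplified, hypothetical names and a toy dataset; it is not the `datumaro/plugins/splitter.py` implementation itself.

``` python
import numpy as np

def greedy_instance_split(counts_all, n_combs, split_ratios, seed=None):
    """Assign images to splits so that each split receives roughly its
    ratio of annotation instances per label/attribute combination.

    counts_all:   {image index: {combination index: instance count}}
    n_combs:      total instance count per combination
    split_ratios: {split name: ratio}, ratios summing to 1
    """
    rng = np.random.default_rng(seed)
    n_combs = np.asarray(n_combs, dtype=float)

    # per-split instance budget for every combination
    budgets = {name: n_combs * ratio for name, ratio in split_ratios.items()}
    by_splits = {name: [] for name in split_ratios}

    # images holding a larger share of rare combinations are placed first
    def score(counts):
        return sum(v / n_combs[c] for c, v in counts.items())

    def penalty(counts, remaining):
        p = 0.0
        for c, v in counts.items():
            if remaining[c] <= 0:           # this combination is already filled
                p += 1.0
            else:
                p += max(0.0, v / remaining[c] - 1.0)
        return p

    for idx in sorted(counts_all, key=lambda i: score(counts_all[i]), reverse=True):
        counts = counts_all[idx]
        names = list(split_ratios)
        rng.shuffle(names)                  # random tie-breaking between splits
        pens = [1e08 if budgets[n].sum() <= 0 else penalty(counts, budgets[n])
                for n in names]
        best = names[int(np.argmin(pens))]
        by_splits[best].append(idx)
        for c, v in counts.items():         # consume the chosen split's budget
            budgets[best][c] -= v
    return by_splits

# toy example: 6 images, 2 label/attribute combinations, 2:1 train/test split
counts_all = {0: {0: 2}, 1: {0: 1, 1: 1}, 2: {1: 3},
              3: {0: 1}, 4: {1: 1}, 5: {0: 2, 1: 1}}
print(greedy_instance_split(counts_all, [6, 6], {"train": 2 / 3, "test": 1 / 3}, seed=0))
```

Because a split stops receiving images once its instance budget is exhausted (the 1e08 penalty), the expected subset sizes for the detection and segmentation tasks shift slightly, which is what the updated assertions in tests such as test_no_subset_name_and_count_restriction reflect.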