
Commit

update detection splitter algorithm from # of samples to # of instances (cvat-ai#235)
chuneuny-emily authored May 10, 2021
1 parent 4375cdb commit dbfadc0
Showing 3 changed files with 91 additions and 86 deletions.
118 changes: 61 additions & 57 deletions datumaro/plugins/splitter.py
@@ -4,6 +4,7 @@

import logging as log
import numpy as np
import copy
from math import gcd
from enum import Enum

@@ -295,7 +296,7 @@ def _get_sections(dataset_size, ratio):
def _group_by_attr(items):
"""
Args:
items: list of (idx, ann). ann is the annotation from Label object.
items: list of (idx_img, ann). ann is the annotation from Label object.
Returns:
by_attributes: dict of { combination-of-attrs : list of index }
"""
@@ -315,17 +316,17 @@ def _is_float(value):

# group by attributes
by_attributes = dict()
for idx, ann in items:
for idx_img, ann in items:
# ignore numeric attributes
filtered = {}
for k, v in ann.attributes.items():
if _is_float(v):
for attr, value in ann.attributes.items():
if _is_float(value):
continue
filtered[k] = v
filtered[attr] = value
attributes = tuple(sorted(filtered.items()))
if attributes not in by_attributes:
by_attributes[attributes] = []
by_attributes[attributes].append(idx)
by_attributes[attributes].append(idx_img)

return by_attributes

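For readers skimming the diff, here is a minimal, self-contained sketch of the grouping behaviour in the hunk above. The `Ann` namedtuple, the simplified `is_float` helper and the sample data are illustrative stand-ins, not Datumaro APIs; only the grouping logic mirrors the plugin.

```python
from collections import namedtuple

# hypothetical stand-in for a Label annotation carrying an `attributes` dict
Ann = namedtuple("Ann", ["label", "attributes"])

def is_float(value):
    # simplified stand-in for the plugin's `_is_float` check
    try:
        float(value)
        return True
    except (TypeError, ValueError):
        return False

def group_by_attr(items):
    """items: list of (idx_img, ann) -> {attr-combination: [idx_img, ...]}"""
    by_attributes = {}
    for idx_img, ann in items:
        # numeric-looking attribute values are skipped, as in the plugin
        filtered = {k: v for k, v in ann.attributes.items() if not is_float(v)}
        key = tuple(sorted(filtered.items()))
        by_attributes.setdefault(key, []).append(idx_img)
    return by_attributes

items = [
    (0, Ann(0, {"occluded": "no", "score": "0.9"})),  # "score" parses as float -> ignored
    (1, Ann(0, {"occluded": "no"})),
    (2, Ann(0, {"occluded": "yes"})),
]
print(group_by_attr(items))
# {(('occluded', 'no'),): [0, 1], (('occluded', 'yes'),): [2]}
```

As in the plugin, numeric-looking attribute values are dropped, so only categorical attributes contribute to the grouping key.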
@@ -344,9 +345,9 @@ def _split_indice(indice):
for _, items in datasets.items():
np.random.shuffle(items)
by_attributes = self._group_by_attr(items)
attr_names = list(by_attributes.keys())
np.random.shuffle(attr_names) # add randomness
for attr in attr_names:
attr_combinations = list(by_attributes.keys())
np.random.shuffle(attr_combinations) # add randomness
for attr in attr_combinations:
indice = by_attributes[attr]
quo = len(indice) // required
if quo > 0:
@@ -719,17 +720,19 @@ def __init__(self, dataset, splits, task, seed=None):
def _group_by_labels(self, dataset):
by_labels = dict()
unlabeled = []

for idx, item in enumerate(dataset):
bbox_anns = [a for a in item.annotations if a.type in self.annotation_type]
if len(bbox_anns) == 0:
instance_anns = [a for a in item.annotations if a.type in self.annotation_type]
if len(instance_anns) == 0:
unlabeled.append(idx)
continue
for ann in bbox_anns:
label = getattr(ann, "label", None)
for instance_ann in instance_anns:
label = getattr(instance_ann, "label", None)
if label not in by_labels:
by_labels[label] = [(idx, ann)]
by_labels[label] = [(idx, instance_ann)]
else:
by_labels[label].append((idx, ann))
by_labels[label].append((idx, instance_ann))

return by_labels, unlabeled

def _split_dataset(self):
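The renames above (`bbox_anns` → `instance_anns`, `ann` → `instance_ann`) reflect the point of this commit: grouping is done per instance annotation, so an image contributes one entry for each labelled instance it contains rather than a single entry overall. A rough, self-contained illustration of the resulting structure, using hypothetical `Item`/`Ann` stand-ins rather than Datumaro classes:

```python
from collections import namedtuple

Ann = namedtuple("Ann", ["type", "label"])
Item = namedtuple("Item", ["annotations"])

BBOX = "bbox"  # stand-in for the annotation types the splitter accepts

def group_by_labels(dataset, annotation_types=(BBOX,)):
    by_labels, unlabeled = {}, []
    for idx, item in enumerate(dataset):
        instance_anns = [a for a in item.annotations if a.type in annotation_types]
        if not instance_anns:
            unlabeled.append(idx)
            continue
        for ann in instance_anns:
            # one entry per instance, so images with many instances carry more weight
            by_labels.setdefault(ann.label, []).append((idx, ann))
    return by_labels, unlabeled

dataset = [
    Item([Ann(BBOX, "car"), Ann(BBOX, "car"), Ann(BBOX, "person")]),
    Item([Ann(BBOX, "person")]),
    Item([]),  # no instance annotations -> goes to `unlabeled`
]
by_labels, unlabeled = group_by_labels(dataset)
print({label: [idx for idx, _ in pairs] for label, pairs in by_labels.items()})
# {'car': [0, 0], 'person': [0, 1]}
print(unlabeled)  # [2]
```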
@@ -746,79 +749,80 @@ def _split_dataset(self):
for _, items in by_labels.items():
by_attributes = self._group_by_attr(items)
# merge groups which have too small samples.
attr_names = list(by_attributes.keys())
np.random.shuffle(attr_names) # add randomness
attr_combinations = list(by_attributes.keys())
np.random.shuffle(attr_combinations) # add randomness
cluster = []
minumum = max(required, len(items) * 0.1) # temp solution
for attr in attr_names:
min_cluster = max(required, len(items) * 0.01) # temp solution
for attr in attr_combinations:
indice = by_attributes[attr]
if len(indice) >= minumum:
if len(indice) >= min_cluster:
by_combinations.append(indice)
else:
cluster.extend(indice)
if len(cluster) >= minumum:
if len(cluster) >= min_cluster:
by_combinations.append(cluster)
cluster = []

if len(cluster) > 0:
by_combinations.append(cluster)
cluster = []

total = len(self._extractor)

# total number of GT samples per label-attr combinations
n_combs = [len(v) for v in by_combinations]

# 3-1. initially count per-image GT samples
counts_all = {}
for idx in range(total):
if idx not in unlabeled:
counts_all[idx] = dict()
for idx_img in range(total):
if idx_img not in unlabeled:
counts_all[idx_img] = dict()

for idx_comb, indice in enumerate(by_combinations):
for idx in indice:
if idx_comb not in counts_all[idx]:
counts_all[idx] = {idx_comb: 1}
for idx_img in indice:
if idx_comb not in counts_all[idx_img]:
counts_all[idx_img][idx_comb] = 1
else:
counts_all[idx][idx_comb] += 1

init_scores = {}
for idx, counts in counts_all.items():
norm_sum = 0.0
for idx_comb, count in counts.items():
norm_sum += count / n_combs[idx_comb]
init_scores[idx] = norm_sum
counts_all[idx_img][idx_comb] += 1

by_splits = dict()
for sname in self._subsets:
by_splits[sname] = []

target_size = dict()
expected = [] # expected numbers of per split GT samples
target_ins = [] # target instance numbers to be split
for sname, ratio in zip(subsets, sratio):
target_size[sname] = (total - len(unlabeled)) * ratio
expected.append([sname, np.array(n_combs) * ratio])
target_ins.append([sname, np.array(n_combs) * ratio])

init_scores = {}
for idx_img, distributions in counts_all.items():
norm_sum = 0.0
for idx_comb, dis in distributions.items():
norm_sum += dis / n_combs[idx_comb]
init_scores[idx_img] = norm_sum

# functions for keep the # of annotations not exceed the expected num
by_scores = dict()
for idx_img, score in init_scores.items():
if score not in by_scores:
by_scores[score] = [idx_img]
else:
by_scores[score].append(idx_img)

# functions for keep the # of annotations not exceed the target_ins num
def compute_penalty(counts, n_combs):
p = 0
for idx_comb, v in counts.items():
p += max(0, (v / n_combs[idx_comb]) - 1.0)
if n_combs[idx_comb] <= 0:
p += 1
else:
p += max(0, (v / n_combs[idx_comb]) - 1.0)

return p

def update_nc(counts, n_combs):
for idx_comb, v in counts.items():
n_combs[idx_comb] = max(0, n_combs[idx_comb] - v)
if n_combs[idx_comb] == 0:
n_combs[idx_comb] = -1

by_scores = dict()
for idx, score in init_scores.items():
if score not in by_scores:
by_scores[score] = [idx]
else:
by_scores[score].append(idx)
n_combs[idx_comb] = n_combs[idx_comb] - v

# 3-2. assign each DatasetItem to a split, one by one
actual_ins = copy.deepcopy(target_ins)
for score in sorted(by_scores.keys(), reverse=True):
indice = by_scores[score]
np.random.shuffle(indice) # add randomness for the same score
@@ -827,12 +831,12 @@ def update_nc(counts, n_combs):
counts = counts_all[idx]
# shuffling split order to add randomness
# when two or more splits have the same penalty value
np.random.shuffle(expected)
np.random.shuffle(actual_ins)

pp = []
for sname, nc in expected:
if target_size[sname] <= len(by_splits[sname]):
# the split has enough images,
for sname, nc in actual_ins:
if np.sum(nc) <= 0:
# the split has enough instances,
# stop adding more images to this split
pp.append(1e08)
else:
Expand All @@ -842,7 +846,7 @@ def update_nc(counts, n_combs):

# we push an image to a split with the minimum penalty
midx = np.argmin(pp)
sname, nc = expected[midx]
sname, nc = actual_ins[midx]
by_splits[sname].append(idx)
update_nc(counts, nc)

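Taken together, the splitter.py changes swap the old image-count stopping rule (`target_size[sname] <= len(by_splits[sname])`) for instance-count bookkeeping: each split keeps a vector of remaining target instances per label-attribute combination (`actual_ins`, a deep copy of `target_ins`), `compute_penalty` charges a flat cost of 1 for combinations that are already exhausted (`n_combs[idx_comb] <= 0`), `update_nc` lets the remaining counts go negative, and a split stops receiving images once its whole vector is used up (`np.sum(nc) <= 0`). A condensed sketch of that assignment loop on toy data (it drops the shuffling the plugin uses for tie-breaking and is not the Datumaro implementation itself):

```python
import copy
import numpy as np

# toy inputs: counts_all[idx_img] = {idx_comb: number of GT instances in the image}
counts_all = {
    0: {0: 2},        # image 0: two instances of combination 0
    1: {0: 1, 1: 1},  # image 1: one instance of each combination
    2: {1: 2},
    3: {0: 1},
    4: {1: 1},
}
n_combs = [4, 4]                      # total GT instances per combination
splits = [("train", 0.5), ("test", 0.5)]

# target instance numbers per split, mirroring `target_ins` above
target_ins = [[sname, np.array(n_combs) * ratio] for sname, ratio in splits]
actual_ins = copy.deepcopy(target_ins)
by_splits = {sname: [] for sname, _ in splits}

def compute_penalty(counts, nc):
    # exhausted combinations (<= 0 left) cost a flat 1; otherwise penalize overshoot
    p = 0.0
    for idx_comb, v in counts.items():
        if nc[idx_comb] <= 0:
            p += 1
        else:
            p += max(0, (v / nc[idx_comb]) - 1.0)
    return p

def update_nc(counts, nc):
    # remaining targets may go negative; <= 0 simply means "this split is full"
    for idx_comb, v in counts.items():
        nc[idx_comb] = nc[idx_comb] - v

# process images with the largest share of rare combinations first
order = sorted(counts_all,
               key=lambda i: sum(v / n_combs[c] for c, v in counts_all[i].items()),
               reverse=True)
for idx_img in order:
    counts = counts_all[idx_img]
    penalties = [1e08 if np.sum(nc) <= 0 else compute_penalty(counts, nc)
                 for _, nc in actual_ins]
    sname, nc = actual_ins[int(np.argmin(penalties))]
    by_splits[sname].append(idx_img)
    update_nc(counts, nc)

print(by_splits)  # {'train': [0, 2], 'test': [1, 3, 4]} -> 4 GT instances per split
```

On this toy input the sketch puts images 0 and 2 into `train` and images 1, 3 and 4 into `test`, giving each split four ground-truth instances, which is the per-instance balance the new bookkeeping targets.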
10 changes: 4 additions & 6 deletions docs/user_manual.md
@@ -1037,8 +1037,8 @@ Example: split a dataset randomly to `train` and `test` subsets, ratio is 2:1
datum transform -t random_split -- --subset train:.67 --subset test:.33
```

Example: split a dataset in task-specific manner. Supported tasks are
classification, detection, re-identification and segmentation.
Example: split a dataset in a task-specific manner. The tasks supported are
classification, detection, segmentation and re-identification.

``` bash
datum transform -t split -- \
@@ -1081,9 +1081,7 @@ datum transform -t rename -- -e '|pattern|replacement|'
datum transform -t rename -- -e '|frame_(\d+)|\\1|'
```

Example: Sampling dataset items, subset `train` is divided into `sampled`(sampled_subset) and `unsampled`
- `train` has 100 data, and 20 samples are selected. There are `sampled`(20 samples) and 80 `unsampled`(80 datas) subsets.
- Remove `train` subset (if sampled_subset=`train` or unsampled_name=`train`, still remain)
Example: sample as many dataset items as the target number of samples, using the sampling method chosen by the user, and divide them into `sampled` and `unsampled` subsets
- There are five sampling methods for the `m` option:
- `topk`: return the k items with the highest uncertainty
- `lowk`: return the k items with the lowest uncertainty
@@ -1101,7 +1099,7 @@ datum transform -t sampler -- \
-k 20
```

Example : Control number of outputs to 100 after NDR
Example: control the number of outputs to 100 after NDR
- There are two methods for the NDR `e` option:
- `random`: sample randomly from the removed data
- `similarity`: sample from removed data with ascending