diff --git a/.travis.yml b/.travis.yml
index 24843e8ee6..6b9f252237 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -31,6 +31,7 @@ matrix:
 install:
   - pip install -e ./
   - pip install tensorflow
+  - pip install pandas
 script:
   - python -m unittest discover -v
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 12e1062671..001cc1ddf6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -33,6 +33,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - `LFW` dataset format ()
 - Support of polygons' and masks' confusion matrices and mismatching classes in `diff` command ()
 - Add near duplicate image removal plugin ()
+- Sampler plugin that analyzes inference results from the given dataset and selects samples for annotation ()
 
 ### Changed
 - OpenVINO model launcher is updated for OpenVINO r2021.1 ()
diff --git a/README.md b/README.md
index 5a4e582943..65aa2817c2 100644
--- a/README.md
+++ b/README.md
@@ -158,6 +158,11 @@ CVAT annotations ---> Publication, statistics etc.
     - for detection task, based on bboxes
     - for re-identification task, based on labels,
      avoiding having same IDs in training and test splits
+  - Sampling a dataset
+    - analyzes inference results from the given dataset and selects
+      the ‘best’ and the ‘fewest’ samples for annotation (see the sketch below)
+    - selects the samples that best suit model training
+    - samples with an entropy-based algorithm
   - Dataset quality checking
     - Simple checking for errors
     - Comparison with model inference
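The entropy-based uncertainty score mentioned above is the Shannon entropy of an image's class probabilities, H = -Σ p·log p, computed per image in `entropy.py` below. A minimal standalone sketch (illustrative only, not part of the patch; the probability values are made up):

```python
import math

def entropy_uncertainty(probs):
    """Shannon entropy of one image's class-probability vector."""
    # The small epsilon mirrors entropy.py and guards against log(0).
    return -sum(p * math.log(p + 1e-14) for p in probs)

# A confident prediction scores low, an uncertain one scores high:
print(entropy_uncertainty([0.98, 0.01, 0.01]))  # ~0.112
print(entropy_uncertainty([0.4, 0.3, 0.3]))     # ~1.089
```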
+ """ + super().__init__(data, inference) + + # check the existence of "ImageID" in data & inference + if "ImageID" not in data: + raise Exception("Invalid Data, ImageID not found in data") + if "ImageID" not in inference: + raise Exception("Invalid Data, ImageID not found in inference") + + # check the existence of "ClassProbability" in inference + self.num_classes = 0 + for head in list(inference): + m = re.match("ClassProbability\d+", head) + if m is not None: + self.num_classes += 1 + + if not self.num_classes > 0: + raise Exception( + "Invalid data, Inference do not have ClassProbability values!" + ) + + # rank: The inference DataFrame, sorted according to the score. + self.rank = self._rank_images().sort_values(by="rank") + + def get_sample(self, method: str, k: int, n: int = 3) -> pd.DataFrame: + """ + A function that extracts sample data and returns it. + Args: + method: + - 'topk': It extracts the k sample data with the highest uncertainty. + - 'lowk': It extracts the k sample data with the lowest uncertainty. + - 'randomk': Extract and return random k sample data. + k: number of sample data + n: Parameters to be used in the randtopk method, Variable to first extract data of multiple n of k. + Returns: + Extracted sample data : pd.DataFrame + """ + temp_rank = self.rank + + # 1. k value check + if not isinstance(k, int): + raise Exception( + f"Invalid value {k}. k must have an integer greater than zero." + ) + elif k <= 0: + raise Exception( + f"Invalid number {k}. k must have a positive number greater than zero." + ) + + # 2. Select a sample according to the method + if k <= len(temp_rank): + if method == self.sampling_method.topk.name: + temp_rank = temp_rank[:k] + elif method == self.sampling_method.lowk.name: + temp_rank = temp_rank[-k:] + elif method == self.sampling_method.randk.name: + return self.data.sample(n=k).reset_index(drop=True) + elif method in [ + self.sampling_method.mixk.name, + self.sampling_method.randtopk.name, + ]: + return self._get_sample_mixed(method=method, k=k, n=n) + else: + raise Exception(f"Not Found method '{method}'") + else: + log.warning( + "The number of samples is greater than the size of the selected subset." + ) + + columns = list(self.data.columns) + merged_df = pd.merge(temp_rank, self.data, how="inner", on=["ImageID"]) + return merged_df[columns].reset_index(drop=True) + + def _get_sample_mixed(self, method: str, k: int, n: int = 3) -> pd.DataFrame: + """ + A function that extracts sample data and returns it. + Args: + method: + - 'mixk': Return top-k and low-k halves based on uncertainty. + - 'randomtopk': Randomly extract n*k and return k with high uncertainty. 
diff --git a/datumaro/plugins/sampler/algorithm/entropy.py b/datumaro/plugins/sampler/algorithm/entropy.py
new file mode 100644
index 0000000000..8ea69cf5aa
--- /dev/null
+++ b/datumaro/plugins/sampler/algorithm/entropy.py
@@ -0,0 +1,191 @@
+# Copyright (C) 2021 Intel Corporation
+#
+# SPDX-License-Identifier: MIT
+
+import logging as log
+import math
+import re
+
+import pandas as pd
+
+from .algorithm import InferenceResultAnalyzer
+
+
+class SampleEntropy(InferenceResultAnalyzer):
+    """
+    SampleEntropy is a class that inherits InferenceResultAnalyzer,
+    calculates an uncertainty score based on entropy,
+    and gets samples based on that score.
+    """
+
+    def __init__(self, data, inference):
+        """
+        Constructor function
+        Args:
+            data: dataset in pd.DataFrame format; an ImageID column is required.
+            inference:
+                inference results in pd.DataFrame format;
+                ImageID and ClassProbability columns are required.
+        """
+        super().__init__(data, inference)
+
+        # check the existence of "ImageID" in data & inference
+        if "ImageID" not in data:
+            raise Exception("Invalid Data, ImageID not found in data")
+        if "ImageID" not in inference:
+            raise Exception("Invalid Data, ImageID not found in inference")
+
+        # check the existence of "ClassProbability" in inference
+        self.num_classes = 0
+        for head in list(inference):
+            m = re.match(r"ClassProbability\d+", head)
+            if m is not None:
+                self.num_classes += 1
+
+        if not self.num_classes > 0:
+            raise Exception(
+                "Invalid data, Inference does not have ClassProbability values!"
+            )
+
+        # rank: the inference DataFrame, sorted according to the score
+        self.rank = self._rank_images().sort_values(by="rank")
+
+    def get_sample(self, method: str, k: int, n: int = 3) -> pd.DataFrame:
+        """
+        A function that extracts sample data and returns it.
+        Args:
+            method:
+                - 'topk': extracts the k samples with the highest uncertainty.
+                - 'lowk': extracts the k samples with the lowest uncertainty.
+                - 'randk': extracts k samples at random.
+                - 'mixk', 'randtopk': delegated to _get_sample_mixed().
+            k: number of samples
+            n: used by the randtopk method; n * k samples are drawn at random first.
+        Returns:
+            Extracted sample data : pd.DataFrame
+        """
+        temp_rank = self.rank
+
+        # 1. k value check
+        if not isinstance(k, int):
+            raise Exception(
+                f"Invalid value {k}. k must be an integer greater than zero."
+            )
+        elif k <= 0:
+            raise Exception(
+                f"Invalid number {k}. k must be a positive number greater than zero."
+            )
+
+        # 2. Select a sample according to the method
+        if k <= len(temp_rank):
+            if method == self.sampling_method.topk.name:
+                temp_rank = temp_rank[:k]
+            elif method == self.sampling_method.lowk.name:
+                temp_rank = temp_rank[-k:]
+            elif method == self.sampling_method.randk.name:
+                return self.data.sample(n=k).reset_index(drop=True)
+            elif method in [
+                self.sampling_method.mixk.name,
+                self.sampling_method.randtopk.name,
+            ]:
+                return self._get_sample_mixed(method=method, k=k, n=n)
+            else:
+                raise Exception(f"Not Found method '{method}'")
+        else:
+            log.warning(
+                "The number of samples is greater than the size of the selected subset."
+            )
+
+        columns = list(self.data.columns)
+        merged_df = pd.merge(temp_rank, self.data, how="inner", on=["ImageID"])
+        return merged_df[columns].reset_index(drop=True)
+
+    def _get_sample_mixed(self, method: str, k: int, n: int = 3) -> pd.DataFrame:
+        """
+        A function that extracts sample data and returns it.
+        Args:
+            method:
+                - 'mixk': returns the top-k and low-k halves based on uncertainty.
+                - 'randtopk': randomly extracts n * k samples and returns the k with the highest uncertainty.
+            k: number of samples
+            n: multiplier; n * k samples are drawn from the data, from which the top k are taken
+        Returns:
+            Extracted sample data : pd.DataFrame
+        """
+        temp_rank = self.rank
+
+        # Select a sample according to the method
+        if k <= len(temp_rank):
+            if method == self.sampling_method.mixk.name:
+                if k % 2 == 0:
+                    temp_rank = pd.concat([temp_rank[: k // 2], temp_rank[-(k // 2) :]])
+                else:
+                    temp_rank = pd.concat(
+                        [temp_rank[: (k // 2) + 1], temp_rank[-(k // 2) :]]
+                    )
+            elif method == self.sampling_method.randtopk.name:
+                if n * k <= len(temp_rank):
+                    temp_rank = temp_rank.sample(n=n * k).sort_values(by="rank")
+                else:
+                    log.warning(msg="n * k exceeds the length of the inference")
+                temp_rank = temp_rank[:k]
+
+        columns = list(self.data.columns)
+        merged_df = pd.merge(temp_rank, self.data, how="inner", on=["ImageID"])
+        return merged_df[columns].reset_index(drop=True)
+
+    def _rank_images(self) -> pd.DataFrame:
+        """
+        An internal function that ranks the inference data based on uncertainty.
+        Returns:
+            inference data sorted by uncertainty. pd.DataFrame
+        """
+        # 1. Load Inference
+        inference, res = None, None
+        if self.inference is not None:
+            inference = pd.DataFrame(self.inference)
+        else:
+            raise Exception("Invalid Data, Failed to load inference result!")
+
+        # 2. If the inference data frame does not contain an uncertainty score, calculate it
+        if "Uncertainty" not in inference:
+            inference = self._calculate_uncertainty_from_classprob(inference=inference)
+
+        # 3. Check that Uncertainty values are in place
+        na_df = inference.isna().sum()
+        if "Uncertainty" in na_df and na_df["Uncertainty"] > 0:
+            raise Exception("Some inference results do not have Uncertainty values!")
+
+        # 4. Rank based on the Uncertainty score
+        res = inference[["ImageID", "Uncertainty"]].groupby("ImageID").mean()
+        res["rank"] = res["Uncertainty"].rank(ascending=False, method="first")
+        res = res.reset_index()
+
+        return res
+
+    def _calculate_uncertainty_from_classprob(
+        self, inference: pd.DataFrame
+    ) -> pd.DataFrame:
+        """
+        A function that calculates uncertainty based on entropy from ClassProbability values.
+        Args:
+            inference: inference data where uncertainty has not been calculated
+        Returns:
+            inference data with an Uncertainty column
+        """
+
+        # Calculate entropy (uncertainty score)
+        uncertainty = []
+        for i in range(len(inference)):
+            entropy = 0
+            for j in range(self.num_classes):
+                p = inference.loc[i][f"ClassProbability{j+1}"]
+                if p < 0 or p > 1:
+                    raise Exception(
+                        "Invalid data, math domain error! p must be between 0 and 1"
+                    )
+                entropy -= p * math.log(p + 1e-14, math.e)
+
+            uncertainty.append(entropy)
+
+        inference["Uncertainty"] = uncertainty
+
+        return inference
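A minimal usage sketch of `SampleEntropy`, using the input format the constructor validates (`ImageID` in both frames, `ClassProbability1..N` in the inference frame); the concrete values here are invented:

```python
import pandas as pd
from datumaro.plugins.sampler.algorithm.entropy import SampleEntropy

data = pd.DataFrame({"ImageID": ["1", "2", "3"]})
inference = pd.DataFrame({
    "ImageID": ["1", "2", "3"],
    "ClassProbability1": [0.98, 0.50, 0.70],
    "ClassProbability2": [0.02, 0.50, 0.30],
})

analyzer = SampleEntropy(data, inference)
# Returns the two most uncertain images: "2" (0.5/0.5), then "3" (0.7/0.3)
print(analyzer.get_sample(method="topk", k=2))
```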
diff --git a/datumaro/plugins/sampler/sampler.py b/datumaro/plugins/sampler/sampler.py
new file mode 100644
index 0000000000..e808d91fe7
--- /dev/null
+++ b/datumaro/plugins/sampler/sampler.py
@@ -0,0 +1,216 @@
+# Copyright (C) 2021 Intel Corporation
+#
+# SPDX-License-Identifier: MIT
+
+from collections import defaultdict
+
+import pandas as pd
+
+from datumaro.components.cli_plugin import CliPlugin
+from datumaro.components.extractor import Transform, DEFAULT_SUBSET_NAME
+
+from .algorithm.algorithm import SamplingMethod, Algorithm
+
+
+class Sampler(Transform, CliPlugin):
+    """
+    Sampler that analyzes the inference result of the dataset |n
+    and picks the best samples for training.|n
+    |n
+    Notes:|n
+    - Each image's inference result must contain the probability for all classes.|n
+    - Requesting a sample larger than the number of all images will return all images.|n
+    |n
+    Example:|n
+    |s|s%(prog)s \ |n
+    |s|s|s|s-a entropy \ |n
+    |s|s|s|s-subset_name train \ |n
+    |s|s|s|s-sample_name sample \ |n
+    |s|s|s|s-unsample_name unsampled \ |n
+    |s|s|s|s-m topk -k 20
+    """
+
+    @classmethod
+    def build_cmdline_parser(cls, **kwargs):
+        parser = super().build_cmdline_parser(**kwargs)
+        parser.add_argument(
+            "-a",
+            "--algorithm",
+            type=str,
+            default="entropy",
+            choices=[t.name for t in Algorithm],
+            help=f"Select algorithm, example: {[t.name for t in Algorithm]}",
+        )
+        parser.add_argument(
+            "-subset_name",
+            "--subset_name",
+            type=str,
+            help="Subset name to select samples from",
+        )
+        parser.add_argument(
+            "-sample_name",
+            "--sampled_name",
+            type=str,
+            default="sampled_set",
+            help="Sampled data subset name",
+        )
+        parser.add_argument(
+            "-unsample_name",
+            "--unsampled_name",
+            type=str,
+            default="unsampled_set",
+            help="Unsampled data subset name",
+        )
+        parser.add_argument(
+            "-m",
+            "--sampling_method",
+            type=str,
+            default="topk",
+            choices=[t.name for t in SamplingMethod],
+            help=f"Method of sampling, example: {[t.name for t in SamplingMethod]}",
+        )
+        parser.add_argument("-k", "--num_sample", type=int, help="Number of samples")
+        parser.add_argument(
+            "-o",
+            "--output_file",
+            type=str,
+            default=None,
+            help="Output sample file path; the file extension must be .csv",
+        )
+        return parser
+
+    def __init__(
+        self,
+        extractor,
+        algorithm,
+        subset_name,
+        sampled_name,
+        unsampled_name,
+        sampling_method,
+        num_sample,
+        output_file,
+    ):
+        """
+        Parameters
+        ----------
+        extractor : Extractor, Dataset
+        algorithm : str
+            Algorithm used to calculate the uncertainty
+            for sample selection. default: 'entropy'
+        subset_name : str
+            The name of the subset to select samples from.
+        sampled_name : str
+            Subset name for the selected samples, default: 'sampled_set'
+        unsampled_name : str
+            Subset name for the remaining items, default: 'unsampled_set'
+        sampling_method : str
+            Method of sampling: 'topk', 'lowk', 'randk', 'mixk' or 'randtopk'
+        num_sample : int
+            Number of samples extracted
+        output_file : str
+            Path of the sampler result; used when the user wants to save results
+        """
+        super().__init__(extractor)
+
+        # Get Parameters
+        self.subset_name = subset_name
+        self.sampled_name = sampled_name
+        self.unsampled_name = unsampled_name
+        self.algorithm = algorithm
+        self.sampling_method = sampling_method
+        self.num_sample = num_sample
+        self.output_file = output_file
+
+        # Optional. Use the --output_file option to save the sample list as a csv file
+        if output_file is not None and output_file.split(".")[-1] != "csv":
+            raise Exception(
+                "Invalid extension, The extension of the file must end with .csv"
+            )
+
+    @staticmethod
+    def _load_inference_from_subset(extractor, subset_name):
+        # 1. Get Dataset from subset name
+        if subset_name in extractor.subsets().keys():
+            subset = extractor.get_subset(subset_name)
+        else:
+            raise Exception(f"Not Found subset '{subset_name}'")
+
+        data_df = defaultdict(list)
+        infer_df = defaultdict(list)
+
+        # 2. Fill data_df and infer_df to fit the sampler algorithm input format
+        for item in subset:
+            data_df["ImageID"].append(item.id)
+
+            if not item.has_image or item.image.size is None:
+                raise Exception(f"Invalid data, data.id: {item.id}")
+
+            width, height = item.image.size
+            data_df["Width"].append(width)
+            data_df["Height"].append(height)
+            data_df["ImagePath"].append(item.image.path)
+
+            if not item.annotations:
+                raise Exception("Invalid data, data.annotations is empty")
+
+            for annotation in item.annotations:
+                if "score" not in annotation.attributes:
+                    raise Exception("Invalid data, probability score is None")
+                probs = annotation.attributes["score"]
+
+                infer_df["ImageID"].append(item.id)
+
+                for prob_idx, prob in enumerate(probs):
+                    infer_df[f"ClassProbability{prob_idx+1}"].append(prob)
+
+        data_df = pd.DataFrame(data_df)
+        infer_df = pd.DataFrame(infer_df)
+
+        return data_df, infer_df
+
+    @staticmethod
+    def _calculate_uncertainty(algorithm, data, inference):
+        # Check and create the algorithm
+        algorithms = Algorithm
+        if algorithm == algorithms.entropy.name:
+            from .algorithm.entropy import SampleEntropy
+
+            # Data delivery; the uncertainty score calculation also proceeds here
+            sampler = SampleEntropy(data, inference)
+        else:
+            raise Exception(
+                f"Not Found algorithm '{algorithm}', available algorithms: {algorithms}"
+            )
+        return sampler
+
+    def _check_sample(self, image):
+        # The function that determines the subset name of the data
+        if image.subset:
+            if image.subset == self.subset_name:
+                # 1. Returns the sample subset if the id belongs to samples
+                if image.id in self.sample_id:
+                    return self.sampled_name
+                else:
+                    return self.unsampled_name
+            else:
+                # 2. Returns the existing subset name if it is not a sample
+                return image.subset
+        else:
+            return DEFAULT_SUBSET_NAME
+
+    def __iter__(self):
+        # Import data with the subset name and convert it to the format
+        # used by the sampler algorithm, together with the inference result
+        data_df, infer_df = self._load_inference_from_subset(
+            self._extractor, self.subset_name
+        )
+
+        # Transfer the data to the sampler algorithm to calculate uncertainty & get the sample list
+        sampler = self._calculate_uncertainty(self.algorithm, data_df, infer_df)
+        self.result = sampler.get_sample(method=self.sampling_method, k=self.num_sample)
+
+        if self.output_file is not None:
+            self.result.to_csv(self.output_file, index=False)
+
+        self.sample_id = self.result["ImageID"].to_list()
+
+        # Transform properties for each item
+        for item in self._extractor:
+            # After checking whether each item belongs to the sample, rename its subset
+            yield self.wrap_item(item, subset=self._check_sample(item))
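As a usage sketch, the transform can also be constructed directly, the way the tests below drive it; `dataset` is assumed to be any Extractor/Dataset with a `train` subset whose `Label` annotations carry a `score` attribute with per-class probabilities:

```python
from datumaro.plugins.sampler.sampler import Sampler

result = Sampler(
    dataset,
    algorithm="entropy",
    subset_name="train",
    sampled_name="sample",
    unsampled_name="unsampled",
    sampling_method="topk",
    num_sample=5,
    output_file=None,
)
picked = result.get_subset("sample")     # the 5 most uncertain items
rest = result.get_subset("unsampled")    # the rest of the former "train" subset
```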
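To make the `mixk` and `randtopk` semantics concrete, a small worked sketch against a hypothetical entropy ranking (index 0 is the most uncertain item), mirroring the slicing in `_get_sample_mixed`:

```python
rank = ["img07", "img02", "img19", "img04", "img11", "img15"]  # most -> least uncertain

k = 4  # even k: half from the top, half from the bottom
assert rank[: k // 2] + rank[-(k // 2):] == ["img07", "img02", "img11", "img15"]

k = 3  # odd k: the extra item comes from the top
assert rank[: k // 2 + 1] + rank[-(k // 2):] == ["img07", "img02", "img15"]

# randtopk with n=3 first draws n * k random items (with a warning if that
# exceeds the dataset size) and then keeps the k most uncertain of those.
```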
diff --git a/docs/user_manual.md b/docs/user_manual.md
index 6266c323d8..99eb220523 100644
--- a/docs/user_manual.md
+++ b/docs/user_manual.md
@@ -1023,6 +1023,26 @@ datum transform -t rename -- -e '|pattern|replacement|'
 datum transform -t rename -- -e '|frame_(\d+)|\\1|'
 ```
 
+Example: sampling dataset items, where subset `train` is divided into `sampled` and `unsampled`:
+- If `train` has 100 items and 20 samples are selected, the result is a `sampled` subset (20 items) and an `unsampled` subset (80 items).
+- The original `train` subset is removed (unless sample_name=`train` or unsample_name=`train`, in which case it remains).
+- The `-m` option supports five sampling methods:
+  - `topk`: return the k items with the highest uncertainty
+  - `lowk`: return the k items with the lowest uncertainty
+  - `randk`: return k random items
+  - `mixk`: return half of the items by the topk method and the rest by lowk
+  - `randtopk`: first select 3 * k items randomly, then return the top k among them
+
+``` bash
+datum transform -t sampler -- \
+    -a entropy \
+    -subset_name train \
+    -sample_name sampled \
+    -unsample_name unsampled \
+    -m topk \
+    -k 20
+```
+
 ## Extending
 
 There are few ways to extend and customize Datumaro behaviour, which is supported by plugins.
diff --git a/requirements.txt b/requirements.txt
index 6bc3c7ee79..5cfc7dd4f2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,4 @@ pycocotools>=2.0.0
 PyYAML>=5.3.1
 scikit-image>=0.15.0
 tensorboardX>=1.8
+pandas>=1.1.5
diff --git a/tests/assets/sampler/inference.csv b/tests/assets/sampler/inference.csv
new file mode 100644
index 0000000000..e08065831a
--- /dev/null
+++ b/tests/assets/sampler/inference.csv
@@ -0,0 +1,31 @@
+ImageID,ClassProbability1,ClassProbability2,ClassProbability3,Uncertainty
+1,0.975242317,0.024469912,0.000287826,0.117586322
+2,0.999715984,0.000281501,2.53E-06,0.002618015
+3,0.999299884,0.000691595,8.50E-06,0.005831472
+4,0.971567273,0.027958876,0.000473852,0.131661266
+5,0.999411225,0.000576135,1.26E-05,0.005028461
+6,0.999715269,0.00027976,4.95E-06,0.002634019
+7,0.978483677,0.021343108,0.00017317,0.104890488
+8,0.984344006,0.015289294,0.000366639,0.082351737
+9,0.974284053,0.025472108,0.000243954,0.120898865
+10,0.964820206,0.034958012,0.000221764,0.153654948
+11,0.996293604,0.003278826,0.000427532,0.02577186
+12,0.999689937,0.000307999,2.14E-06,0.002828279
+13,0.997596323,0.000604421,0.001799274,0.018252373
+14,0.999696493,0.000294724,8.87E-06,0.002802743
+15,0.999686837,0.000309912,3.27E-06,0.002858304
+16,0.999234438,0.000750318,1.53E-05,0.006333055
+17,0.999581277,0.000413273,5.49E-06,0.003705278
+18,0.999384761,0.000604751,1.05E-05,0.005217474
+19,0.999574125,0.000416982,8.93E-06,0.003774712
+20,0.999575078,0.000411838,1.31E-05,0.003782649
+21,0.999712646,0.000286349,9.24E-07,0.002636151
+22,0.998748422,0.001103578,0.000147974,0.010070177
+23,0.999729574,0.000268848,1.53E-06,0.002501184
+24,0.999636412,0.000354998,8.59E-06,0.003283583
+25,0.999675989,0.000322926,1.11E-06,0.002934833
+26,0.970380008,0.029310413,0.000309611,0.135138899
+27,0.979150653,0.019359451,0.001489813,0.106692567
+28,0.999622822,0.000374233,3.02E-06,0.003368486
+29,0.999201596,0.000615866,0.000182658,0.006923281
+30,0.999691606,0.0002986,9.82E-06,0.002845172
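As a sanity check on this fixture, the `Uncertainty` column matches the plugin's entropy formula; for instance, for the row with ImageID 1:

```python
import math

probs = [0.975242317, 0.024469912, 0.000287826]  # ClassProbability1..3 of row 1
h = -sum(p * math.log(p + 1e-14) for p in probs)
print(round(h, 6))  # 0.117586, matching the row's Uncertainty value
```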
diff --git a/tests/test_sampler.py b/tests/test_sampler.py
new file mode 100644
index 0000000000..e31dd12ed0
--- /dev/null
+++ b/tests/test_sampler.py
@@ -0,0 +1,1120 @@
+from collections import defaultdict
+from unittest import TestCase
+
+from datumaro.components.project import Dataset
+from datumaro.components.extractor import (
+    DatasetItem,
+    Label,
+    LabelCategories,
+    AnnotationType,
+)
+from datumaro.util.image import Image
+
+import csv
+import pandas as pd
+
+import datumaro.plugins.sampler.sampler as sampler
+from datumaro.plugins.sampler.algorithm.entropy import SampleEntropy as entropy
+
+
+class SamplerTest(TestCase):
+    @staticmethod
+    def _get_probs(out_range=False):
+        probs = []
+        # data length is 30
+        inference_file = "tests/assets/sampler/inference.csv"
+        with open(inference_file) as csv_file:
+            csv_reader = csv.reader(csv_file)
+            col = 0
+            for row in csv_reader:
+                if col == 0:
+                    col += 1
+                    continue
+                else:
+                    if out_range:
+                        probs.append(list(map(lambda x: -float(x), row[1:4])))
+                    else:
+                        probs.append(list(map(float, row[1:4])))
+        return probs
+
+    def _generate_classification_dataset(
+        self,
+        config,
+        subset=None,
+        empty_score=False,
+        out_range=False,
+        no_attr=False,
+        no_img=False,
+    ):
+
+        probs = self._get_probs(out_range)
+        if subset is None:
+            self.subset = ["train", "val", "test"]
+        else:
+            self.subset = subset
+
+        iterable = []
+        label_cat = LabelCategories()
+        idx = 0
+        for label_id, label in enumerate(config.keys()):
+            num_item = config[label]
+            label_cat.add(label, attributes=None)
+            for _ in range(num_item):
+                score = probs[idx]
+                idx += 1
+                if empty_score:
+                    score = []
+                attr = {"score": score}
+                if no_attr:
+                    attr = {}
+                img = Image(path=f"test/dataset/{idx}.jpg", size=(90, 90))
+                if no_img:
+                    img = None
+                iterable.append(
+                    DatasetItem(
+                        idx,
+                        subset=self.subset[idx % len(self.subset)],
+                        annotations=[
+                            Label(
+                                label_id,
+                                attributes=attr,
+                            )
+                        ],
+                        image=img,
+                    )
+                )
+        categories = {AnnotationType.label: label_cat}
+        dataset = Dataset.from_iterable(iterable, categories)
+        return dataset
+
+    def test_sampler_get_sample_classification(self):
+        config = {
+            "label1": 10,
+            "label2": 10,
+            "label3": 10,
+        }
+
+        source = self._generate_classification_dataset(config, ["train"])
+        num_pre_train_subset = len(source.get_subset("train"))
+
+        num_sample = 5
+
+        with self.subTest("Top-K method"):
+            result = sampler.Sampler(
+                source,
+                algorithm="entropy",
+                subset_name="train",
+                sampled_name="sample",
+                unsampled_name="unsampled",
+                sampling_method="topk",
+                num_sample=num_sample,
+                output_file=None,
+            )
+            self.assertEqual(num_sample, len(result.get_subset("sample")))
+            self.assertEqual(
+                len(result.get_subset("unsampled")),
+                num_pre_train_subset - len(result.get_subset("sample")),
+            )
+            topk_expected_result = [1, 4, 9, 10, 26]
+            topk_result = list(map(int, result.result["ImageID"].to_list()))
+            self.assertEqual(sorted(topk_result), topk_expected_result)
+
+        with self.subTest("Low-K method"):
+            result = sampler.Sampler(
+                source,
+                algorithm="entropy",
+                subset_name="train",
+                sampled_name="sample",
+                unsampled_name="unsampled",
+                sampling_method="lowk",
+                num_sample=num_sample,
+                output_file=None,
+            )
+            self.assertEqual(num_sample, len(result.get_subset("sample")))
+            self.assertEqual(
+                len(result.get_subset("unsampled")),
+                num_pre_train_subset - len(result.get_subset("sample")),
+            )
+            lowk_expected_result = [2, 6, 14, 21, 23]
+            lowk_result = list(map(int, result.result["ImageID"].to_list()))
+            self.assertEqual(sorted(lowk_result), lowk_expected_result)
+
+        with self.subTest("Rand-K method"):
+            result = sampler.Sampler(
+                source,
+                algorithm="entropy",
+                subset_name="train",
+                sampled_name="sample",
+                unsampled_name="unsampled",
+                sampling_method="randk",
+                num_sample=num_sample,
+                output_file=None,
+            )
+            self.assertEqual(num_sample, len(result.get_subset("sample")))
+            self.assertEqual(
+                len(result.get_subset("unsampled")),
+                num_pre_train_subset - len(result.get_subset("sample")),
+            )
+
+        with self.subTest("Mix-K method"):
+            result = sampler.Sampler(
+                source,
+                algorithm="entropy",
+                subset_name="train",
+                sampled_name="sample",
+                unsampled_name="unsampled",
+                sampling_method="mixk",
+                num_sample=num_sample,
+                output_file=None,
+            )
+            self.assertEqual(num_sample, len(result.get_subset("sample")))
+            self.assertEqual(
+                len(result.get_subset("unsampled")),
+                num_pre_train_subset - len(result.get_subset("sample")),
+            )
+            mixk_expected_result = [2, 4, 10, 23, 26]
+            mixk_result = list(map(int, result.result["ImageID"].to_list()))
+            self.assertEqual(sorted(mixk_result), mixk_expected_result)
+
+            result = sampler.Sampler(
+                source,
+                algorithm="entropy",
+                subset_name="train",
+                sampled_name="sample",
+                unsampled_name="unsampled",
+                sampling_method="mixk",
+                num_sample=6,
+                output_file=None,
+            )
+            self.assertEqual(6, len(result.get_subset("sample")))
+            self.assertEqual(
+                len(result.get_subset("unsampled")),
+                num_pre_train_subset - len(result.get_subset("sample")),
+            )
+            mixk_expected_result = [2, 4, 6, 10, 23, 26]
+            mixk_result = list(map(int, result.result["ImageID"].to_list()))
+            self.assertEqual(sorted(mixk_result), mixk_expected_result)
+
+        with self.subTest("Randtop-K method"):
+            result = sampler.Sampler(
+                source,
+                algorithm="entropy",
+                subset_name="train",
+                sampled_name="sample",
+                unsampled_name="unsampled",
+                sampling_method="randtopk",
+                num_sample=num_sample,
+                output_file=None,
+            )
+
+            self.assertEqual(num_sample, len(result.get_subset("sample")))
+            self.assertEqual(
+                len(result.get_subset("unsampled")),
+                num_pre_train_subset - len(result.get_subset("sample")),
+            )
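+    # NOTE: the expected ImageID lists above follow from the fixture:
+    # in tests/assets/sampler/inference.csv, rows 1, 4, 9, 10 and 26 carry the
+    # largest Uncertainty values and rows 2, 6, 14, 21 and 23 the smallest.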
+    def test_sampler_gives_error(self):
+        config = {
+            "label1": 10,
+            "label2": 10,
+            "label3": 10,
+        }
+        num_sample = 5
+
+        source = self._generate_classification_dataset(config)
+
+        with self.subTest("Not found"):
+            with self.assertRaisesRegex(Exception, "Not Found subset"):
+                subset = "hello"
+                result = sampler.Sampler(
+                    source,
+                    algorithm="entropy",
+                    subset_name=subset,
+                    sampled_name="sample",
+                    unsampled_name="unsampled",
+                    sampling_method="topk",
+                    num_sample=num_sample,
+                    output_file=None,
+                )
+                result = iter(result)
+                next(result)
+
+            with self.assertRaisesRegex(Exception, "Not Found algorithm"):
+                algorithm = "hello"
+                result = sampler.Sampler(
+                    source,
+                    algorithm=algorithm,
+                    subset_name="train",
+                    sampled_name="sample",
+                    unsampled_name="unsampled",
+                    sampling_method="topk",
+                    num_sample=num_sample,
+                    output_file=None,
+                )
+                result = iter(result)
+                next(result)
+
+            with self.assertRaisesRegex(Exception, "Not Found method"):
+                sampling_method = "hello"
+                result = sampler.Sampler(
+                    source,
+                    algorithm="entropy",
+                    subset_name="train",
+                    sampled_name="sample",
+                    unsampled_name="unsampled",
+                    sampling_method=sampling_method,
+                    num_sample=num_sample,
+                    output_file=None,
+                )
+                result = iter(result)
+                next(result)
+
+        with self.subTest("Invalid Value"):
+            with self.assertRaisesRegex(Exception, "Invalid number"):
+                k = 0
+                result = sampler.Sampler(
+                    source,
+                    algorithm="entropy",
+                    subset_name="train",
+                    sampled_name="sample",
+                    unsampled_name="unsampled",
+                    sampling_method="topk",
+                    num_sample=k,
+                    output_file=None,
+                )
+                result = iter(result)
+                next(result)
+
+            with self.assertRaisesRegex(Exception, "Invalid number"):
+                k = -1
+                result = sampler.Sampler(
+                    source,
+                    algorithm="entropy",
+                    subset_name="train",
+                    sampled_name="sample",
+                    unsampled_name="unsampled",
+                    sampling_method="topk",
+                    num_sample=k,
+                    output_file=None,
+                )
+                result = iter(result)
+                next(result)
+
+            with self.assertRaisesRegex(Exception, "Invalid value"):
+                k = "string"
+                result = sampler.Sampler(
+                    source,
+                    algorithm="entropy",
+                    subset_name="train",
+                    sampled_name="sample",
+                    unsampled_name="unsampled",
+                    sampling_method="topk",
+                    num_sample=k,
+                    output_file=None,
+                )
+                result = iter(result)
+                next(result)
+
+            with self.assertRaisesRegex(Exception, "Invalid extension"):
+                output_file = "string.xml"
+                result = sampler.Sampler(
+                    source,
+                    algorithm="entropy",
+                    subset_name="train",
+                    sampled_name="sample",
+                    unsampled_name="unsampled",
+                    sampling_method="topk",
+                    num_sample=num_sample,
+                    output_file=output_file,
+                )
+                result = iter(result)
+                next(result)
+
+            with self.assertRaisesRegex(Exception, "Invalid extension"):
+                output_file = "string"
+                result = sampler.Sampler(
+                    source,
+                    algorithm="entropy",
+                    subset_name="train",
+                    sampled_name="sample",
+                    unsampled_name="unsampled",
+                    sampling_method="topk",
+                    num_sample=num_sample,
+                    output_file=output_file,
+                )
+                result = iter(result)
+                next(result)
+
+        with self.assertRaisesRegex(
+            Exception, "Invalid Data, ImageID not found in data"
+        ):
+            sub = source.get_subset("train")
+
+            data_df = defaultdict(list)
+            infer_df = defaultdict(list)
+
+            for data in sub:
+                width, height = data.image.size
+                data_df["Width"].append(width)
+                data_df["Height"].append(height)
+                data_df["ImagePath"].append(data.image.path)
+
+                for annotation in data.annotations:
+                    probs = annotation.attributes["score"]
+                    infer_df["ImageID"].append(data.id)
+
+                    for prob_idx, prob in enumerate(probs):
+                        infer_df[f"ClassProbability{prob_idx+1}"].append(prob)
+
+            data_df = pd.DataFrame(data_df)
+            infer_df = pd.DataFrame(infer_df)
+
+            entropy(data_df, infer_df)
+
+        with self.assertRaisesRegex(
+            Exception, "Invalid Data, ImageID not found in inference"
+        ):
+            sub = source.get_subset("train")
+
+            data_df = defaultdict(list)
+            infer_df = defaultdict(list)
+
+            for data in sub:
+                width, height = data.image.size
+                data_df["ImageID"].append(data.id)
+                data_df["Width"].append(width)
+                data_df["Height"].append(height)
+                data_df["ImagePath"].append(data.image.path)
+
+                for annotation in data.annotations:
+                    probs = annotation.attributes["score"]
+
+                    for prob_idx, prob in enumerate(probs):
+                        infer_df[f"ClassProbability{prob_idx+1}"].append(prob)
+
+            data_df = pd.DataFrame(data_df)
+            infer_df = pd.DataFrame(infer_df)
+
+            entropy(data_df, infer_df)
algorithm="entropy", + subset_name="train", + sampled_name="sample", + unsampled_name="unsampled", + sampling_method="topk", + num_sample=k, + output_file=None, + ) + result = iter(result) + next(result) + + with self.assertRaisesRegex(Exception, "Invalid value"): + k = "string" + result = sampler.Sampler( + source, + algorithm="entropy", + subset_name="train", + sampled_name="sample", + unsampled_name="unsampled", + sampling_method="topk", + num_sample=k, + output_file=None, + ) + result = iter(result) + next(result) + + with self.assertRaisesRegex(Exception, "Invalid extension"): + output_file = "string.xml" + result = sampler.Sampler( + source, + algorithm="entropy", + subset_name="train", + sampled_name="sample", + unsampled_name="unsampled", + sampling_method="topk", + num_sample=num_sample, + output_file=output_file, + ) + result = iter(result) + next(result) + + with self.assertRaisesRegex(Exception, "Invalid extension"): + output_file = "string" + result = sampler.Sampler( + source, + algorithm="entropy", + subset_name="train", + sampled_name="sample", + unsampled_name="unsampled", + sampling_method="topk", + num_sample=num_sample, + output_file=output_file, + ) + result = iter(result) + next(result) + + with self.assertRaisesRegex( + Exception, "Invalid Data, ImageID not found in data" + ): + sub = source.get_subset("train") + + data_df = defaultdict(list) + infer_df = defaultdict(list) + + for data in sub: + width, height = data.image.size + data_df["Width"].append(width) + data_df["Height"].append(height) + data_df["ImagePath"].append(data.image.path) + + for annotation in data.annotations: + probs = annotation.attributes["score"] + infer_df["ImageID"].append(data.id) + + for prob_idx, prob in enumerate(probs): + infer_df[f"ClassProbability{prob_idx+1}"].append(prob) + + data_df = pd.DataFrame(data_df) + infer_df = pd.DataFrame(infer_df) + + entropy(data_df, infer_df) + + with self.assertRaisesRegex( + Exception, "Invalid Data, ImageID not found in inference" + ): + sub = source.get_subset("train") + + data_df = defaultdict(list) + infer_df = defaultdict(list) + + for data in sub: + width, height = data.image.size + data_df["ImageID"].append(data.id) + data_df["Width"].append(width) + data_df["Height"].append(height) + data_df["ImagePath"].append(data.image.path) + + for annotation in data.annotations: + probs = annotation.attributes["score"] + + for prob_idx, prob in enumerate(probs): + infer_df[f"ClassProbability{prob_idx+1}"].append(prob) + + data_df = pd.DataFrame(data_df) + infer_df = pd.DataFrame(infer_df) + + entropy(data_df, infer_df) + + def test_sampler_get_invalid_data(self): + with self.subTest("empty dataset"): + config = { + "label1": 0, + "label2": 0, + "label3": 0, + } + + source = self._generate_classification_dataset(config) + with self.assertRaisesRegex(Exception, "Not Found"): + result = sampler.Sampler( + source, + algorithm="entropy", + subset_name="train", + sampled_name="sample", + unsampled_name="unsampled", + sampling_method="topk", + num_sample=5, + output_file=None, + ) + result = iter(result) + next(result) + + with self.subTest("Dataset without Score (Probability)"): + config = { + "label1": 10, + "label2": 10, + "label3": 10, + } + + source = self._generate_classification_dataset(config, empty_score=True) + with self.assertRaisesRegex(Exception, "Invalid data"): + result = sampler.Sampler( + source, + algorithm="entropy", + subset_name="train", + sampled_name="sample", + unsampled_name="unsampled", + sampling_method="topk", + num_sample=5, + 
+        with self.subTest("Out of range, probability (Less than 0 or more than 1)"):
+            config = {
+                "label1": 10,
+                "label2": 10,
+                "label3": 10,
+            }
+
+            source = self._generate_classification_dataset(
+                config, empty_score=False, out_range=True
+            )
+            with self.assertRaisesRegex(Exception, "Invalid data"):
+                result = sampler.Sampler(
+                    source,
+                    algorithm="entropy",
+                    subset_name="train",
+                    sampled_name="sample",
+                    unsampled_name="unsampled",
+                    sampling_method="topk",
+                    num_sample=5,
+                    output_file=None,
+                )
+                result = iter(result)
+                next(result)
+
+        with self.subTest("No Score Attribute Data"):
+            config = {
+                "label1": 10,
+                "label2": 10,
+                "label3": 10,
+            }
+
+            source = self._generate_classification_dataset(config, no_attr=True)
+            with self.assertRaisesRegex(Exception, "Invalid data"):
+                result = sampler.Sampler(
+                    source,
+                    algorithm="entropy",
+                    subset_name="train",
+                    sampled_name="sample",
+                    unsampled_name="unsampled",
+                    sampling_method="topk",
+                    num_sample=5,
+                    output_file=None,
+                )
+                result = iter(result)
+                next(result)
+
+        with self.subTest("No Image Data"):
+            config = {
+                "label1": 10,
+                "label2": 10,
+                "label3": 10,
+            }
+
+            source = self._generate_classification_dataset(config, no_img=True)
+            with self.assertRaisesRegex(Exception, "Invalid data"):
+                result = sampler.Sampler(
+                    source,
+                    algorithm="entropy",
+                    subset_name="train",
+                    sampled_name="sample",
+                    unsampled_name="unsampled",
+                    sampling_method="topk",
+                    num_sample=5,
+                    output_file=None,
+                )
+                result = iter(result)
+                next(result)
algorithm="entropy", + subset_name="train", + sampled_name="sample", + unsampled_name="unsampled", + sampling_method=sampling_method, + num_sample=num_sample, + output_file=None, + ) + self.assertEqual(num_pre_train_subset, len(result.get_subset("sample"))) + + with self.subTest("k == num of data with top-k"): + num_sample = 10 + sampling_method = "topk" + + result = sampler.Sampler( + source, + algorithm="entropy", + subset_name="train", + sampled_name="sample", + unsampled_name="unsampled", + sampling_method=sampling_method, + num_sample=num_sample, + output_file=None, + ) + self.assertEqual(num_pre_train_subset, len(result.get_subset("sample"))) + + with self.subTest("k == num of data with low-k"): + num_sample = 10 + sampling_method = "lowk" + + result = sampler.Sampler( + source, + algorithm="entropy", + subset_name="train", + sampled_name="sample", + unsampled_name="unsampled", + sampling_method=sampling_method, + num_sample=num_sample, + output_file=None, + ) + self.assertEqual(num_pre_train_subset, len(result.get_subset("sample"))) + + with self.subTest("k == num of data with rand-k"): + num_sample = 10 + sampling_method = "randk" + + result = sampler.Sampler( + source, + algorithm="entropy", + subset_name="train", + sampled_name="sample", + unsampled_name="unsampled", + sampling_method=sampling_method, + num_sample=num_sample, + output_file=None, + ) + self.assertEqual(num_pre_train_subset, len(result.get_subset("sample"))) + + with self.subTest("k == num of data with mix-k"): + num_sample = 10 + sampling_method = "mixk" + + result = sampler.Sampler( + source, + algorithm="entropy", + subset_name="train", + sampled_name="sample", + unsampled_name="unsampled", + sampling_method=sampling_method, + num_sample=num_sample, + output_file=None, + ) + self.assertEqual(num_pre_train_subset, len(result.get_subset("sample"))) + + with self.subTest("k == num of data with randtop-k"): + num_sample = 10 + sampling_method = "randtopk" + + result = sampler.Sampler( + source, + algorithm="entropy", + subset_name="train", + sampled_name="sample", + unsampled_name="unsampled", + sampling_method=sampling_method, + num_sample=num_sample, + output_file=None, + ) + self.assertEqual(num_pre_train_subset, len(result.get_subset("sample"))) + + num_sample = 9 + + result = sampler.Sampler( + source, + algorithm="entropy", + subset_name="train", + sampled_name="sample", + unsampled_name="unsampled", + sampling_method=sampling_method, + num_sample=num_sample, + output_file=None, + ) + self.assertEqual(len(result.get_subset("sample")), 9) + + def test_sampler_accumulated_sampling(self): + config = { + "label1": 10, + "label2": 10, + "label3": 10, + } + + source = self._generate_classification_dataset(config) + + num_pre_train_subset = len(source.get_subset("train")) + num_pre_val_subset = len(source.get_subset("val")) + num_pre_test_subset = len(source.get_subset("test")) + + with self.subTest("Same Subset, Same number of datas 3times"): + num_sample = 3 + sample_subset_name = "sample" + + result = sampler.Sampler( + source, + algorithm="entropy", + subset_name="train", + sampled_name=sample_subset_name, + unsampled_name="train", + sampling_method="topk", + num_sample=num_sample, + output_file=None, + ) + + self.assertEqual(len(result.get_subset("sample")), num_sample) + self.assertEqual( + len(result.get_subset("train")), num_pre_train_subset - num_sample + ) + + result = sampler.Sampler( + result, + algorithm="entropy", + subset_name="train", + sampled_name=sample_subset_name, + unsampled_name="train", + 
sampling_method="topk", + num_sample=num_sample, + output_file=None, + ) + + self.assertEqual(len(result.get_subset("sample")), num_sample * 2) + self.assertEqual( + len(result.get_subset("train")), num_pre_train_subset - num_sample * 2 + ) + + result = sampler.Sampler( + result, + algorithm="entropy", + subset_name="train", + sampled_name=sample_subset_name, + unsampled_name="train", + sampling_method="topk", + num_sample=num_sample, + output_file=None, + ) + + self.assertEqual(len(result.get_subset("sample")), num_sample * 3) + self.assertEqual( + len(result.get_subset("train")), num_pre_train_subset - num_sample * 3 + ) + + with self.subTest("Same Subset, 2, 3, 4 sampling"): + sample_subset_name = "sample" + + result = sampler.Sampler( + source, + algorithm="entropy", + subset_name="train", + sampled_name=sample_subset_name, + unsampled_name="train", + sampling_method="topk", + num_sample=2, + output_file=None, + ) + + self.assertEqual(len(result.get_subset("sample")), 2) + self.assertEqual(len(result.get_subset("train")), num_pre_train_subset - 2) + + result = sampler.Sampler( + result, + algorithm="entropy", + subset_name="train", + sampled_name=sample_subset_name, + unsampled_name="train", + sampling_method="topk", + num_sample=3, + output_file=None, + ) + + self.assertEqual(len(result.get_subset("sample")), 5) + self.assertEqual(len(result.get_subset("train")), num_pre_train_subset - 5) + + result = sampler.Sampler( + result, + algorithm="entropy", + subset_name="train", + sampled_name=sample_subset_name, + unsampled_name="train", + sampling_method="topk", + num_sample=4, + output_file=None, + ) + + self.assertEqual(len(result.get_subset("sample")), 9) + self.assertEqual(len(result.get_subset("train")), num_pre_train_subset - 9) + + with self.subTest("Different Subset, Same number of datas 3times"): + num_sample = 3 + sample_subset_name = "sample" + + result = sampler.Sampler( + source, + algorithm="entropy", + subset_name="train", + sampled_name=sample_subset_name, + unsampled_name="train", + sampling_method="topk", + num_sample=num_sample, + output_file=None, + ) + + self.assertEqual(len(result.get_subset("sample")), num_sample) + self.assertEqual( + len(result.get_subset("train")), num_pre_train_subset - num_sample + ) + + result = sampler.Sampler( + result, + algorithm="entropy", + subset_name="val", + sampled_name=sample_subset_name, + unsampled_name="val", + sampling_method="topk", + num_sample=num_sample, + output_file=None, + ) + + self.assertEqual(len(result.get_subset("sample")), num_sample * 2) + self.assertEqual( + len(result.get_subset("val")), num_pre_val_subset - num_sample + ) + + result = sampler.Sampler( + result, + algorithm="entropy", + subset_name="test", + sampled_name=sample_subset_name, + unsampled_name="test", + sampling_method="topk", + num_sample=num_sample, + output_file=None, + ) + + self.assertEqual(len(result.get_subset("sample")), num_sample * 3) + self.assertEqual( + len(result.get_subset("test")), num_pre_test_subset - num_sample + ) + + with self.subTest("Different Subset, 2, 3, 4 sampling"): + sample_subset_name = "sample" + + result = sampler.Sampler( + source, + algorithm="entropy", + subset_name="train", + sampled_name=sample_subset_name, + unsampled_name="train", + sampling_method="topk", + num_sample=2, + output_file=None, + ) + + self.assertEqual(len(result.get_subset("sample")), 2) + self.assertEqual(len(result.get_subset("train")), num_pre_train_subset - 2) + + result = sampler.Sampler( + result, + algorithm="entropy", + subset_name="val", 
+    def test_sampler_unaccumulated_sampling(self):
+        config = {
+            "label1": 10,
+            "label2": 10,
+            "label3": 10,
+        }
+
+        source = self._generate_classification_dataset(config)
+
+        num_pre_train_subset = len(source.get_subset("train"))
+        num_pre_val_subset = len(source.get_subset("val"))
+        num_pre_test_subset = len(source.get_subset("test"))
+
+        with self.subTest("Same subset, same number of samples 3 times"):
+            num_sample = 3
+
+            result = sampler.Sampler(
+                source,
+                algorithm="entropy",
+                subset_name="train",
+                sampled_name="sample1",
+                unsampled_name="train",
+                sampling_method="topk",
+                num_sample=num_sample,
+                output_file=None,
+            )
+
+            self.assertEqual(len(result.get_subset("sample1")), num_sample)
+            self.assertEqual(
+                len(result.get_subset("train")), num_pre_train_subset - num_sample
+            )
+
+            result = sampler.Sampler(
+                result,
+                algorithm="entropy",
+                subset_name="train",
+                sampled_name="sample2",
+                unsampled_name="train",
+                sampling_method="topk",
+                num_sample=num_sample,
+                output_file=None,
+            )
+
+            self.assertEqual(len(result.get_subset("sample1")), num_sample)
+            self.assertEqual(len(result.get_subset("sample2")), num_sample)
+            self.assertEqual(
+                len(result.get_subset("train")), num_pre_train_subset - num_sample * 2
+            )
+
+            result = sampler.Sampler(
+                result,
+                algorithm="entropy",
+                subset_name="train",
+                sampled_name="sample3",
+                unsampled_name="train",
+                sampling_method="topk",
+                num_sample=num_sample,
+                output_file=None,
+            )
+
+            self.assertEqual(len(result.get_subset("sample1")), num_sample)
+            self.assertEqual(len(result.get_subset("sample2")), num_sample)
+            self.assertEqual(len(result.get_subset("sample3")), num_sample)
+            self.assertEqual(
+                len(result.get_subset("train")), num_pre_train_subset - num_sample * 3
+            )
+
+        with self.subTest("Same subset, 2, 3, 4 sampling"):
+            result = sampler.Sampler(
+                source,
+                algorithm="entropy",
+                subset_name="train",
+                sampled_name="sample1",
+                unsampled_name="train",
+                sampling_method="topk",
+                num_sample=2,
+                output_file=None,
+            )
+
+            self.assertEqual(len(result.get_subset("sample1")), 2)
+            self.assertEqual(len(result.get_subset("train")), num_pre_train_subset - 2)
+
+            result = sampler.Sampler(
+                result,
+                algorithm="entropy",
+                subset_name="train",
+                sampled_name="sample2",
+                unsampled_name="train",
+                sampling_method="topk",
+                num_sample=3,
+                output_file=None,
+            )
+
+            self.assertEqual(len(result.get_subset("sample1")), 2)
+            self.assertEqual(len(result.get_subset("sample2")), 3)
+            self.assertEqual(len(result.get_subset("train")), num_pre_train_subset - 5)
+
+            result = sampler.Sampler(
+                result,
+                algorithm="entropy",
+                subset_name="train",
+                sampled_name="sample3",
+                unsampled_name="train",
+                sampling_method="topk",
+                num_sample=4,
+                output_file=None,
+            )
+
+            self.assertEqual(len(result.get_subset("sample1")), 2)
+            self.assertEqual(len(result.get_subset("sample2")), 3)
+            self.assertEqual(len(result.get_subset("sample3")), 4)
self.assertEqual(len(result.get_subset("train")), num_pre_train_subset - 9) + + with self.subTest("Different Subset, Same number of datas 3times"): + num_sample = 3 + + result = sampler.Sampler( + source, + algorithm="entropy", + subset_name="train", + sampled_name="sample1", + unsampled_name="train", + sampling_method="topk", + num_sample=num_sample, + output_file=None, + ) + + self.assertEqual(len(result.get_subset("sample1")), num_sample) + self.assertEqual( + len(result.get_subset("train")), num_pre_train_subset - num_sample + ) + + result = sampler.Sampler( + result, + algorithm="entropy", + subset_name="val", + sampled_name="sample2", + unsampled_name="val", + sampling_method="topk", + num_sample=num_sample, + output_file=None, + ) + + self.assertEqual(len(result.get_subset("sample1")), num_sample) + self.assertEqual(len(result.get_subset("sample2")), num_sample) + self.assertEqual( + len(result.get_subset("val")), num_pre_val_subset - num_sample + ) + + result = sampler.Sampler( + result, + algorithm="entropy", + subset_name="test", + sampled_name="sample3", + unsampled_name="test", + sampling_method="topk", + num_sample=num_sample, + output_file=None, + ) + + self.assertEqual(len(result.get_subset("sample1")), num_sample) + self.assertEqual(len(result.get_subset("sample2")), num_sample) + self.assertEqual(len(result.get_subset("sample3")), num_sample) + self.assertEqual( + len(result.get_subset("test")), num_pre_test_subset - num_sample + ) + + with self.subTest("Different Subset, 2, 3, 4 sampling"): + result = sampler.Sampler( + source, + algorithm="entropy", + subset_name="train", + sampled_name="sample1", + unsampled_name="train", + sampling_method="topk", + num_sample=2, + output_file=None, + ) + + self.assertEqual(len(result.get_subset("sample1")), 2) + self.assertEqual(len(result.get_subset("train")), num_pre_train_subset - 2) + + result = sampler.Sampler( + result, + algorithm="entropy", + subset_name="val", + sampled_name="sample2", + unsampled_name="val", + sampling_method="topk", + num_sample=3, + output_file=None, + ) + + self.assertEqual(len(result.get_subset("sample1")), 2) + self.assertEqual(len(result.get_subset("sample2")), 3) + self.assertEqual(len(result.get_subset("val")), num_pre_val_subset - 3) + + result = sampler.Sampler( + result, + algorithm="entropy", + subset_name="test", + sampled_name="sample3", + unsampled_name="test", + sampling_method="topk", + num_sample=4, + output_file=None, + ) + + self.assertEqual(len(result.get_subset("sample1")), 2) + self.assertEqual(len(result.get_subset("sample2")), 3) + self.assertEqual(len(result.get_subset("sample3")), 4) + self.assertEqual(len(result.get_subset("test")), num_pre_test_subset - 4) + + def test_sampler_parser(self): + from argparse import ArgumentParser + + assert isinstance(sampler.Sampler.build_cmdline_parser(), ArgumentParser)