Add Sampler Plugin #115

Merged: 7 commits, Mar 2, 2021
1 change: 1 addition & 0 deletions .travis.yml
@@ -31,6 +31,7 @@ matrix:
install:
- pip install -e ./
- pip install tensorflow
- pip install pandas

script:
- python -m unittest discover -v
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -33,6 +33,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `LFW` dataset format (<https://github.com/openvinotoolkit/datumaro/pull/110>)
- Support of polygons' and masks' confusion matrices and mismatching classes in `diff` command (<https://github.com/openvinotoolkit/datumaro/pull/117>)
- Add near duplicate image removal plugin (<https://github.com/openvinotoolkit/datumaro/pull/113>)
- Sampler Plugin that analyzes inference results from the given dataset and selects samples for annotation (<https://github.com/openvinotoolkit/datumaro/pull/115>)

### Changed
- OpenVINO model launcher is updated for OpenVINO r2021.1 (<https://github.com/openvinotoolkit/datumaro/pull/100>)
5 changes: 5 additions & 0 deletions README.md
@@ -158,6 +158,11 @@ CVAT annotations ---> Publication, statistics etc.
- for detection task, based on bboxes
- for re-identification task, based on labels,
avoiding having same IDs in training and test splits
- Sampling a dataset
- analyzes inference results from the given dataset
and selects the 'best' and the smallest number of samples for annotation
- selects the samples that best suit model training
- sampling with an entropy-based algorithm (a short sketch follows this diff)
- Dataset quality checking
- Simple checking for errors
- Comparison with model inference
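
To make the entropy criterion concrete, here is a minimal sketch (not part of this PR) of the per-prediction score the plugin computes; it uses the same p * log(p + 1e-14) form as entropy.py later in this diff:

import math

def entropy_score(probs):
    # Shannon entropy of a class-probability vector; higher means more uncertain
    return -sum(p * math.log(p + 1e-14) for p in probs)

print(entropy_score([0.98, 0.01, 0.01]))  # confident prediction -> low score
print(entropy_score([0.34, 0.33, 0.33]))  # ambiguous prediction -> high score

The 'topk' sampling method surfaces the highest-scoring (most uncertain) samples for annotation.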
3 changes: 3 additions & 0 deletions datumaro/plugins/sampler/__init__.py
@@ -0,0 +1,3 @@
# Copyright (C) 2021 Intel Corporation
#
# SPDX-License-Identifier: MIT
22 changes: 22 additions & 0 deletions datumaro/plugins/sampler/algorithm/algorithm.py
@@ -0,0 +1,22 @@
# Copyright (C) 2021 Intel Corporation
#
# SPDX-License-Identifier: MIT

from enum import Enum

SamplingMethod = Enum("SamplingMethod", ["topk", "lowk", "randk", "mixk", "randtopk"])
Algorithm = Enum("Algorithm", ["entropy"])


class InferenceResultAnalyzer:
"""
Basic interface for IRA (Inference Result Analyzer)
"""

def __init__(self, dataset, inference):
self.data = dataset
self.inference = inference
self.sampling_method = SamplingMethod

def get_sample(self, method: str, k: int):
raise NotImplementedError()
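
InferenceResultAnalyzer is a minimal interface: a concrete analyzer receives the dataset and inference DataFrames and implements get_sample. SampleEntropy in entropy.py below is the implementation this PR ships; purely as an illustration, a hypothetical margin-based analyzer could subclass it like this (MarginAnalyzer and its scoring rule are inventions for the example, not part of the plugin):

import pandas as pd

from datumaro.plugins.sampler.algorithm.algorithm import InferenceResultAnalyzer

class MarginAnalyzer(InferenceResultAnalyzer):
    # Hypothetical: score by the gap between the two largest class probabilities;
    # a small gap means the model was torn between two classes.
    def get_sample(self, method: str, k: int) -> pd.DataFrame:
        probs = self.inference.filter(regex=r"^ClassProbability\d+$")
        margin = probs.apply(lambda r: r.nlargest(2).iloc[0] - r.nlargest(2).iloc[-1], axis=1)
        if method == self.sampling_method.topk.name:
            # smallest margins first = most uncertain first
            return self.inference.loc[margin.sort_values().index[:k]].reset_index(drop=True)
        raise NotImplementedError(method)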
191 changes: 191 additions & 0 deletions datumaro/plugins/sampler/algorithm/entropy.py
@@ -0,0 +1,191 @@
# Copyright (C) 2021 Intel Corporation
#
# SPDX-License-Identifier: MIT

import logging as log
import math
import re

import pandas as pd

from .algorithm import InferenceResultAnalyzer


class SampleEntropy(InferenceResultAnalyzer):
"""
A sampler that inherits InferenceResultAnalyzer,
calculates an uncertainty score based on entropy,
and selects samples based on that score.
"""

def __init__(self, data, inference):
"""
Constructor function
Args:
data: the dataset, as a pd.DataFrame. An ImageID column is required.
inference:
the inference results, as a pd.DataFrame.
ImageID and ClassProbability columns are required.
"""
super().__init__(data, inference)

# check the existence of "ImageID" in data & inference
if "ImageID" not in data:
raise Exception("Invalid Data, ImageID not found in data")
if "ImageID" not in inference:
raise Exception("Invalid Data, ImageID not found in inference")

# check the existence of "ClassProbability" in inference
self.num_classes = 0
for head in list(inference):
m = re.match(r"ClassProbability\d+", head)
if m is not None:
self.num_classes += 1

if not self.num_classes > 0:
raise Exception(
"Invalid data, Inference do not have ClassProbability values!"
)

# rank: The inference DataFrame, sorted according to the score.
self.rank = self._rank_images().sort_values(by="rank")

def get_sample(self, method: str, k: int, n: int = 3) -> pd.DataFrame:
"""
A function that extracts sample data and returns it.
Args:
method:
- 'topk': extracts the k samples with the highest uncertainty.
- 'lowk': extracts the k samples with the lowest uncertainty.
- 'randk': extracts k samples at random.
- 'mixk', 'randtopk': mixed strategies, delegated to _get_sample_mixed.
k: number of samples to extract
n: used by the 'randtopk' method; n*k samples are drawn first, then the top k by uncertainty are kept.
Returns:
Extracted sample data : pd.DataFrame
"""
temp_rank = self.rank

# 1. k value check
if not isinstance(k, int):
raise Exception(
f"Invalid value {k}. k must be an integer."
)
elif k <= 0:
raise Exception(
f"Invalid number {k}. k must be greater than zero."
)

# 2. Select a sample according to the method
if k <= len(temp_rank):
if method == self.sampling_method.topk.name:
temp_rank = temp_rank[:k]
elif method == self.sampling_method.lowk.name:
temp_rank = temp_rank[-k:]
elif method == self.sampling_method.randk.name:
return self.data.sample(n=k).reset_index(drop=True)
elif method in [
self.sampling_method.mixk.name,
self.sampling_method.randtopk.name,
]:
return self._get_sample_mixed(method=method, k=k, n=n)
else:
raise Exception(f"Not Found method '{method}'")
else:
log.warning(
"The requested number of samples exceeds the size of the dataset; returning all samples."
)

columns = list(self.data.columns)
merged_df = pd.merge(temp_rank, self.data, how="inner", on=["ImageID"])
return merged_df[columns].reset_index(drop=True)

def _get_sample_mixed(self, method: str, k: int, n: int = 3) -> pd.DataFrame:
"""
A function that extracts sample data and returns it.
Args:
method:
- 'mixk': returns the top-k and low-k halves based on uncertainty.
- 'randtopk': randomly extracts n*k samples and returns the k with the highest uncertainty.
k: number of samples to extract
n: multiplier for 'randtopk'; n*k samples are drawn at random, then the top k are kept.
Returns:
Extracted sample data : pd.DataFrame
"""
temp_rank = self.rank

# Select a sample according to the method
if k <= len(temp_rank):
if method == self.sampling_method.mixk.name:
if k % 2 == 0:
temp_rank = pd.concat([temp_rank[: k // 2], temp_rank[-(k // 2) :]])
else:
temp_rank = pd.concat(
[temp_rank[: (k // 2) + 1], temp_rank[-(k // 2) :]]
)
elif method == self.sampling_method.randtopk.name:
if n * k <= len(temp_rank):
temp_rank = temp_rank.sample(n=n * k).sort_values(by="rank")
else:
log.warning(msg="n * k exceeds the length of the inference")
temp_rank = temp_rank[:k]

columns = list(self.data.columns)
merged_df = pd.merge(temp_rank, self.data, how="inner", on=["ImageID"])
return merged_df[columns].reset_index(drop=True)

def _rank_images(self) -> pd.DataFrame:
"""
An internal function that ranks the inference data based on uncertainty.
Returns:
inference data sorted by uncertainty, as a pd.DataFrame
"""
# 1. Load Inference
if self.inference is not None:
inference = pd.DataFrame(self.inference)
else:
raise Exception("Invalid Data, Failed to load inference result!")

# 2. If the inference DataFrame does not contain an Uncertainty score, calculate it
if "Uncertainty" not in inference:
inference = self._calculate_uncertainty_from_classprob(inference=inference)

# 3. Check that Uncertainty values are in place.
na_df = inference.isna().sum()
if "Uncertainty" in na_df and na_df["Uncertainty"] > 0:
raise Exception("Some inference results do not have Uncertainty values!")

# 4. Ranked based on Uncertainty score
res = inference[["ImageID", "Uncertainty"]].groupby("ImageID").mean()
res["rank"] = res["Uncertainty"].rank(ascending=False, method="first")
res = res.reset_index()

return res

def _calculate_uncertainty_from_classprob(
self, inference: pd.DataFrame
) -> pd.DataFrame:
"""
A function that calculates uncertainty based on entropy through ClassProbability values.
Args:
inference: Inference data where uncertainty has not been calculated
Returns:
inference data with uncertainty variable
"""

# Calculate Entropy (Uncertainty Score)
uncertainty = []
for i in range(len(inference)):
entropy = 0
for j in range(self.num_classes):
p = inference.loc[i][f"ClassProbability{j+1}"]
if p < 0 or p > 1:
raise Exception(
"Invalid data, Math domain Error! p is between 0 and 1"
)
entropy -= p * math.log(p + 1e-14, math.e)

uncertainty.append(entropy)

inference["Uncertainty"] = uncertainty

return inference
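
For reference, a minimal end-to-end sketch of driving SampleEntropy directly (the toy DataFrames, file names, and probabilities below are made up; only the ImageID and ClassProbabilityN columns are required, as checked in the constructor):

import pandas as pd

from datumaro.plugins.sampler.algorithm.entropy import SampleEntropy

data = pd.DataFrame({
    "ImageID": ["img1", "img2", "img3", "img4"],
    "Path": ["1.jpg", "2.jpg", "3.jpg", "4.jpg"],
})
inference = pd.DataFrame({
    "ImageID": ["img1", "img2", "img3", "img4"],
    "ClassProbability1": [0.97, 0.50, 0.33, 0.90],
    "ClassProbability2": [0.03, 0.50, 0.67, 0.10],
})

sampler = SampleEntropy(data, inference)
# pick the 2 most uncertain images for annotation
picked = sampler.get_sample(method="topk", k=2)
print(picked["ImageID"].tolist())  # ['img2', 'img3']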