Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Sampler Plugin #115

Merged
merged 7 commits into from
Mar 2, 2021
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `Icdar13/15` dataset format (<https://github.com/openvinotoolkit/datumaro/pull/96>)
- Laziness, source caching, tracking of changes and partial updating for `Dataset` (<https://github.com/openvinotoolkit/datumaro/pull/102>)
- `Market-1501` dataset format (<https://github.com/openvinotoolkit/datumaro/pull/108>)
- Sampler Plugin that analyzes inference results from the given dataset and selects samples for annotation (<https://github.com/openvinotoolkit/datumaro/pull/115>)

### Changed
- OpenVINO model launcher is updated for OpenVINO r2021.1 (<https://github.com/openvinotoolkit/datumaro/pull/100>)
Expand Down
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,11 @@ CVAT annotations ---> Publication, statistics etc.
- for detection task, based on bboxes
- for re-identification task, based on labels,
avoiding having same IDs in training and test splits
- Sampling a dataset
  - analyzes inference results from the given dataset
  and selects the ‘best’ and the ‘least number of’ samples for annotation
  - selects the samples that best suit model training
  - sampling with an entropy-based algorithm
- Dataset quality checking
- Simple checking for errors
- Comparison with model inference
Expand Down
3 changes: 3 additions & 0 deletions datumaro/plugins/sampler/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Copyright (C) 2021 Intel Corporation
#
# SPDX-License-Identifier: MIT
21 changes: 21 additions & 0 deletions datumaro/plugins/sampler/algorithm/algorithm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Copyright (C) 2021 Intel Corporation
#
# SPDX-License-Identifier: MIT

from enum import Enum

class SamplingMethod(Enum):
    """Strategies available for picking samples out of inference results."""
    topk = 1
    lowk = 2
    randk = 3
    mixk = 4
    randtopk = 5


class InferenceResultAnalyzer:
    """
    Basic interface for IRA (Inference Result Analyzer).

    Holds a dataset together with its inference results; concrete
    analyzers implement get_sample() to select samples from them.
    """

    def __init__(self, dataset, inference):
        # The dataset and inference results a subclass will analyze.
        self.data = dataset
        self.inference = inference
        # Expose the enum on the instance so subclasses can compare
        # method strings via self.sampling_method.<member>.name.
        self.sampling_method = SamplingMethod

    def get_sample(self, method: str, k: int):
        """Return k samples chosen by *method*; subclasses must override."""
        raise NotImplementedError
196 changes: 196 additions & 0 deletions datumaro/plugins/sampler/algorithm/entropy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
# Copyright (C) 2021 Intel Corporation
#
# SPDX-License-Identifier: MIT

import pandas as pd
import math
import re
import logging as log

from .algorithm import InferenceResultAnalyzer


class SampleEntropy(InferenceResultAnalyzer):
    """
    Entropy-based inference result analyzer.

    Scores every image by the (mean) entropy of the model's predicted
    class probabilities and selects samples for annotation based on
    that uncertainty score.
    """

    def __init__(self, data, inference):
        """
        Args:
            data: dataset in pd.DataFrame format; an "ImageID" column is required.
            inference: inference results in pd.DataFrame format; "ImageID" and
                "ClassProbability<N>" (1-based N) columns are required unless an
                "Uncertainty" column is already present.
        Raises:
            ValueError: if a required column is missing.
        """
        super().__init__(data, inference)

        # Both frames are joined on "ImageID" later, so it must exist in both.
        if "ImageID" not in data:
            raise ValueError("Invalid Data, ImageID not found in data")
        if "ImageID" not in inference:
            raise ValueError("Invalid Data, ImageID not found in inference")

        # Count the ClassProbability<N> columns; they drive the entropy
        # computation when no precomputed "Uncertainty" column exists.
        self.num_classes = sum(
            1 for head in inference.columns if re.match(r"ClassProbability\d+", head)
        )
        if self.num_classes <= 0:
            raise ValueError(
                "Invalid data, Inference do not have ClassProbability values!"
            )

        # rank: the inference DataFrame sorted by uncertainty rank
        # (rank 1 == most uncertain image).
        self.rank = self._rank_images().sort_values(by="rank")

    def get_sample(self, method: str, k: int, n: int = 3) -> pd.DataFrame:
        """
        Extract k sample images according to the given sampling method.

        Args:
            method:
                - 'topk': k samples with the highest uncertainty.
                - 'lowk': k samples with the lowest uncertainty.
                - 'randk': k samples drawn uniformly at random.
                - 'mixk' / 'randtopk': delegated to _get_sample_mixed().
            k: number of samples to extract.
            n: used by 'randtopk' only; n * k candidates are drawn first.
        Returns:
            The selected rows of self.data as a pd.DataFrame.
        Raises:
            ValueError: if k is not a positive integer, or method is unknown.
        """
        temp_rank = self.rank

        # 1. k must be a positive integer.
        if not isinstance(k, int):
            raise ValueError(
                f"Invalid value {k}. k must have an integer greater than zero."
            )
        if k <= 0:
            raise ValueError(
                f"Invalid number {k}. k must have a positive number greater than zero."
            )

        # 2. Select a sample according to the method.
        if k <= len(temp_rank):
            if method == self.sampling_method.topk.name:
                temp_rank = temp_rank[:k]
            elif method == self.sampling_method.lowk.name:
                temp_rank = temp_rank[-k:]
            elif method == self.sampling_method.randk.name:
                return self.data.sample(n=k).reset_index(drop=True)
            elif method in (
                self.sampling_method.mixk.name,
                self.sampling_method.randtopk.name,
            ):
                return self._get_sample_mixed(method=method, k=k, n=n)
            else:
                raise ValueError(f"Not Found method '{method}'")
        else:
            # More samples requested than available: warn and return everything.
            log.warning(
                "The number of samples is greater than the size of the selected subset."
            )

        columns = list(self.data.columns)
        merged_df = pd.merge(temp_rank, self.data, how="inner", on=["ImageID"])
        return merged_df[columns].reset_index(drop=True)

    def _get_sample_mixed(self, method: str, k: int, n: int = 3) -> pd.DataFrame:
        """
        Extract k samples with one of the mixed strategies.

        Args:
            method:
                - 'mixk': top-k/2 and low-k/2 halves by uncertainty.
                - 'randtopk': draw n * k at random, keep the k most uncertain.
            k: number of samples to extract.
            n: oversampling factor for 'randtopk'.
        Returns:
            The selected rows of self.data as a pd.DataFrame.
        """
        temp_rank = self.rank

        # Select a sample according to the method; oversized k falls through
        # and returns the full merged dataset (mirrors get_sample()).
        if k <= len(temp_rank):
            if method == self.sampling_method.mixk.name:
                # The extra element of an odd k goes to the top (most
                # uncertain) half.
                if k % 2 == 0:
                    temp_rank = pd.concat(
                        [temp_rank[: k // 2], temp_rank[-(k // 2) :]]
                    )
                else:
                    temp_rank = pd.concat(
                        [temp_rank[: (k // 2) + 1], temp_rank[-(k // 2) :]]
                    )
            elif method == self.sampling_method.randtopk.name:
                if n * k <= len(temp_rank):
                    temp_rank = temp_rank.sample(n=n * k).sort_values(by="rank")
                else:
                    log.warning("n * k exceeds the length of the inference")
                temp_rank = temp_rank[:k]

        columns = list(self.data.columns)
        merged_df = pd.merge(temp_rank, self.data, how="inner", on=["ImageID"])
        return merged_df[columns].reset_index(drop=True)

    def _rank_images(self) -> pd.DataFrame:
        """
        Rank the inference data by uncertainty.

        Returns:
            pd.DataFrame with ImageID, Uncertainty and rank columns
            (rank 1 == most uncertain).
        Raises:
            ValueError: if inference is missing or has NaN Uncertainty values.
        """
        # 1. Load inference.
        if self.inference is None:
            raise ValueError("Invalid Data, Failed to load inference result!")
        inference = pd.DataFrame(self.inference)

        # 2. Compute entropy-based uncertainty unless already provided.
        if "Uncertainty" not in inference:
            inference = self._calculate_uncertainty_from_classprob(inference=inference)

        # 3. Every row must have an Uncertainty value.
        na_df = inference.isna().sum()
        if "Uncertainty" in na_df and na_df["Uncertainty"] > 0:
            raise ValueError("Some inference results do not have Uncertainty values!")

        # 4. Average the per-row uncertainty per image and rank descending;
        #    method="first" breaks ties by original order.
        res = inference[["ImageID", "Uncertainty"]].groupby("ImageID").mean()
        res["rank"] = res["Uncertainty"].rank(ascending=False, method="first")
        return res.reset_index()

    def _calculate_uncertainty_from_classprob(
        self, inference: pd.DataFrame
    ) -> pd.DataFrame:
        """
        Add an "Uncertainty" column computed as the entropy of each row's
        ClassProbability<N> values.

        Args:
            inference: inference data without an "Uncertainty" column.
        Returns:
            The same DataFrame with an "Uncertainty" column added.
        Raises:
            ValueError: if any probability falls outside [0, 1].
        """
        prob_columns = [f"ClassProbability{j + 1}" for j in range(self.num_classes)]

        uncertainty = []
        # Iterate positionally (itertuples) instead of inference.loc[i][col]
        # chained indexing, so the computation does not silently rely on the
        # frame having a default RangeIndex.
        for probs in inference[prob_columns].itertuples(index=False):
            entropy = 0.0
            for p in probs:
                if p < 0 or p > 1:
                    raise ValueError(
                        "Invalid data, Math domain Error! p is between 0 and 1"
                    )
                # The epsilon guards math.log against p == 0.
                entropy -= p * math.log(p + 1e-14, math.e)
            uncertainty.append(entropy)

        inference["Uncertainty"] = uncertainty
        return inference
Loading