Skip to content

Commit

Permalink
Refactor sampler (#149)
Browse files Browse the repository at this point in the history
* Refactor sampler, make tests check pandas availability

- updated error messages
- updated cli options
  • Loading branch information
Maxim Zhiltsov authored Mar 6, 2021
1 parent 9ce4218 commit e32a4f9
Show file tree
Hide file tree
Showing 5 changed files with 416 additions and 465 deletions.
6 changes: 4 additions & 2 deletions datumaro/plugins/sampler/algorithm/algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@

from enum import Enum

SamplingMethod = Enum("SamplingMethod", ["topk", "lowk", "randk", "mixk", "randtopk"])
Algorithm = Enum("Algorithm", ["entropy"])

SamplingMethod = Enum("SamplingMethod",
["topk", "lowk", "randk", "mixk", "randtopk"])

Algorithm = Enum("Algorithm", ["entropy"])

class InferenceResultAnalyzer:
"""
Expand Down
86 changes: 43 additions & 43 deletions datumaro/plugins/sampler/algorithm/entropy.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
#
# SPDX-License-Identifier: MIT

import pandas as pd
import logging as log
import math
import re
import logging as log

import pandas as pd

from .algorithm import InferenceResultAnalyzer

Expand All @@ -21,58 +22,56 @@ def __init__(self, data, inference):
"""
Constructor function
Args:
data: Receive the data format in pd.DataFrame format. ImageID is an essential element for data.
data: Receive the data format in pd.DataFrame format.
ImageID is an essential element for data.
inference:
Receive the inference format in the form of pd.DataFrame.
ImageID and ClassProbability are essential for inferences.
"""
super().__init__(data, inference)

# check the existence of "ImageID" in data & inference
if "ImageID" not in data:
if 'ImageID' not in data:
raise Exception("Invalid Data, ImageID not found in data")
if "ImageID" not in inference:
if 'ImageID' not in inference:
raise Exception("Invalid Data, ImageID not found in inference")

# check the existence of "ClassProbability" in inference
self.num_classes = 0
for head in list(inference):
m = re.match("ClassProbability\d+", head)
if m is not None:
if re.match(r"ClassProbability\d+", head):
self.num_classes += 1

if not self.num_classes > 0:
if self.num_classes == 0:
raise Exception(
"Invalid data, Inference do not have ClassProbability values!"
)
"Invalid data, Inference do not have ClassProbability values")

# rank: The inference DataFrame, sorted according to the score.
self.rank = self._rank_images().sort_values(by="rank")
self.rank = self._rank_images().sort_values(by='rank')

def get_sample(self, method: str, k: int, n: int = 3) -> pd.DataFrame:
"""
A function that extracts sample data and returns it.
Args:
method:
- 'topk': It extracts the k sample data with the highest uncertainty.
- 'lowk': It extracts the k sample data with the lowest uncertainty.
- 'topk': It extracts the k sample data with the
highest uncertainty.
- 'lowk': It extracts the k sample data with the
lowest uncertainty.
- 'randk': Extract and return k randomly chosen samples.
k: number of sample data
n: Parameters to be used in the randtopk method, Variable to first extract data of multiple n of k.
n: Multiplier used only by the randtopk method: n * k samples are
drawn at random first, and the k most uncertain of them are returned.
Returns:
Extracted sample data : pd.DataFrame
"""
temp_rank = self.rank

# 1. k value check
if not isinstance(k, int):
raise Exception(
if not isinstance(k, int) or k <= 0:
raise ValueError(
f"Invalid value {k}. k must have an integer greater than zero."
)
elif k <= 0:
raise Exception(
f"Invalid number {k}. k must have a positive number greater than zero."
)

# 2. Select a sample according to the method
if k <= len(temp_rank):
Expand All @@ -82,20 +81,19 @@ def get_sample(self, method: str, k: int, n: int = 3) -> pd.DataFrame:
temp_rank = temp_rank[-k:]
elif method == self.sampling_method.randk.name:
return self.data.sample(n=k).reset_index(drop=True)
elif method in [
self.sampling_method.mixk.name,
self.sampling_method.randtopk.name,
]:
elif method in {self.sampling_method.mixk.name,
self.sampling_method.randtopk.name}:
return self._get_sample_mixed(method=method, k=k, n=n)
else:
raise Exception(f"Not Found method '{method}'")
raise ValueError(f"Unknown sampling method '{method}'")
else:
log.warning(
"The number of samples is greater than the size of the selected subset."
"The number of samples is greater than the size of the"
"selected subset."
)

columns = list(self.data.columns)
merged_df = pd.merge(temp_rank, self.data, how="inner", on=["ImageID"])
merged_df = pd.merge(temp_rank, self.data, how='inner', on=['ImageID'])
return merged_df[columns].reset_index(drop=True)

def _get_sample_mixed(self, method: str, k: int, n: int = 3) -> pd.DataFrame:
Expand All @@ -104,9 +102,11 @@ def _get_sample_mixed(self, method: str, k: int, n: int = 3) -> pd.DataFrame:
Args:
method:
- 'mixk': Return top-k and low-k halves based on uncertainty.
- 'randomtopk': Randomly extract n*k and return k with high uncertainty.
- 'randtopk': Randomly extract n * k samples and return the k
with the highest uncertainty.
k: number of sample data
n: Number to extract n*k from total data according to n, and top-k from it
n: Multiplier for randtopk: n * k samples are drawn from the data,
and the top k of those are kept
Returns:
Extracted sample data : pd.DataFrame
"""
Expand All @@ -123,13 +123,13 @@ def _get_sample_mixed(self, method: str, k: int, n: int = 3) -> pd.DataFrame:
)
elif method == self.sampling_method.randtopk.name:
if n * k <= len(temp_rank):
temp_rank = temp_rank.sample(n=n * k).sort_values(by="rank")
temp_rank = temp_rank.sample(n=n * k).sort_values(by='rank')
else:
log.warning(msg="n * k exceeds the length of the inference")
temp_rank = temp_rank[:k]

columns = list(self.data.columns)
merged_df = pd.merge(temp_rank, self.data, how="inner", on=["ImageID"])
merged_df = pd.merge(temp_rank, self.data, how='inner', on=['ImageID'])
return merged_df[columns].reset_index(drop=True)

def _rank_images(self) -> pd.DataFrame:
Expand All @@ -143,29 +143,29 @@ def _rank_images(self) -> pd.DataFrame:
if self.inference is not None:
inference = pd.DataFrame(self.inference)
else:
raise Exception("Invalid Data, Failed to load inference result!")
raise Exception("Invalid Data, Failed to load inference result")

# 2. If the inference data frame does not contain an uncertainty score, calculate it
if "Uncertainty" not in inference:
inference = self._calculate_uncertainty_from_classprob(inference=inference)
if 'Uncertainty' not in inference:
inference = self._calculate_uncertainty_from_classprob(inference)

# 3. Check that Uncertainty values are in place.
na_df = inference.isna().sum()
if "Uncertainty" in na_df and na_df["Uncertainty"] > 0:
raise Exception("Some inference results do not have Uncertainty values!")
if 'Uncertainty' in na_df and na_df['Uncertainty'] > 0:
raise Exception("Some inference results do not have Uncertainty values")

# 4. Ranked based on Uncertainty score
res = inference[["ImageID", "Uncertainty"]].groupby("ImageID").mean()
res["rank"] = res["Uncertainty"].rank(ascending=False, method="first")
res = inference[['ImageID', 'Uncertainty']].groupby('ImageID').mean()
res['rank'] = res['Uncertainty'].rank(ascending=False, method='first')
res = res.reset_index()

return res

def _calculate_uncertainty_from_classprob(
self, inference: pd.DataFrame
) -> pd.DataFrame:
self, inference: pd.DataFrame) -> pd.DataFrame:
"""
A function that calculates uncertainty based on entropy through ClassProbability values.
A function that calculates uncertainty based on entropy through
ClassProbability values.
Args:
inference: Inference data where uncertainty has not been calculated
Returns:
Expand All @@ -177,7 +177,7 @@ def _calculate_uncertainty_from_classprob(
for i in range(len(inference)):
entropy = 0
for j in range(self.num_classes):
p = inference.loc[i][f"ClassProbability{j+1}"]
p = inference.loc[i][f'ClassProbability{j+1}']
if p < 0 or p > 1:
raise Exception(
"Invalid data, Math domain Error! p is between 0 and 1"
Expand All @@ -186,6 +186,6 @@ def _calculate_uncertainty_from_classprob(

uncertainty.append(entropy)

inference["Uncertainty"] = uncertainty
inference['Uncertainty'] = uncertainty

return inference
Loading

0 comments on commit e32a4f9

Please sign in to comment.