
Commit

update new features (#17)
* enhance pool loader instantiation

* enhance pool loader instantiation

* add logger save_to_parquet method

* add logger save_to_parquet method

* enhance datastores -- enforce default column names

* make BADGE a hybrid strategy

* add warning when index is bigger than dataset

* update lock file

* make estimator call hooks defined on Self

* rename target to labels

* format clustering utils

* rename target to labels

* rename target to labels

* format

* lint

* format and lint

* condense pool-based steps

* fix typo and lint

* lint

* fix seals

* fix train loader when anchoral
pietrolesci authored Oct 17, 2023
1 parent e063ede commit 279ae46
Showing 25 changed files with 1,529 additions and 1,525 deletions.
energizer/active_learning/clustering_utilities.py (27 changes: 14 additions & 13 deletions)
@@ -2,7 +2,8 @@
 
 import numpy as np
 from numpy.random import RandomState
-from scipy.spatial.distance import cdist
+
+# from scipy.spatial.distance import cdist
 from sklearn.cluster import KMeans, kmeans_plusplus
 from sklearn.metrics import silhouette_score
 from sklearn.preprocessing import StandardScaler
@@ -81,6 +82,18 @@ def kmeans_silhouette_sampling(X: np.ndarray, num_clusters: int, rng: RandomStat
     return _kmeans(X, num_clusters, rng, True, normalize)
 
 
+def kmeans_pp_sampling(X: np.ndarray, num_clusters: int, rng: RandomState, *args, **kwargs) -> List[int]:
+    _, indices = kmeans_plusplus(X, num_clusters, random_state=rng)
+
+    unique_ids = list(set(indices.tolist()))
+
+    # can this generate duplicates?
+    if len(unique_ids) != len(indices):
+        print(f"Kmeans++ returned duplicates. {X.shape=} {num_clusters=}")
+
+    return unique_ids
+
+
 # def kmeans_pp_sampling(X: np.ndarray, num_clusters: int, rng: RandomState, normalize: bool = True) -> List[int]:
 #     """kmeans++ seeding algorithm.
@@ -111,15 +124,3 @@ def kmeans_silhouette_sampling(X: np.ndarray, num_clusters: int, rng: RandomStat
 # centers_ids.append(new_center_id)
 
 # return centers_ids
-
-
-def kmeans_pp_sampling(X: np.ndarray, num_clusters: int, rng: RandomState, *args, **kwargs) -> List[int]:
-    _, indices = kmeans_plusplus(X, num_clusters, random_state=rng)
-
-    unique_ids = list(set(indices.tolist()))
-
-    # can this generate duplicates?
-    if len(unique_ids) != len(indices):
-        print(f"Kmeans++ returned duplicates. {X.shape=} {num_clusters=}")
-
-    return unique_ids
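
For context, a minimal sketch of what the relocated kmeans_pp_sampling does, assuming scikit-learn >= 0.24 (the release that introduced sklearn.cluster.kmeans_plusplus); the synthetic data and cluster count are illustrative only:

import numpy as np
from numpy.random import RandomState
from sklearn.cluster import kmeans_plusplus

rng = RandomState(42)
X = rng.randn(100, 8)  # 100 points, 8 features

# kmeans_plusplus returns the selected centers and their row indices in X
centers, indices = kmeans_plusplus(X, n_clusters=10, random_state=rng)

# de-duplicate defensively, mirroring the guard in kmeans_pp_sampling
unique_ids = list(set(indices.tolist()))
if len(unique_ids) != len(indices):
    print(f"Kmeans++ returned duplicates. {X.shape=}")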
energizer/active_learning/datastores/base.py (21 changes: 11 additions & 10 deletions)
@@ -132,7 +132,7 @@ def pool_dataset(self, round: Optional[int] = None, with_indices: Optional[List[
         if with_indices is not None:
             mask = mask & self._train_data[SpecialKeys.ID].isin(with_indices)
         return Dataset.from_pandas(
-            self._train_data.loc[mask, [i for i in self._train_data.columns if i != InputKeys.TARGET]]
+            self._train_data.loc[mask, [i for i in self._train_data.columns if i != InputKeys.LABELS]]
         )
 
     def label(
@@ -155,11 +155,11 @@ def label(
         # train-validation split
         if validation_perc is not None:
             n_val = floor(validation_perc * len(indices)) or 1  # at least add one
-            currentdata = self._train_data.loc[mask, [SpecialKeys.ID, InputKeys.TARGET]]
+            currentdata = self._train_data.loc[mask, [SpecialKeys.ID, InputKeys.LABELS]]
             val_indices = sample(
                 indices=currentdata[SpecialKeys.ID].tolist(),
                 size=n_val,
-                labels=currentdata[InputKeys.TARGET].tolist(),
+                labels=currentdata[InputKeys.LABELS].tolist(),
                 mode=validation_sampling,
                 random_state=self._rng,
             )
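
A small aside on the floor(...) or 1 idiom kept in this hunk: when the computed validation size floors to 0, the `or 1` fallback guarantees at least one validation example. A quick, self-contained illustration (the numbers are made up):

from math import floor

validation_perc = 0.1
n_indices = 5  # newly labelled examples this round

n_val = floor(validation_perc * n_indices) or 1  # floor(0.5) == 0, so `or 1` kicks in
print(n_val)  # 1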
@@ -180,13 +180,13 @@ def sample_from_pool(
         mask = self._pool_mask(round)
         if with_indices:
             mask = mask & self._train_data[SpecialKeys.ID].isin(with_indices)
-        data = self._train_data.loc[mask, [SpecialKeys.ID, InputKeys.TARGET]]
+        data = self._train_data.loc[mask, [SpecialKeys.ID, InputKeys.LABELS]]
 
         return sample(
             indices=data[SpecialKeys.ID].tolist(),
             size=size,
             random_state=random_state or self._rng,
-            labels=data[InputKeys.TARGET].tolist(),
+            labels=data[InputKeys.LABELS].tolist(),
             **kwargs,
         )

@@ -200,19 +200,19 @@ def save_labelled_dataset(self, save_dir: Union[str, Path]) -> None:
         """
 
     def _labelled_mask(self, round: Optional[int] = None) -> pd.Series:
-        mask = self._train_data[SpecialKeys.IS_LABELLED] == True
+        mask = self._train_data[SpecialKeys.IS_LABELLED] == True  # noqa: E712
         if round is not None:
             mask = mask & (self._train_data[SpecialKeys.LABELLING_ROUND] <= round)
         return mask
 
     def _train_mask(self, round: Optional[int] = None) -> pd.Series:
-        return self._labelled_mask(round) & (self._train_data[SpecialKeys.IS_VALIDATION] == False)
+        return self._labelled_mask(round) & (self._train_data[SpecialKeys.IS_VALIDATION] == False)  # noqa: E712
 
     def _validation_mask(self, round: Optional[int] = None) -> pd.Series:
-        return self._labelled_mask(round) & (self._train_data[SpecialKeys.IS_VALIDATION] == True)
+        return self._labelled_mask(round) & (self._train_data[SpecialKeys.IS_VALIDATION] == True)  # noqa: E712
 
     def _pool_mask(self, round: Optional[int] = None) -> pd.Series:
-        mask = self._train_data[SpecialKeys.IS_LABELLED] == False
+        mask = self._train_data[SpecialKeys.IS_LABELLED] == False  # noqa: E712
         if round is not None:
             mask = mask | (self._train_data[SpecialKeys.LABELLING_ROUND] > round)
         return mask
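
The `# noqa: E712` markers silence flake8's E712 rule ("comparison to True should be `if cond is True:`"), which is the right call here: on a pandas Series, `==` broadcasts element-wise and produces a boolean mask, while `is True`/`is False` would compare object identity and always yield a single False. A minimal sketch of the distinction (pandas assumed available; the column name is illustrative):

import pandas as pd

df = pd.DataFrame({"is_labelled": [True, False, True]})

mask = df["is_labelled"] == True  # noqa: E712 -- element-wise: [True, False, True]
print(df.loc[mask])               # rows 0 and 2

print(df["is_labelled"] is True)  # False: identity check on the Series object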
@@ -233,7 +233,8 @@ def get_pool_embeddings(self, ids: List[int]) -> np.ndarray:
 
     def get_train_embeddings(self, ids: List[int]) -> np.ndarray:
         # check all the ids are training ids
-        assert len(set(self.get_train_ids()).intersection(set(ids))) == len(ids)  # type: ignore
+        train_ids = self.get_train_ids()  # type: ignore
+        assert all(i in train_ids for i in ids), set(train_ids).difference(set(ids))
 
         # now that we are sure, let's unmask them and get the items
         self.unmask_ids_from_index(ids)
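
The rewritten assertion performs the same membership check but attaches a set difference as the assertion message, so a failure reports concrete ids rather than a bare AssertionError. A toy illustration of the assert-with-message pattern (here the difference is taken from the requested side so the unknown ids surface):

train_ids = [1, 2, 3]
ids = [2, 9]

try:
    assert all(i in train_ids for i in ids), set(ids).difference(set(train_ids))
except AssertionError as err:
    print(err)  # {9} -- the id that is not a training id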
energizer/active_learning/datastores/classification.py (17 changes: 8 additions & 9 deletions)
@@ -1,4 +1,4 @@
-from typing import List, Optional, Union
+from typing import List, Optional
 
 from datasets import Dataset
 from transformers import PreTrainedTokenizerBase
@@ -13,21 +13,20 @@ class ActivePandasDataStoreForSequenceClassification(SequenceClassificationMixin
     @classmethod
     def from_datasets(
         cls,
-        input_names: Union[str, List[str]],
-        target_name: str,
         tokenizer: PreTrainedTokenizerBase,
-        train_dataset: Dataset,
-        validation_dataset: Optional[Dataset] = None,
-        test_dataset: Optional[Dataset] = None,
         uid_name: Optional[str] = None,
         on_cpu: Optional[List[str]] = None,
         seed: Optional[int] = 42,
+        train_dataset: Optional[Dataset] = None,
+        validation_dataset: Optional[Dataset] = None,
+        test_dataset: Optional[Dataset] = None,
     ) -> Self:
-        obj = cls(seed)
+        obj = cls(seed)  # type: ignore
         obj = _from_datasets(
             obj=obj,
-            input_names=input_names,
-            target_name=target_name,
+            mandatory_input_names=cls.MANDATORY_INPUT_NAMES,
+            optional_input_names=cls.OPTIONAL_INPUT_NAMES,
+            mandatory_target_name=cls.MANDATORY_TARGET_NAME,
             tokenizer=tokenizer,
             uid_name=uid_name,
             on_cpu=on_cpu,
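
After this change the datastore derives column names from class-level constants (MANDATORY_INPUT_NAMES, OPTIONAL_INPUT_NAMES, MANDATORY_TARGET_NAME) instead of per-call input_names/target_name arguments, and every dataset split becomes an optional keyword. A hedged usage sketch of the new signature; the checkpoint and dataset columns below are assumptions for illustration, not taken from this diff:

from datasets import Dataset
from transformers import AutoTokenizer

from energizer.active_learning.datastores.classification import (
    ActivePandasDataStoreForSequenceClassification,
)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # assumed checkpoint

# assumed columns; the actual requirements come from the class constants above
train_dataset = Dataset.from_dict({"input_ids": [[101, 2023, 102]], "labels": [0]})

datastore = ActivePandasDataStoreForSequenceClassification.from_datasets(
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    seed=42,
)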
(diffs for the remaining 22 changed files not shown)
