Skip to content

Commit

Permalink
Add weighted scheme and tutorial
Browse files Browse the repository at this point in the history
Signed-off-by: zethson <[email protected]>
  • Loading branch information
Zethson committed Apr 24, 2024
1 parent 1dc85ab commit d350978
Show file tree
Hide file tree
Showing 7 changed files with 350 additions and 85 deletions.
5 changes: 3 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@ and this project adheres to [Semantic Versioning][].
[keep a changelog]: https://keepachangelog.com/en/1.0.0/
[semantic versioning]: https://semver.org/spec/v2.0.0.html

## [Unreleased]
## [0.0.1]

### Added

- TODO
- First implementation of FaissImputer
- `mean`, `median`, and `weighted` imputation strategies
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ Please refer to the [documentation][link-docs]. In particular, the

## Installation

You need to have Python 3.9 or newer installed on your system. If you don't have
Python installed, we recommend installing [Mambaforge](https://github.com/conda-forge/miniforge#mambaforge).
You need to have Python 3.10 or newer installed on your system.
If you don't have Python installed, we recommend installing [Mambaforge](https://github.com/conda-forge/miniforge#mambaforge).

Install the latest release of `fknni` from [PyPI](https://pypi.org/project/fknni/):

Expand Down
44 changes: 0 additions & 44 deletions docs/notebooks/example.ipynb

This file was deleted.

288 changes: 288 additions & 0 deletions docs/notebooks/faiss.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,11 @@ doc = [
"ipykernel",
"ipython",
"sphinx-copybutton",
"matplotlib"
]
test = [
"pytest",
"coverage",
"pandas",
]

[tool.coverage.run]
Expand Down
2 changes: 2 additions & 0 deletions src/fknni/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,6 @@

__all__ = ["faiss"]

from .faiss import FaissImputer

__version__ = version("fknni")
90 changes: 54 additions & 36 deletions src/fknni/faiss/faiss.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,24 @@


class FaissImputer(BaseEstimator, TransformerMixin):
"""Imputer for completing missing values using Faiss."""
"""Imputer for completing missing values using Faiss, incorporating weighted averages based on distance."""

def __init__(
self,
n_neighbors: int = 3,
n_neighbors: int = 5,
metric: Literal["l2", "ip"] = "l2",
strategy: Literal["mean", "median"] = "mean",
strategy: Literal["mean", "median", "weighted"] = "weighted",
index_factory: str = "Flat",
):
"""Initializes FaissImputer with specified parameters.
"""Initializes FaissImputer with specified parameters that are used for the imputation.
Args:
n_neighbors: Number of neighbors to use for imputation.
metric: Distance metric to use for neighbor search.
strategy: Method to compute imputed values.
index_factory: Description of the Faiss index type to build.
n_neighbors: Number of neighbors to use for imputation. Defaults to 5.
metric: Distance metric to use for neighbor search. Defaults to 'l2'.
strategy: Method to compute imputed values among neighbors.
The weighted strategy is similar to scikit-learn's implementation,
where closer neighbors have a higher influence on the imputation.
index_factory: Description of the Faiss index type to build. Defaults to 'Flat'.
"""
super().__init__()
self.n_neighbors = n_neighbors
Expand All @@ -39,19 +41,14 @@ def fit(self, X: np.ndarray | pd.DataFrame, *, y: np.ndarray | None = None) -> "
y: Ignored, present for compatibility with sklearn's TransformerMixin.
Raises:
ValueError: If any parameters are set to an invalid value.
ValueError: If any parameters are set to an invalid value.
"""
X = check_array(X, dtype=np.float32, force_all_finite="allow-nan")
X = check_array(X, force_all_finite="allow-nan")
self.input_dtype_ = X.dtype

if not isinstance(self.n_neighbors, int) or self.n_neighbors <= 0:
raise ValueError("n_neighbors must be a positive integer")
if self.metric not in {"l2", "ip"}:
raise ValueError("metric must be either 'l2' or 'ip'")
if self.strategy not in {"mean", "median"}:
raise ValueError("strategy must be either 'mean' or 'median'")

mask = ~np.isnan(X).any(axis=1)
X_non_missing = X[mask]
# Handle missing values for indexing
self.means_ = np.nanmean(X, axis=0) # Store means for missing value handling
X_non_missing = np.where(np.isnan(X), self.means_, X).astype(np.float32)

index = faiss.index_factory(
X_non_missing.shape[1],
Expand All @@ -71,28 +68,49 @@ def transform(self, X: np.ndarray | pd.DataFrame) -> np.ndarray:
X: Data with missing values to impute. Expected to be either a NumPy array or a pandas DataFrame.
Returns:
Data with imputed values as a NumPy array.
Data with imputed values as a NumPy array of the original data type.
"""
X = check_array(X, dtype=np.float32, force_all_finite="allow-nan")
X = check_array(X, force_all_finite="allow-nan")
check_is_fitted(self, "index_")
X_imputed = np.array(X, copy=True)
X_imputed = np.array(X, dtype=np.float32) # Use float32 for processing
missing_mask = np.isnan(X_imputed)

placeholder_values = (
np.nanmean(X_imputed, axis=0) if self.strategy == "mean" else np.nanmedian(X_imputed, axis=0)
)
X_filled = np.where(missing_mask, self.means_, X_imputed)

for sample_idx in np.where(missing_mask.any(axis=1))[0]:
sample_row = X_imputed[sample_idx, :]
sample_row_filled = X_filled[sample_idx]
sample_missing_cols = np.where(missing_mask[sample_idx])[0]
sample_row[sample_missing_cols] = placeholder_values[sample_missing_cols]

_, neighbor_indices = self.index_.search(sample_row.reshape(1, -1), self.n_neighbors)
selected_values = X_imputed[neighbor_indices[0], :][:, sample_missing_cols]

sample_row[sample_missing_cols] = (
np.mean(selected_values, axis=0) if self.strategy == "mean" else np.median(selected_values, axis=0)
)
X_imputed[sample_idx, :] = sample_row

return X_imputed
distances, neighbor_indices = self.index_.search(sample_row_filled.reshape(1, -1), self.n_neighbors)
neighbors = X_filled[neighbor_indices[0]]

for col in sample_missing_cols:
valid_neighbors = neighbors[:, col][~np.isnan(neighbors[:, col])]
valid_distances = distances[0, : len(valid_neighbors)]

if len(valid_neighbors) < self.n_neighbors:
if len(valid_neighbors) == 0:
imputed_value = self.means_[col]
else:
if self.strategy in {"mean", "weighted"}:
weights = (
1 / (1 + valid_distances)
if self.strategy == "weighted"
else np.ones_like(valid_distances)
)
imputed_value = np.average(valid_neighbors, weights=weights)
elif self.strategy == "median":
imputed_value = np.median(valid_neighbors)
else:
if self.strategy == "mean":
imputed_value = np.mean(valid_neighbors)
elif self.strategy == "median":
imputed_value = np.median(valid_neighbors)
elif self.strategy == "weighted":
small_constant = 1e-10 # Small constant to prevent division by zero
weights = 1 / (valid_distances + small_constant)
imputed_value = np.average(valid_neighbors, weights=weights)

X_imputed[sample_idx, col] = imputed_value

return X_imputed.astype(self.input_dtype_) # Cast back to the original input dtype

0 comments on commit d350978

Please sign in to comment.