Skip to content

Commit

Permalink
Adds SV Notebook and Reworks Imputers (#257)
Browse files Browse the repository at this point in the history
* Added interpretations for Shapley properties and extend lore of the cooking example

* updated stacked-bar default label and title

* updated function signatures

* updated sv notebook

* updated sv notebook

* updates games input validator

makes the validator a bit more readable and reduces the lines of code

* adds NotImplementedError to aggregate_interaction_values

* reworks imputer and fixes #264

---------

Co-authored-by: Maximilian <[email protected]>
  • Loading branch information
Advueu963 and mmschlk authored Oct 31, 2024
1 parent fdcff8e commit 0b02599
Show file tree
Hide file tree
Showing 18 changed files with 953 additions and 144 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
- computing metrics now tries to resolve not-matching interaction indices and will throw a warning instead of a ValueError [#179](https://github.com/mmschlk/shapiq/issues/179)
- removed the `sample_replacements` parameter from `MarginalImputer` which is now handled by the `BaselineImputer`. Added a DeprecationWarning for the parameter, which will be removed in the next release.
- adds `BaselineImputer` [#107](https://github.com/mmschlk/shapiq/issues/107)
- adds `joint_marginal_distribution` parameter to `MarginalImputer` [#261](https://github.com/mmschlk/shapiq/issues/261)
- adds `joint_marginal_distribution` parameter to `MarginalImputer` with default value `True` [#261](https://github.com/mmschlk/shapiq/issues/261)
- fixes a bug with SIs not adding up to the model prediction because of a wrong value for the empty set [#264](https://github.com/mmschlk/shapiq/issues/264)

### v1.0.1 (2024-06-05)

Expand Down
797 changes: 771 additions & 26 deletions docs/source/notebooks/sv_calculation.ipynb

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions shapiq/approximator/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,11 @@ def aggregate_interaction_values(
"""
from ..aggregation import aggregate_interaction_values

if player_set is not None:
raise NotImplementedError(
"Aggregating interaction values for a subset of players is not implemented."
)

return aggregate_interaction_values(base_interactions, order=order)

@staticmethod
Expand Down
46 changes: 36 additions & 10 deletions shapiq/datasets/_all.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
"""This module contains functions to load datasets."""

import os
from typing import Union

import numpy as np
import pandas as pd

GITHUB_DATA_URL = "https://raw.githubusercontent.com/mmschlk/shapiq/main/data/"
Expand Down Expand Up @@ -29,14 +31,22 @@ def _try_load(csv_file_name: str) -> pd.DataFrame:
return data


def load_california_housing(to_numpy=False) -> tuple[pd.DataFrame, pd.Series]:
def load_california_housing(
to_numpy=False,
) -> Union[tuple[pd.DataFrame, pd.Series], tuple[np.ndarray, np.ndarray]]:
"""Load the California housing dataset.
Args:
to_numpy: Return numpy objects instead of pandas. Default is ``False``.
Returns:
The California housing dataset as a pandas DataFrame.
Example:
>>> from shapiq.datasets import load_california_housing
>>> x_data, y_data = load_california_housing()
>>> print(x_data.shape, y_data.shape)
((20640, 8), (20640,))
"""
dataset = _try_load("california_housing.csv")
class_label = "MedHouseVal"
Expand All @@ -49,17 +59,25 @@ def load_california_housing(to_numpy=False) -> tuple[pd.DataFrame, pd.Series]:
return x_data, y_data


def load_bike_sharing(to_numpy=False) -> tuple[pd.DataFrame, pd.Series]:
"""Load the bike-sharing dataset from openml.
Args:
to_numpy: Return numpy objects instead of pandas. ``Default is False.``
def load_bike_sharing(
to_numpy=False,
) -> Union[tuple[pd.DataFrame, pd.Series], tuple[np.ndarray, np.ndarray]]:
"""Load the bike-sharing dataset from openml and preprocess it.
Note:
The function requires the `sklearn` package to be installed.
Args:
to_numpy: Return numpy objects instead of pandas. Default is ``False``.
Returns:
The bike-sharing dataset as a pandas DataFrame.
Example:
>>> from shapiq.datasets import load_bike_sharing
>>> x_data, y_data = load_bike_sharing()
>>> print(x_data.shape, y_data.shape)
((17379, 12), (17379,))
"""
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
Expand Down Expand Up @@ -112,19 +130,27 @@ def load_bike_sharing(to_numpy=False) -> tuple[pd.DataFrame, pd.Series]:
return x_data, y_data


def load_adult_census(to_numpy=False) -> tuple[pd.DataFrame, pd.Series]:
def load_adult_census(
to_numpy=False,
) -> Union[tuple[pd.DataFrame, pd.Series], tuple[np.ndarray, np.ndarray]]:
"""Load the adult census dataset from the UCI Machine Learning Repository.
Original source: https://archive.ics.uci.edu/ml/datasets/adult
Args:
to_numpy: Return numpy objects instead of pandas. Default is ``False``.
Note:
The function requires the `sklearn` package to be installed.
Args:
to_numpy: Return numpy objects instead of pandas. Default is ``False``.
Returns:
The adult census dataset as a pandas DataFrame.
Example:
>>> from shapiq.datasets import load_adult_census
>>> x_data, y_data = load_adult_census()
>>> print(x_data.shape, y_data.shape)
((45222, 14), (45222,))
"""
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
Expand Down
14 changes: 11 additions & 3 deletions shapiq/explainer/tabular.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def __init__(
random_state: Optional[int] = None,
**kwargs,
) -> None:
from shapiq.games.imputer import ConditionalImputer, MarginalImputer
from shapiq.games.imputer import BaselineImputer, ConditionalImputer, MarginalImputer

if index not in AVAILABLE_INDICES:
raise ValueError(f"Invalid index `{index}`. " f"Valid indices are {AVAILABLE_INDICES}.")
Expand All @@ -102,7 +102,15 @@ def __init__(
self._imputer = ConditionalImputer(
self.predict, self.data, random_state=random_state, **kwargs
)
elif isinstance(imputer, MarginalImputer) or isinstance(imputer, ConditionalImputer):
elif imputer == "baseline":
self._imputer = BaselineImputer(
self.predict, self.data, random_state=random_state, **kwargs
)
elif (
isinstance(imputer, MarginalImputer)
or isinstance(imputer, ConditionalImputer)
or isinstance(imputer, BaselineImputer)
):
self._imputer = imputer
else:
raise ValueError(
Expand Down Expand Up @@ -143,7 +151,7 @@ def explain(
imputer = self._imputer.fit(x)

# explain
interaction_values = self._approximator.approximate(budget=budget, game=imputer)
interaction_values = self._approximator(budget=budget, game=imputer)
interaction_values.baseline_value = self.baseline_value

return interaction_values
Expand Down
99 changes: 45 additions & 54 deletions shapiq/games/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ def precomputed(self) -> bool:

@property
def normalize(self) -> bool:
"""Indication whether the game values are normalized."""
"""Indication whether the game values are getting normalized."""
return self.normalization_value != 0

@property
Expand All @@ -167,17 +167,28 @@ def is_normalized(self) -> bool:

def _check_coalitions(
self,
coalitions: Union[np.ndarray, list[Union[tuple[int], tuple[str]]]],
coalitions: Union[
np.ndarray,
list[tuple[int, ...]],
list[tuple[str, ...]],
tuple[int, ...],
tuple[str, ...],
],
) -> np.ndarray:
"""
"""Validates the coalitions and convert them to one-hot encoding.
Check if the coalitions are in the correct format and convert them to one-hot encoding.
The format may either be a numpy array containg the coalitions in one-hot encoding or a list of tuples with integers or strings.
The format may either be a numpy array containing the coalitions in one-hot encoding or a
list of tuples with integers or strings.
Args:
coalitions: The coalitions to convert to one-hot encoding.
Returns:
np.ndarray: The coalitions in the correct format
Raises:
TypeError: If the coalitions are not in the correct format.
Examples:
>>> coalitions = np.asarray([[1, 0, 0, 0], [0, 1, 1, 0]])
>>> coalitions = [(0, 1), (1, 2)]
Expand All @@ -189,79 +200,63 @@ def _check_coalitions(
>>> coalitions = [1, 0, 0, 0]
>>> coalitions = [(1, "Alice")]
>>> coalitions = np.array([1, -1, 2])
"""
error_message = (
"List may only contain tuples of integers or strings."
"The tuples are not allowed to have heterogeneous types."
"Reconcile the docs for correct format of coalitions."
"List may only contain tuples of integers or strings. The tuples are not allowed to "
"have heterogeneous types. See the docs for correct format of coalitions. If strings "
"are used, the player_name_lookup has to be provided during initialization."
)

# check for array input and do validation
if isinstance(coalitions, np.ndarray):

# Check that coalition is contained in array
if len(coalitions) == 0:
if len(coalitions) == 0: # check that coalition is contained in array
raise TypeError("The array of coalitions is empty.")

# Check if single coalition is correctly given
if coalitions.ndim == 1:
if coalitions.ndim == 1: # check if single coalition is correctly given
if len(coalitions) < self.n_players or len(coalitions) > self.n_players:
raise TypeError(
"The array of coalitions is not correctly formatted."
f"It should have a length of {self.n_players}"
)
coalitions = coalitions.reshape((1, self.n_players))

# Check that all coalitions have the correct number of players
if coalitions.shape[1] != self.n_players:
if coalitions.shape[1] != self.n_players: # check if players match
raise TypeError(
f"The number of players in the coalitions ({coalitions.shape[1]}) does not match "
f"Number of players in the coalitions ({coalitions.shape[1]}) does not match "
f"the number of players in the game ({self.n_players})."
)

# TODO maybe remove this, as it might increase runtime unnecessarily
# Check that values of numpy array are either 0 or 1
# check that values of numpy array are either 0 or 1
if not np.all(np.logical_or(coalitions == 0, coalitions == 1)):
raise TypeError("The values in the array of coalitions are not binary.")

return coalitions

# We now assume to work with list of tuples
# try for list of tuples
if isinstance(coalitions, tuple):
# if by any chance a tuple was given wrap into a list
coalitions = [coalitions]

try:
# convert list of tuples to one-hot encoding
coalitions = transform_coalitions_to_array(coalitions, self.n_players)

return coalitions
except Exception as err:
# It may either be the tuples contain strings or wrong format
if self.player_name_lookup is not None:
# We now assume the tuples to contain strings
try:
coalitions = [
(
tuple(self.player_name_lookup[player] for player in coalition)
if coalition != tuple()
else tuple()
)
for coalition in coalitions
]
coalitions = transform_coalitions_to_array(coalitions, self.n_players)

return coalitions
except Exception as err:
raise TypeError(error_message) from err

raise TypeError(error_message) from err
except (IndexError, TypeError):
pass
# assuming str input
if self.player_name_lookup is None:
raise ValueError("Player names are not provided. Cannot convert string to integer.")
try:
coalitions_from_str = []
for coalition in coalitions:
coal_indices = sorted([self.player_name_lookup[player] for player in coalition])
coalitions_from_str.append(tuple(coal_indices))
coalitions = transform_coalitions_to_array(coalitions_from_str, self.n_players)
return coalitions
except Exception as error:
raise TypeError(error_message) from error

def __call__(
self,
coalitions: Union[
np.ndarray, list[Union[tuple[int], tuple[str]]], tuple[Union[int, str]], str
np.ndarray,
list[tuple[int, ...]],
list[tuple[str, ...]],
tuple[int, ...],
tuple[str, ...],
],
verbose: bool = False,
) -> np.ndarray:
Expand All @@ -275,11 +270,8 @@ def __call__(
Returns:
The values of the coalitions.
"""
# check if coalitions are correct format
coalitions = self._check_coalitions(coalitions)

coalitions = self._check_coalitions(coalitions) # validate and convert input coalitions
verbose = verbose or self.verbose

if not self.precomputed and not verbose:
values = self.value_function(coalitions)
elif not self.precomputed and verbose:
Expand All @@ -291,7 +283,6 @@ def __call__(
values[i] = self.value_function(coalition)[0]
else:
values = self._lookup_coalitions(coalitions) # lookup the values present in the storage

return values - self.normalization_value

def _lookup_coalitions(self, coalitions: np.ndarray) -> np.ndarray:
Expand Down
20 changes: 18 additions & 2 deletions shapiq/games/imputer/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ class Imputer(Game):
data: The background data to use for the imputer.
model: The model to impute missing values for as a callable function.
sample_size: The number of samples to draw from the background data.
random_state: The random state to use for sampling.
empty_prediction: The model's prediction on an empty data point (all features missing).
Properties:
x: The explanation point to use the imputer on.
Expand All @@ -54,10 +56,11 @@ def __init__(
data = data.reshape(1, data.shape[0])
self.data = data
self.sample_size = sample_size
self.empty_prediction: float = 0.0 # will be overwritten in the subclasses
self.n_features = self.data.shape[1]
self._cat_features: list = [] if categorical_features is None else categorical_features
self._random_state = random_state
self._rng = np.random.default_rng(self._random_state)
self.random_state = random_state
self._rng = np.random.default_rng(self.random_state)

# fit x
self._x: Optional[np.ndarray] = None # will be overwritten @ fit
Expand Down Expand Up @@ -98,3 +101,16 @@ def fit(self, x: np.ndarray) -> "Imputer":
if self._x.ndim == 1:
self._x = self._x.reshape(1, x.shape[0])
return self

def insert_empty_value(self, outputs: np.ndarray, coalitions: np.ndarray) -> np.ndarray:
    """Overwrite the outputs of empty coalitions with the stored empty prediction.

    A coalition counts as empty when its row in ``coalitions`` contains no truthy entry.
    The ``outputs`` array is modified in place and is also returned.

    Args:
        outputs: The model's predictions on the imputed data points.
        coalitions: The coalitions for which the model's predictions were made.

    Returns:
        The same ``outputs`` array, with ``self.empty_prediction`` written at the positions
        of the empty coalitions.
    """
    # Boolean mask over the rows: True where no player is part of the coalition.
    is_empty = ~np.any(coalitions, axis=1)
    outputs[is_empty] = self.empty_prediction
    return outputs
5 changes: 3 additions & 2 deletions shapiq/games/imputer/baseline_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@ def __init__(
self.init_background(self.data)

# set empty value and normalization
self.empty_prediction: float = self._calc_empty_prediction()
if normalize:
self.normalization_value = self.empty_prediction

Expand Down Expand Up @@ -135,16 +134,18 @@ def init_background(self, data: np.ndarray) -> "BaselineImputer":
)
self._cat_features.append(feature)
self.baseline_values[0, feature] = summarized_feature
self.calc_empty_prediction() # reset the empty prediction to the new baseline values
return self

def _calc_empty_prediction(self) -> float:
def calc_empty_prediction(self) -> float:
"""Runs the model on empty data points (all features missing) to get the empty prediction.
Returns:
The empty prediction.
"""
empty_predictions = self.predict(self.baseline_values)
empty_prediction = float(empty_predictions[0])
self.empty_prediction = empty_prediction
if self.normalize: # reset the normalization value
self.normalization_value = empty_prediction
return empty_prediction
Loading

0 comments on commit 0b02599

Please sign in to comment.