Adds SV Notebook and Reworks Imputers (#257)

* Added interpretations for Shapley properties and extend lore of the cooking example * updated stacked-bar default label and title * updated function signatures * updated sv notebook * updated sv notebook * updates games input validator makes the validator a bit more readable and reduces the lines of code * adds NotImplementedError to aggregate_interaction_values * reworks imputer and fixes #264 --------- Co-authored-by: Maximilian <[email protected]>
mmschlk · Oct 31, 2024 · 0b02599 · 0b02599
1 parent fdcff8e
commit 0b02599
Show file tree

Hide file tree

Showing 18 changed files with 953 additions and 144 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,7 +14,8 @@
 - computing metrics now tries to resolve not-matching interaction indices and will throw a warning instead of a ValueError [#179](https://github.com/mmschlk/shapiq/issues/179)
 - removed the `sample_replacements` parameter from `MarginalImputer` which is now handled by the `BaselineImputer`. Added a DeprecationWarning for the parameter, which will be removed in the next release.
 - adds `BaselineImputer` [#107](https://github.com/mmschlk/shapiq/issues/107)
-- adds `joint_marginal_distribution` parameter to `MarginalImputer` [#261](https://github.com/mmschlk/shapiq/issues/261)
+- adds `joint_marginal_distribution` parameter to `MarginalImputer` with default value `True` [#261](https://github.com/mmschlk/shapiq/issues/261)
+- fixes a bug with SIs not adding up to the model prediction because of a wrong value for the empty set [#264](https://github.com/mmschlk/shapiq/issues/264)
 
 ### v1.0.1 (2024-06-05)
 

diff --git a/docs/source/notebooks/sv_calculation.ipynb b/docs/source/notebooks/sv_calculation.ipynb
diff --git a/shapiq/approximator/_base.py b/shapiq/approximator/_base.py
@@ -320,6 +320,11 @@ def aggregate_interaction_values(
         """
         from ..aggregation import aggregate_interaction_values
 
+        if player_set is not None:
+            raise NotImplementedError(
+                "Aggregating interaction values for a subset of players is not implemented."
+            )
+
         return aggregate_interaction_values(base_interactions, order=order)
 
     @staticmethod

diff --git a/shapiq/datasets/_all.py b/shapiq/datasets/_all.py
@@ -1,7 +1,9 @@
 """This module contains functions to load datasets."""
 
 import os
+from typing import Union
 
+import numpy as np
 import pandas as pd
 
 GITHUB_DATA_URL = "https://raw.githubusercontent.com/mmschlk/shapiq/main/data/"
@@ -29,14 +31,22 @@ def _try_load(csv_file_name: str) -> pd.DataFrame:
         return data
 
 
-def load_california_housing(to_numpy=False) -> tuple[pd.DataFrame, pd.Series]:
+def load_california_housing(
+    to_numpy=False,
+) -> Union[tuple[pd.DataFrame, pd.Series], tuple[np.ndarray, np.ndarray]]:
     """Load the California housing dataset.
 
     Args:
         to_numpy: Return numpy objects instead of pandas. Default is ``False``.
 
     Returns:
         The California housing dataset as a pandas DataFrame.
+
+    Example:
+        >>> from shapiq.datasets import load_california_housing
+        >>> x_data, y_data = load_california_housing()
+        >>> print(x_data.shape, y_data.shape)
+        (20640, 8) (20640,)
     """
     dataset = _try_load("california_housing.csv")
     class_label = "MedHouseVal"
@@ -49,17 +59,25 @@ def load_california_housing(to_numpy=False) -> tuple[pd.DataFrame, pd.Series]:
         return x_data, y_data
 
 
-def load_bike_sharing(to_numpy=False) -> tuple[pd.DataFrame, pd.Series]:
-    """Load the bike-sharing dataset from openml.
-
-    Args:
-        to_numpy: Return numpy objects instead of pandas. ``Default is False.``
+def load_bike_sharing(
+    to_numpy=False,
+) -> Union[tuple[pd.DataFrame, pd.Series], tuple[np.ndarray, np.ndarray]]:
+    """Load the bike-sharing dataset from openml and preprocess it.
 
     Note:
         The function requires the `sklearn` package to be installed.
 
+    Args:
+        to_numpy: Return numpy objects instead of pandas. Default is ``False``.
+
     Returns:
         The bike-sharing dataset as a pandas DataFrame.
+
+    Example:
+        >>> from shapiq.datasets import load_bike_sharing
+        >>> x_data, y_data = load_bike_sharing()
+        >>> print(x_data.shape, y_data.shape)
+        (17379, 12) (17379,)
     """
     from sklearn.compose import ColumnTransformer
     from sklearn.pipeline import Pipeline
@@ -112,19 +130,27 @@ def load_bike_sharing(to_numpy=False) -> tuple[pd.DataFrame, pd.Series]:
         return x_data, y_data
 
 
-def load_adult_census(to_numpy=False) -> tuple[pd.DataFrame, pd.Series]:
+def load_adult_census(
+    to_numpy=False,
+) -> Union[tuple[pd.DataFrame, pd.Series], tuple[np.ndarray, np.ndarray]]:
     """Load the adult census dataset from the UCI Machine Learning Repository.
 
     Original source: https://archive.ics.uci.edu/ml/datasets/adult
 
-    Args:
-        to_numpy: Return numpy objects instead of pandas. Default is ``False``.
-
     Note:
         The function requires the `sklearn` package to be installed.
 
+    Args:
+        to_numpy: Return numpy objects instead of pandas. Default is ``False``.
+
     Returns:
         The adult census dataset as a pandas DataFrame.
+
+    Example:
+        >>> from shapiq.datasets import load_adult_census
+        >>> x_data, y_data = load_adult_census()
+        >>> print(x_data.shape, y_data.shape)
+        (45222, 14) (45222,)
     """
     from sklearn.compose import ColumnTransformer
     from sklearn.impute import SimpleImputer

diff --git a/shapiq/explainer/tabular.py b/shapiq/explainer/tabular.py
@@ -86,7 +86,7 @@ def __init__(
         random_state: Optional[int] = None,
         **kwargs,
     ) -> None:
-        from shapiq.games.imputer import ConditionalImputer, MarginalImputer
+        from shapiq.games.imputer import BaselineImputer, ConditionalImputer, MarginalImputer
 
         if index not in AVAILABLE_INDICES:
             raise ValueError(f"Invalid index `{index}`. " f"Valid indices are {AVAILABLE_INDICES}.")
@@ -102,7 +102,15 @@ def __init__(
             self._imputer = ConditionalImputer(
                 self.predict, self.data, random_state=random_state, **kwargs
             )
-        elif isinstance(imputer, MarginalImputer) or isinstance(imputer, ConditionalImputer):
+        elif imputer == "baseline":
+            self._imputer = BaselineImputer(
+                self.predict, self.data, random_state=random_state, **kwargs
+            )
+        elif (
+            isinstance(imputer, MarginalImputer)
+            or isinstance(imputer, ConditionalImputer)
+            or isinstance(imputer, BaselineImputer)
+        ):
             self._imputer = imputer
         else:
             raise ValueError(
@@ -143,7 +151,7 @@ def explain(
         imputer = self._imputer.fit(x)
 
         # explain
-        interaction_values = self._approximator.approximate(budget=budget, game=imputer)
+        interaction_values = self._approximator(budget=budget, game=imputer)
         interaction_values.baseline_value = self.baseline_value
 
         return interaction_values

diff --git a/shapiq/games/base.py b/shapiq/games/base.py
@@ -157,7 +157,7 @@ def precomputed(self) -> bool:
 
     @property
     def normalize(self) -> bool:
-        """Indication whether the game values are normalized."""
+        """Indication whether the game values are getting normalized."""
         return self.normalization_value != 0
 
     @property
@@ -167,17 +167,28 @@ def is_normalized(self) -> bool:
 
     def _check_coalitions(
         self,
-        coalitions: Union[np.ndarray, list[Union[tuple[int], tuple[str]]]],
+        coalitions: Union[
+            np.ndarray,
+            list[tuple[int, ...]],
+            list[tuple[str, ...]],
+            tuple[int, ...],
+            tuple[str, ...],
+        ],
     ) -> np.ndarray:
-        """
+        """Validates the coalitions and converts them to one-hot encoding.
+
         Check if the coalitions are in the correct format and convert them to one-hot encoding.
-        The format may either be a numpy array containg the coalitions in one-hot encoding or a list of tuples with integers or strings.
+        The format may either be a numpy array containing the coalitions in one-hot encoding or a
+        list of tuples with integers or strings.
+
         Args:
             coalitions: The coalitions to convert to one-hot encoding.
         Returns:
             np.ndarray: The coalitions in the correct format
+
         Raises:
             TypeError: If the coalitions are not in the correct format.
+
         Examples:
             >>> coalitions = np.asarray([[1, 0, 0, 0], [0, 1, 1, 0]])
             >>> coalitions = [(0, 1), (1, 2)]
@@ -189,79 +200,63 @@ def _check_coalitions(
             >>> coalitions = [1, 0, 0, 0]
             >>> coalitions = [(1, "Alice")]
             >>> coalitions = np.array([1, -1, 2])
-
-
         """
         error_message = (
-            "List may only contain tuples of integers or strings."
-            "The tuples are not allowed to have heterogeneous types."
-            "Reconcile the docs for correct format of coalitions."
+            "List may only contain tuples of integers or strings. The tuples are not allowed to "
+            "have heterogeneous types. See the docs for correct format of coalitions. If strings "
+            "are used, the player_name_lookup has to be provided during initialization."
         )
-
+        # check for array input and do validation
         if isinstance(coalitions, np.ndarray):
-
-            # Check that coalition is contained in array
-            if len(coalitions) == 0:
+            if len(coalitions) == 0:  # check that coalition is contained in array
                 raise TypeError("The array of coalitions is empty.")
-
-            # Check if single coalition is correctly given
-            if coalitions.ndim == 1:
+            if coalitions.ndim == 1:  # check if single coalition is correctly given
                 if len(coalitions) < self.n_players or len(coalitions) > self.n_players:
                     raise TypeError(
                         "The array of coalitions is not correctly formatted."
                         f"It should have a length of {self.n_players}"
                     )
                 coalitions = coalitions.reshape((1, self.n_players))
-
-            # Check that all coalitions have the correct number of players
-            if coalitions.shape[1] != self.n_players:
+            if coalitions.shape[1] != self.n_players:  # check if players match
                 raise TypeError(
-                    f"The number of players in the coalitions ({coalitions.shape[1]}) does not match "
+                    f"Number of players in the coalitions ({coalitions.shape[1]}) does not match "
                     f"the number of players in the game ({self.n_players})."
                 )
-
             # TODO maybe remove this, as it might increase runtime unnecessarily
-            # Check that values of numpy array are either 0 or 1
+            # check that values of numpy array are either 0 or 1
             if not np.all(np.logical_or(coalitions == 0, coalitions == 1)):
                 raise TypeError("The values in the array of coalitions are not binary.")
-
             return coalitions
-
-        # We now assume to work with list of tuples
+        # try for list of tuples
         if isinstance(coalitions, tuple):
-            # if by any chance a tuple was given wrap into a list
             coalitions = [coalitions]
-
         try:
             # convert list of tuples to one-hot encoding
             coalitions = transform_coalitions_to_array(coalitions, self.n_players)
-
             return coalitions
-        except Exception as err:
-            # It may either be the tuples contain strings or wrong format
-            if self.player_name_lookup is not None:
-                # We now assume the tuples to contain strings
-                try:
-                    coalitions = [
-                        (
-                            tuple(self.player_name_lookup[player] for player in coalition)
-                            if coalition != tuple()
-                            else tuple()
-                        )
-                        for coalition in coalitions
-                    ]
-                    coalitions = transform_coalitions_to_array(coalitions, self.n_players)
-
-                    return coalitions
-                except Exception as err:
-                    raise TypeError(error_message) from err
-
-            raise TypeError(error_message) from err
+        except (IndexError, TypeError):
+            pass
+        # assuming str input
+        if self.player_name_lookup is None:
+            raise ValueError("Player names are not provided. Cannot convert string to integer.")
+        try:
+            coalitions_from_str = []
+            for coalition in coalitions:
+                coal_indices = sorted([self.player_name_lookup[player] for player in coalition])
+                coalitions_from_str.append(tuple(coal_indices))
+            coalitions = transform_coalitions_to_array(coalitions_from_str, self.n_players)
+            return coalitions
+        except Exception as error:
+            raise TypeError(error_message) from error
 
     def __call__(
         self,
         coalitions: Union[
-            np.ndarray, list[Union[tuple[int], tuple[str]]], tuple[Union[int, str]], str
+            np.ndarray,
+            list[tuple[int, ...]],
+            list[tuple[str, ...]],
+            tuple[int, ...],
+            tuple[str, ...],
         ],
         verbose: bool = False,
     ) -> np.ndarray:
@@ -275,11 +270,8 @@ def __call__(
         Returns:
             The values of the coalitions.
         """
-        # check if coalitions are correct format
-        coalitions = self._check_coalitions(coalitions)
-
+        coalitions = self._check_coalitions(coalitions)  # validate and convert input coalitions
         verbose = verbose or self.verbose
-
         if not self.precomputed and not verbose:
             values = self.value_function(coalitions)
         elif not self.precomputed and verbose:
@@ -291,7 +283,6 @@ def __call__(
                 values[i] = self.value_function(coalition)[0]
         else:
             values = self._lookup_coalitions(coalitions)  # lookup the values present in the storage
-
         return values - self.normalization_value
 
     def _lookup_coalitions(self, coalitions: np.ndarray) -> np.ndarray:

diff --git a/shapiq/games/imputer/base.py b/shapiq/games/imputer/base.py
@@ -29,6 +29,8 @@ class Imputer(Game):
         data: The background data to use for the imputer.
         model: The model to impute missing values for as a callable function.
         sample_size: The number of samples to draw from the background data.
+        random_state: The random state to use for sampling.
+        empty_prediction: The model's prediction on an empty data point (all features missing).
 
     Properties:
         x: The explanation point to use the imputer on.
@@ -54,10 +56,11 @@ def __init__(
             data = data.reshape(1, data.shape[0])
         self.data = data
         self.sample_size = sample_size
+        self.empty_prediction: float = 0.0  # will be overwritten in the subclasses
         self.n_features = self.data.shape[1]
         self._cat_features: list = [] if categorical_features is None else categorical_features
-        self._random_state = random_state
-        self._rng = np.random.default_rng(self._random_state)
+        self.random_state = random_state
+        self._rng = np.random.default_rng(self.random_state)
 
         # fit x
         self._x: Optional[np.ndarray] = None  # will be overwritten @ fit
@@ -98,3 +101,16 @@ def fit(self, x: np.ndarray) -> "Imputer":
         if self._x.ndim == 1:
             self._x = self._x.reshape(1, x.shape[0])
         return self
+
+    def insert_empty_value(self, outputs: np.ndarray, coalitions: np.ndarray) -> np.ndarray:
+        """Inserts the empty value into the outputs.
+
+        Args:
+            outputs: The model's predictions on the imputed data points.
+            coalitions: The coalitions for which the model's predictions were made.
+
+        Returns:
+            The model's predictions with the empty value inserted for the empty coalitions.
+        """
+        outputs[~np.any(coalitions, axis=1)] = self.empty_prediction
+        return outputs
diff --git a/shapiq/games/imputer/baseline_imputer.py b/shapiq/games/imputer/baseline_imputer.py
@@ -67,7 +67,6 @@ def __init__(
         self.init_background(self.data)
 
         # set empty value and normalization
-        self.empty_prediction: float = self._calc_empty_prediction()
         if normalize:
             self.normalization_value = self.empty_prediction
 
@@ -135,16 +134,18 @@ def init_background(self, data: np.ndarray) -> "BaselineImputer":
                     )
                     self._cat_features.append(feature)
             self.baseline_values[0, feature] = summarized_feature
+        self.calc_empty_prediction()  # reset the empty prediction to the new baseline values
         return self
 
-    def _calc_empty_prediction(self) -> float:
+    def calc_empty_prediction(self) -> float:
         """Runs the model on empty data points (all features missing) to get the empty prediction.
 
         Returns:
             The empty prediction.
         """
         empty_predictions = self.predict(self.baseline_values)
         empty_prediction = float(empty_predictions[0])
+        self.empty_prediction = empty_prediction
         if self.normalize:  # reset the normalization value
             self.normalization_value = empty_prediction
         return empty_prediction