Adds BaselineImputer and Reworks MarginalImputer (#262)

* started work on #107 * refactors imputer tests * reworks imputers * reworks imputers * finishes baseline imputer and adds joint-marginal-dist. to MarginalImputer * adds test to joint marginal prediction and closes #261 * updated documentation
mmschlk · Oct 30, 2024 · 320518a · 320518a
1 parent 15532ff
commit 320518a
Show file tree

Hide file tree

Showing 16 changed files with 612 additions and 208 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,7 +12,9 @@
 - renames explanation graph to `si_graph`
 - `get_n_order` now has optional lower/upper limits for the order
 - computing metrics now tries to resolve not-matching interaction indices and will throw a warning instead of a ValueError [#179](https://github.com/mmschlk/shapiq/issues/179)
-- ...
+- removed the `sample_replacements` parameter from `MarginalImputer` which is now handled by the `BaselineImputer`. Added a DeprecationWarning for the parameter, which will be removed in the next release.
+- adds `BaselineImputer` [#107](https://github.com/mmschlk/shapiq/issues/107)
+- adds `joint_marginal_distribution` parameter to `MarginalImputer` [#261](https://github.com/mmschlk/shapiq/issues/261)
 
 ### v1.0.1 (2024-06-05)
 

diff --git a/shapiq/__init__.py b/shapiq/__init__.py
@@ -45,7 +45,8 @@
 from .explainer import Explainer, TabularExplainer, TreeExplainer
 
 # game classes
-from .games import ConditionalImputer, Game, MarginalImputer
+# imputer classes
+from .games import BaselineImputer, ConditionalImputer, Game, MarginalImputer
 
 # base classes
 from .interaction_values import InteractionValues
@@ -96,6 +97,7 @@
     "TreeExplainer",
     # imputers
     "MarginalImputer",
+    "BaselineImputer",
     "ConditionalImputer",
     # plots
     "network_plot",

diff --git a/shapiq/games/__init__.py b/shapiq/games/__init__.py
@@ -1,9 +1,9 @@
 """Game objects for the shapiq package."""
 
-# from . import benchmark
+# from . import benchmark  # not imported here to avoid circular imports and long import times
 from .base import Game
-from .imputer import ConditionalImputer, MarginalImputer
+from .imputer import BaselineImputer, ConditionalImputer, MarginalImputer
 
-__all__ = ["Game", "MarginalImputer", "ConditionalImputer"]  # + benchmark.__all__
+__all__ = ["Game", "MarginalImputer", "ConditionalImputer", "BaselineImputer"]
 
 # Path: shapiq/games/__init__.py
diff --git a/shapiq/games/base.py b/shapiq/games/base.py
@@ -187,8 +187,8 @@ def _check_coalitions(
             >>> coalitions = [("Alice", "Bob"), ("Bob", "Charlie")]
             Wrong format:
             >>> coalitions = [1, 0, 0, 0]
-            >>> coalitions = [(1,"Alice")]
-            >>> coalitions = np.array([1,-1,2])
+            >>> coalitions = [(1, "Alice")]
+            >>> coalitions = np.array([1, -1, 2])
 
 
         """
@@ -220,6 +220,7 @@ def _check_coalitions(
                     f"the number of players in the game ({self.n_players})."
                 )
 
+            # TODO maybe remove this, as it might increase runtime unnecessarily
             # Check that values of numpy array are either 0 or 1
             if not np.all(np.logical_or(coalitions == 0, coalitions == 1)):
                 raise TypeError("The values in the array of coalitions are not binary.")

diff --git a/shapiq/games/imputer/__init__.py b/shapiq/games/imputer/__init__.py
@@ -1,6 +1,7 @@
 """Imputer objects for the shapiq package."""
 
+from .baseline_imputer import BaselineImputer
 from .conditional_imputer import ConditionalImputer
 from .marginal_imputer import MarginalImputer
 
-__all__ = ["MarginalImputer", "ConditionalImputer"]
+__all__ = ["MarginalImputer", "ConditionalImputer", "BaselineImputer"]
diff --git a/shapiq/games/imputer/base.py b/shapiq/games/imputer/base.py
@@ -1,4 +1,4 @@
-"""Base class for all imputers."""
+"""Base class for all Imputers."""
 
 from abc import abstractmethod
 from typing import Optional
@@ -10,39 +10,91 @@
 
 
 class Imputer(Game):
-    """Base class for imputers.
+    """Base class for Imputers.
 
     Args:
         model: The model to explain as a callable function expecting a data points as input and
             returning the model's predictions.
         data: The background data to use for the explainer as a 2-dimensional array
             with shape ``(n_samples, n_features)``.
+        x: The explanation point to use the imputer on either as a 2-dimensional array with
+            shape ``(1, n_features)`` or as a vector with shape ``(n_features,)``.
+        sample_size: The number of samples to draw from the background data. Defaults to ``100`` but
+            is usually overwritten in the subclasses.
         categorical_features: A list of indices of the categorical features in the background data.
         random_state: The random state to use for sampling. Defaults to ``None``.
+
+    Attributes:
+        n_features: The number of features in the data (equals the number of players in the game).
+        data: The background data to use for the imputer.
+        model: The model to impute missing values for as a callable function.
+        sample_size: The number of samples to draw from the background data.
+
+    Properties:
+        x: The explanation point to use the imputer on.
     """
 
     @abstractmethod
     def __init__(
         self,
         model,
         data: np.ndarray,
+        x: Optional[np.ndarray] = None,
+        sample_size: int = 100,
         categorical_features: list[int] = None,
         random_state: Optional[int] = None,
     ) -> None:
         if callable(model):
             self._predict_function = utils.predict_callable
-        else:  # shapiq.Explainer
+        else:  # shapiq.Explainer adds a predict function to the model to make it callable
             self._predict_function = model._predict_function
         self.model = model
+        # check if data is a vector
+        if data.ndim == 1:
+            data = data.reshape(1, data.shape[0])
         self.data = data
-        self._n_features = self.data.shape[1]
+        self.sample_size = sample_size
+        self.n_features = self.data.shape[1]
         self._cat_features: list = [] if categorical_features is None else categorical_features
         self._random_state = random_state
         self._rng = np.random.default_rng(self._random_state)
 
-        # the normalization_value needs to be set in the subclass
-        super().__init__(n_players=self._n_features, normalize=False)
+        # fit x
+        self._x: Optional[np.ndarray] = None  # will be overwritten @ fit
+        if x is not None:
+            self.fit(x)
+
+        # init the game
+        # developer note: the normalization_value needs to be set in the subclass
+        super().__init__(n_players=self.n_features, normalize=False)
+
+    @property
+    def x(self) -> Optional[np.ndarray]:
+        """Returns the explanation point if it is set."""
+        return self._x.copy() if self._x is not None else None
 
     def predict(self, x: np.ndarray) -> np.ndarray:
-        """Provides a unified prediction interface."""
+        """Provides a unified prediction interface.
+
+        Args:
+            x: The data point to predict the model's output for.
+
+        Returns:
+            The model's prediction for the given data point as a vector.
+        """
         return self._predict_function(self.model, x)
+
+    def fit(self, x: np.ndarray) -> "Imputer":
+        """Fits the imputer to the explanation point.
+
+        Args:
+            x: The explanation point to use the imputer on either as a 2-dimensional array with
+                shape ``(1, n_features)`` or as a vector with shape ``(n_features,)``.
+
+        Returns:
+            The fitted imputer.
+        """
+        self._x = x.copy()
+        if self._x.ndim == 1:
+            self._x = self._x.reshape(1, x.shape[0])
+        return self
diff --git a/shapiq/games/imputer/baseline_imputer.py b/shapiq/games/imputer/baseline_imputer.py
@@ -0,0 +1,150 @@
+"""Implementation of the baseline imputer."""
+
+import warnings
+from typing import Optional
+
+import numpy as np
+
+from shapiq.games.imputer.base import Imputer
+
+
+class BaselineImputer(Imputer):
+    """The baseline imputer for the shapiq package.
+
+    The baseline imputer is used to impute the missing values of a data point by using predefined
+    values (baseline values). If no baseline values are given, the imputer uses the mean (for
+    numerical features) or the mode (for categorical features) of the background data.
+
+    Args:
+        model: The model to explain as a callable function expecting data points as input and
+            returning the model's predictions.
+        data: The background data to use for the explainer as either a vector of baseline values
+            or a two-dimensional array with shape ``(n_samples, n_features)``. If data is a matrix,
+            the baseline values are calculated from the data.
+        x: The explanation point to use the imputer on.
+        categorical_features: A list of indices of the categorical features in the background data.
+            If no categorical features are given, all features are assumed to be numerical or in
+            string format (where ``np.mean`` fails). Defaults to ``None``.
+        normalize: A flag to normalize the game values. If ``True``, then the game values are
+            normalized and centered to be zero for the empty set of features. Defaults to ``True``.
+        random_state: The random state to use for sampling. Defaults to ``None``.
+
+    Attributes:
+        baseline_values: The baseline values to use for imputation.
+        empty_prediction: The model's prediction on an empty data point (all features missing).
+
+    Examples:
+        >>> model = lambda x: np.sum(x, axis=1)  # some dummy model
+        >>> data = np.random.rand(1000, 4)  # some background data
+        >>> x_to_impute = np.array([[1, 1, 1, 1]])  # some data point to impute
+        >>> imputer = BaselineImputer(model=model, data=data, x=x_to_impute)
+        >>> # get the baseline values
+        >>> imputer.baseline_values
+        array([[0.5, 0.5, 0.5, 0.5]])  # computed from data
+        >>> # set new baseline values
+        >>> baseline_vector = np.array([0, 0, 0, 0])
+        >>> imputer.init_background(baseline_vector)
+        >>> imputer.baseline_values
+        array([[0, 0, 0, 0]])  # given as input
+        >>> # get the model prediction with missing values
+        >>> imputer(np.array([[True, False, True, False]]))
+        np.array([2.])  # model prediction with the last baseline value
+    """
+
+    def __init__(
+        self,
+        model,
+        data: np.ndarray,
+        x: Optional[np.ndarray] = None,
+        categorical_features: list[int] = None,
+        normalize: bool = True,
+        random_state: Optional[int] = None,
+    ) -> None:
+        super().__init__(model, data, x, 1, categorical_features, random_state)
+
+        # setup attributes
+        self.baseline_values: np.ndarray = np.zeros((1, self.n_features))  # will be overwritten
+        self.init_background(self.data)
+
+        # set empty value and normalization
+        self.empty_prediction: float = self._calc_empty_prediction()
+        if normalize:
+            self.normalization_value = self.empty_prediction
+
+    def value_function(self, coalitions: np.ndarray) -> np.ndarray:
+        """Imputes the missing values of a data point and calls the model.
+
+        Args:
+            coalitions: A boolean array indicating which features are present (``True``) and which are
+                missing (``False``). The shape of the array must be ``(n_subsets, n_features)``.
+
+        Returns:
+            The model's predictions on the imputed data points. The shape of the array is
+               ``(n_subsets, n_outputs)``.
+        """
+        n_coalitions = coalitions.shape[0]
+        data = np.tile(np.copy(self._x), (n_coalitions, 1))
+        for i in range(n_coalitions):
+            data[i, ~coalitions[i]] = self.baseline_values[0, ~coalitions[i]]
+        outputs = self.predict(data)
+        return outputs
+
+    def init_background(self, data: np.ndarray) -> "BaselineImputer":
+        """Initializes the imputer to the background data.
+
+        Args:
+            data: The background data to use for the imputer. Either a vector of baseline values
+                of shape ``(n_features,)`` or a matrix of shape ``(n_samples, n_features)``.
+                If the data is a matrix, the baseline values are calculated from the data.
+
+        Returns:
+            The initialized imputer.
+
+        Examples:
+            >>> import numpy as np
+            >>> from shapiq.games.imputer import BaselineImputer
+            >>> data = np.array([[1, 2, "a"], [2, 3, "a"], [2, 4, "b"]], dtype=object)
+            >>> x = np.array([1, 2, 3])
+            >>> imputer = BaselineImputer(model=lambda x: np.sum(x, axis=1), data=data, x=x)
+            >>> imputer.baseline_values
+            array([[1.66, 3, 'a']], dtype=object)  # computed from data
+            >>> baseline_vector = np.array([0, 0, 0])
+            >>> imputer.init_background(baseline_vector)
+            >>> imputer.baseline_values
+            array([[0, 0, 0]])  # given as input
+        """
+        if data.ndim == 1 or data.shape[0] == 1:  # data is a vector -> use as baseline values
+            self.baseline_values = data.reshape(1, self.n_features)
+            return self
+        # data is a matrix -> calculate baseline values as mean or mode
+        self.baseline_values = np.zeros((1, self.n_features), dtype=object)
+        for feature in range(self.n_features):
+            feature_column = data[:, feature]
+            if feature in self._cat_features:  # get mode for categorical features
+                values, counts = np.unique(feature_column, return_counts=True)
+                summarized_feature = values[np.argmax(counts)]
+            else:
+                try:  # try to use mean for numerical features
+                    summarized_feature = np.mean(feature_column)
+                except TypeError:  # fallback to mode for potentially string features
+                    values, counts = np.unique(feature_column, return_counts=True)
+                    summarized_feature = values[np.argmax(counts)]
+                    # add feature to categorical features
+                    warnings.warn(
+                        f"Feature {feature} is not numerical. Adding it to categorical features."
+                    )
+                    self._cat_features.append(feature)
+            self.baseline_values[0, feature] = summarized_feature
+        return self
+
+    def _calc_empty_prediction(self) -> float:
+        """Runs the model on empty data points (all features missing) to get the empty prediction.
+
+        Returns:
+            The empty prediction.
+        """
+        empty_predictions = self.predict(self.baseline_values)
+        empty_prediction = float(empty_predictions[0])
+        if self.normalize:  # reset the normalization value
+            self.normalization_value = empty_prediction
+        return empty_prediction