Also score support and ranking 🚀
dunnkers committed Jun 7, 2021
1 parent 07b62b7 commit 9ee15a5
Showing 3 changed files with 185 additions and 32 deletions.
6 changes: 4 additions & 2 deletions fseval/pipelines/_experiment.py
@@ -4,10 +4,9 @@
from typing import List

import pandas as pd
from humanfriendly import format_timespan

from fseval.pipeline.estimator import Estimator
from fseval.types import AbstractEstimator, TerminalColor
from humanfriendly import format_timespan


@dataclass
@@ -31,6 +30,9 @@ def _logger(self, estimator):
return lambda text: getLogger(type(estimator).__name__).info(text)

def _step_text(self, step_name, step_number, estimator):
"""Provides a console logging string for logging during an experiment phase,
like in `fit` or `score`. Adds coloring and fit times to stdout."""

# step text variables
step = step_number + 1
n_steps = len(self.estimators)
111 changes: 88 additions & 23 deletions fseval/pipelines/rank_and_validate/_ranking_validator.py
@@ -3,10 +3,9 @@

import numpy as np
import pandas as pd
from omegaconf import MISSING
from sklearn.metrics import log_loss, r2_score

from fseval.types import IncompatibilityError
from omegaconf import MISSING
from sklearn.metrics import accuracy_score, log_loss, r2_score

from .._experiment import Experiment
from ._config import RankAndValidatePipeline
@@ -46,39 +45,105 @@ def fit(self, X, y):
super(RankingValidator, self).fit(X, y)
self.storage_provider.save_pickle(filename, self.ranker.estimator)

def score(self, X, y):
"""Scores a feature ranker, if a ground-truth on the desired dataset
feature importances is available. If this is the case, the estimated normalized
feature importances are compared to the desired ones using two metrics:
log loss and the R^2 score. Whilst the log loss converts the ground-truth
desired feature rankings to a binary value, 0/1, the R^2 score always works."""

score = {
"fit_time": self.ranker.fit_time_,
"bootstrap_state": self.bootstrap_state,
}
def _scores_to_ranking(self, scores):
"""Converts a scoring vector to a ranking vector, or standardizes an existing
feature ranking vector. e.g.:
```
[0.8, 0.1, 0.9, 0.0]
```
is converted to
```
[3, 2, 4, 1]
```
"""

_, inverse, counts = np.unique(scores, return_inverse=True, return_counts=True)
ranking_inverse = np.zeros_like(counts)
ranking_inverse[1:] = counts[:-1].cumsum()
ranking = ranking_inverse[inverse] + 1
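# note: tied scores receive the same (lowest) rank, e.g. [0.5, 0.5, 0.1] maps to [2, 2, 1].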

return ranking

def _score_with_feature_importances(self, score):
"""Scores this feature ranker with the available dataset ground-truth relevant
features, which are to be known apriori. Supports three types of feature rankings:
- a real-valued feature importance vector
- a boolean-valued feature support vector
- an integer-valued feature ranking vector."""

X_importances = self.dataset.feature_importances
if X_importances is not None and self.ranker.estimates_feature_importances:
assert np.ndim(X_importances) == 1, "instance-based not supported yet."

# predicted feature importances: normalized ranker scores.
### Feature importances
if self.ranker.estimates_feature_importances:
# predicted feature importances, normalized.
y_pred = np.asarray(self.ranker.feature_importances_)
y_pred = y_pred / sum(y_pred)

# r2 score
y_true = X_importances
score["r2_score"] = r2_score(y_true, y_pred)
score["importance.r2_score"] = r2_score(y_true, y_pred)

# log loss
y_true = X_importances > 0
score["log_loss"] = log_loss(y_true, y_pred, labels=[0, 1])
score["importance.log_loss"] = log_loss(y_true, y_pred, labels=[0, 1])

### Feature support
if self.ranker.estimates_feature_support:
# predicted feature support
y_pred = np.asarray(self.ranker.feature_support_, dtype=bool)

# accuracy
y_true = X_importances > 0
score["support.accuracy"] = accuracy_score(y_true, y_pred)

### Feature ranking
# grab the ranking through either (1) `feature_ranking_` or (2) `feature_importances_`
ranking = None
if self.ranker.estimates_feature_ranking:
ranking = self.ranker.feature_ranking_
elif self.ranker.estimates_feature_importances:
ranking = self.ranker.feature_importances_

# compute ranking r2 score
if ranking is not None:
# predicted feature ranking, re-ordered and normalized.
y_pred = self._scores_to_ranking(ranking)
y_pred = y_pred / sum(y_pred)

# convert ground-truth to a ranking as well.
y_true = self._scores_to_ranking(X_importances)
y_true = y_true / sum(y_true)
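# both rankings now sum to 1, so the r2 score compares them on a common scale.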

# in the r2 score, only consider **relevant** features, not irrelevant ones. in
# this way, when `X_importances = [0, 2, 4, 0, 0]` we do not get misleadingly
# high scores just because the ranking also orders the irrelevant features correctly.
sample_weight = np.ones_like(X_importances)
sample_weight[X_importances == 0] = 0.0
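# e.g. for `X_importances = [0, 2, 4, 0, 0]`, sample_weight becomes [0, 1, 1, 0, 0].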

# r2 score
score["ranking.r2_score"] = r2_score(
y_true, y_pred, sample_weight=sample_weight
)

def score(self, X, y):
"""Scores a feature ranker, if a ground-truth on the desired dataset
feature importances is available. If this is the case, the estimated normalized
feature importances are compared to the desired ones using two metrics:
log loss and the R^2 score. Whilst the log loss converts the ground-truth
desired feature rankings to a binary value, 0/1, the R^2 score always works."""

score = {
"fit_time": self.ranker.fit_time_,
"bootstrap_state": self.bootstrap_state,
}

if X_importances is not None and self.ranker.estimates_feature_support:
...
if self.dataset.feature_importances is not None:
assert (
np.ndim(self.dataset.feature_importances) == 1
), "instance-based not supported yet."

if X_importances is not None and self.ranker.estimates_feature_ranking:
...
self._score_with_feature_importances(score)

# put in a dataframe so it can easily be merged with other pipeline scores
scores = pd.DataFrame([score])
100 changes: 93 additions & 7 deletions tests/integration/pipelines/test_rank_and_validate.py
@@ -7,32 +7,96 @@
from fseval.pipelines._callback_collection import CallbackCollection
from fseval.pipelines.rank_and_validate import RankAndValidateConfig
from fseval.storage_providers.mock import MockStorageProvider
from fseval.types import AbstractStorageProvider, Callback, IncompatibilityError, Task
from fseval.types import (
AbstractEstimator,
AbstractStorageProvider,
Callback,
IncompatibilityError,
Task,
)
from hydra.utils import instantiate
from omegaconf import OmegaConf
from sklearn.model_selection import ShuffleSplit


class MockRanker(AbstractEstimator):
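"""A minimal stub estimator returning deterministic pseudo-random values for its
score, feature importances, support and ranking, so the pipeline's scoring paths
can be exercised without training a real model."""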
def __init__(self, random_state=None):
self.random_state = random_state

def _get_random_state(self):
return np.random.RandomState(self.random_state)

def fit(self, X, y):
n, p = np.asarray(X).shape
self.n_features = p

def transform(self, X, y):
...

def fit_transform(self, X, y):
...

def score(self, X, y):
return self._get_random_state().rand()

@property
def feature_importances_(self):
return self._get_random_state().rand(self.n_features)

@property
def support_(self):
return self._get_random_state().rand(self.n_features)

@property
def ranking_(self):
return self._get_random_state().rand(self.n_features)


@pytest.fixture
def pipeline_cfg():
estimator = dict(_target_="sklearn.tree.DecisionTreeClassifier", random_state=0)
def classifier():
estimator = dict(
_target_="tests.integration.pipelines.test_rank_and_validate.MockRanker",
random_state=0,
)
classifier: EstimatorConfig = EstimatorConfig(estimator=estimator)

resample: ResampleConfig = ResampleConfig(name="shuffle")
return classifier


@pytest.fixture
def ranker(classifier):
ranker: TaskedEstimatorConfig = TaskedEstimatorConfig(
name="Decision Tree",
task=Task.classification,
classifier=classifier,
is_multioutput_dataset=False,
estimates_feature_importances=True,
estimates_feature_support=True,
estimates_feature_ranking=True,
)
return ranker


@pytest.fixture
def validator(classifier):
validator: TaskedEstimatorConfig = TaskedEstimatorConfig(
name="Decision Tree",
task=Task.classification,
classifier=classifier,
is_multioutput_dataset=False,
estimates_target=True,
)
return validator


@pytest.fixture
def resample():
resample: ResampleConfig = ResampleConfig(name="shuffle")
return resample


@pytest.fixture
def pipeline_cfg(classifier, ranker, validator, resample):
n_bootstraps: int = 2

config = RankAndValidateConfig(
@@ -103,7 +167,7 @@ def test_without_ranker_gt(
score = pipeline.score(X_test, y_test)

assert score["best"]["validator"]["fit_time"] > 0
assert score["best"]["validator"]["score"] == 1.0
assert score["best"]["validator"]["score"] >= 0.0


def test_with_ranker_gt(pipeline_cfg, callbacks, dataset_with_gt, cv, storage_provider):
@@ -117,8 +181,30 @@ def test_with_ranker_gt(pipeline_cfg, callbacks, dataset_with_gt, cv, storage_pr
assert score["best"]["validator"]["fit_time"] > 0
assert score["best"]["ranker"]["fit_time"] > 0

assert score["best"]["ranker"]["r2_score"] <= 1.0
assert score["best"]["validator"]["score"] == 1.0
assert score["best"]["ranker"]["importance.r2_score"] <= 1.0
assert score["best"]["ranker"]["importance.log_loss"] >= 0
assert score["best"]["ranker"]["support.accuracy"] >= 0.0
assert score["best"]["ranker"]["support.accuracy"] <= 1.0
assert score["best"]["ranker"]["ranking.r2_score"] <= 1.0
assert score["best"]["validator"]["score"] >= 0.0


def test_with_ranker_gt_no_importances_substitution(
pipeline_cfg, callbacks, dataset_with_gt, cv, storage_provider
):
"""When no `feature_ranking` available, `feature_importances` should substitute
for the ranking."""

dataset = dataset_with_gt
pipeline_cfg.ranker.estimates_feature_ranking = False

pipeline = instantiate(pipeline_cfg, callbacks, dataset, cv, storage_provider)
X_train, X_test, y_train, y_test = cv.train_test_split(dataset.X, dataset.y)
pipeline.fit(X_train, y_train)
score = pipeline.score(X_test, y_test)

assert score["best"]["ranker"]["ranking.r2_score"] <= 1.0
assert score["best"]["validator"]["score"] >= 0.0


def test_validator_incompatibility_check(
