Skip to content

Commit

Permalink
Validate feature support vector 🚀 closes #14
Browse files Browse the repository at this point in the history
  • Loading branch information
dunnkers committed Jun 13, 2021
1 parent e838a21 commit dea00ab
Show file tree
Hide file tree
Showing 4 changed files with 124 additions and 21 deletions.
48 changes: 48 additions & 0 deletions fseval/pipelines/rank_and_validate/_support_validator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from dataclasses import dataclass

import numpy as np
from fseval.pipeline.estimator import Estimator
from fseval.types import IncompatibilityError
from omegaconf import MISSING
from sklearn.feature_selection import SelectFromModel

from .._experiment import Experiment
from ._config import RankAndValidatePipeline
from ._subset_validator import SubsetValidator


@dataclass
class SupportValidator(SubsetValidator):
    """Validates a feature support vector, i.e. a feature subset.

    Uses the boolean mask the fitted ranker exposes as `feature_support_`
    to select columns of `X`, then scores the validator on that subset.

    Attributes:
        bootstrap_state: random state of the bootstrap resample this
            validation belongs to; only used to build a unique cache filename.
        n_features_to_select: fixed to -1 to disable subset-size stepping —
            the subset size is dictated by the support vector instead.
    """

    bootstrap_state: int = MISSING
    n_features_to_select: int = -1  # disable: size comes from the support vector

    def _prepare_data(self, X, y):
        """Select the columns of `X` flagged by the ranker's support vector.

        Returns:
            The `(X, y)` pair with `X` reduced to the supported features.

        Raises:
            IncompatibilityError: when the fitted ranker does not expose a
                `feature_support_` numpy ndarray.
        """
        feature_support = getattr(self.ranker, "feature_support_", None)
        # Raise instead of `assert`: asserts are stripped under `python -O`,
        # and IncompatibilityError is the project's type for exactly this case
        # (it was imported at the top of this file but never used).
        if feature_support is None:
            raise IncompatibilityError(
                "ranker must have a `feature_support_` attribute"
            )
        if not isinstance(feature_support, np.ndarray):
            raise IncompatibilityError(
                "feature support array must be a numpy ndarray"
            )

        # make sure support vector is boolean-valued
        feature_support = feature_support.astype(bool)
        # plain int so the score record serializes cleanly (np.sum yields np.int64)
        self.subset_size = int(np.sum(feature_support))

        # select feature subset
        X = X[:, feature_support]

        return X, y

    @property
    def _cache_filename(self):
        """Cache filename, made unique per bootstrap resample."""
        override = f"bootstrap_state={self.bootstrap_state}"
        filename = f"support[{override}].pickle"

        return filename

    def score(self, X, y, **kwargs):
        """Score the validator on the support subset.

        NOTE(review): `super(SubsetValidator, self)` deliberately skips
        `SubsetValidator.score` and dispatches to its parent in the MRO —
        presumably to bypass subset-size bookkeeping done there; confirm
        against `SubsetValidator.score` before changing.
        """
        score = super(SubsetValidator, self).score(X, y)
        score["subset_size"] = self.subset_size
        score["fit_time"] = self.validator.fit_time_
        return score
69 changes: 54 additions & 15 deletions fseval/pipelines/rank_and_validate/rank_and_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,17 @@

import numpy as np
import pandas as pd
from fseval.callbacks import WandbCallback
from fseval.types import TerminalColor as tc
from omegaconf import MISSING
from sklearn.base import clone
from tqdm import tqdm

from fseval.callbacks import WandbCallback
from fseval.types import TerminalColor as tc

from .._experiment import Experiment
from ._config import RankAndValidatePipeline
from ._dataset_validator import DatasetValidator
from ._ranking_validator import RankingValidator
from ._support_validator import SupportValidator


@dataclass
Expand All @@ -29,45 +29,70 @@ class RankAndValidate(Experiment, RankAndValidatePipeline):
logger: Logger = getLogger(__name__)

def _get_estimator(self):
estimators = []

# make sure to clone the validator for support- and dataset validation.
config = self._get_config()
validator = config.pop("validator")

## first fit ranker, then run all validations
# instantiate ranking validator. pass bootstrap state for the cache filename
self.ranking_validator = RankingValidator(
**self._get_config(), bootstrap_state=self.bootstrap_state
)
estimators.append(self.ranking_validator)

# validate feature support - if available
if self.ranker.estimates_feature_support:
self.support_validator = SupportValidator(
**config,
validator=clone(validator),
bootstrap_state=self.bootstrap_state,
)
estimators.append(self.support_validator)

# instantiate dataset validator.
self.dataset_validator = DatasetValidator(
**self._get_config(), bootstrap_state=self.bootstrap_state
**config, validator=clone(validator), bootstrap_state=self.bootstrap_state
)
estimators.append(self.dataset_validator)

# first fit ranker, then run all validations
return [
self.ranking_validator,
self.dataset_validator,
]
return estimators

def _prepare_data(self, X, y):
# resample dataset: perform a bootstrap
self.resample.random_state = self.bootstrap_state
X, y = self.resample.transform(X, y)

return X, y

def score(self, X, y, **kwargs):
scores = pd.DataFrame()

# ranking scores
ranking_score = self.ranking_validator.score(
X, y, feature_importances=kwargs.get("feature_importances")
)
ranking_score["group"] = "ranking"
scores = scores.append(ranking_score)

# feature support scores - if available
if self.ranker.estimates_feature_support:
support_score = self.support_validator.score(X, y)
support_score["group"] = "support"
scores = scores.append(support_score)

# validation scores
validation_score = self.dataset_validator.score(X, y)
validation_score["group"] = "validation"

scores = pd.DataFrame()
scores = scores.append(ranking_score)
scores = scores.append(validation_score)
scores["bootstrap_state"] = self.bootstrap_state

# attach bootstrap and finish
scores["bootstrap_state"] = self.bootstrap_state
self.logger.info(
f"scored bootstrap_state={self.bootstrap_state} " + tc.green("✓")
)

return scores


Expand Down Expand Up @@ -102,10 +127,15 @@ def _get_ranker_attribute_table(self, attribute: str, attribute_name: str):
attribute_table = pd.DataFrame()

for rank_and_validate in self.estimators:
ranker = rank_and_validate.ranker
attribute_value = getattr(ranker, attribute)
# ensure dataset loaded
p = self.dataset.p
assert p is not None, "dataset must be loaded"

# get attribute from ranker
ranker = rank_and_validate.ranker
attribute_value = getattr(ranker, attribute)

# construct dataframe
attribute_data = {
"bootstrap_state": rank_and_validate.bootstrap_state,
"feature_index": np.arange(1, p + 1),
Expand All @@ -122,6 +152,10 @@ def score(self, X, y, **kwargs):
ranking_scores = scores[scores["group"] == "ranking"].dropna(axis=1)
ranking_scores = ranking_scores.drop(columns=["group"])
ranking_scores = ranking_scores.set_index("bootstrap_state")

support_scores = scores[scores["group"] == "support"].dropna(axis=1)
support_scores = support_scores.drop(columns=["group"])

validation_scores = scores[scores["group"] == "validation"].dropna(axis=1)
validation_scores = validation_scores.drop(columns=["group"])

Expand Down Expand Up @@ -185,6 +219,7 @@ def score(self, X, y, **kwargs):
importances_table = self._get_ranker_attribute_table(
"feature_importances_", "feature_importances"
)
# TODO normalize feature importances
wandb_callback.upload_table(importances_table, "feature_importances")

# feature support
Expand All @@ -208,6 +243,10 @@ def score(self, X, y, **kwargs):
if wandb_callback and self.upload_validation_scores:
self.logger.info(f"Uploading validation scores...")

## upload support scores
if self.ranker.estimates_feature_support:
wandb_callback.upload_table(support_scores, "support_scores")

## upload validation scores
wandb_callback.upload_table(validation_scores, "validation_scores")

Expand Down
17 changes: 14 additions & 3 deletions fseval/storage_providers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,23 @@
from typing import Any, Callable
from dataclasses import dataclass
from typing import Any, Callable, Optional

from fseval.config import StorageProviderConfig
from fseval.types import AbstractStorageProvider

from .local import LocalStorageProvider
from .wandb import WandbStorageProvider


class MockStorageProvider(StorageProviderConfig):
@dataclass
class MockStorageProvider(AbstractStorageProvider):
load_dir: Optional[str] = None
save_dir: Optional[str] = None

def get_load_dir(self) -> str:
...

def get_save_dir(self) -> str:
...

def save(self, filename: str, writer: Callable, mode: str = "w"):
...

Expand Down
11 changes: 8 additions & 3 deletions fseval/storage_providers/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from pickle import dump, load
from typing import Any, Callable, Optional


from fseval.types import AbstractStorageProvider, TerminalColor


Expand All @@ -16,10 +15,12 @@ class LocalStorageProvider(AbstractStorageProvider):
logger: Logger = getLogger(__name__)

def get_load_dir(self) -> str:
return self.load_dir or "."
load_dir = self.load_dir or "."
return load_dir

def get_save_dir(self) -> str:
return self.save_dir or "."
save_dir = self.save_dir or "."
return save_dir

def save(self, filename: str, writer: Callable, mode: str = "w"):
filedir = self.get_save_dir()
Expand All @@ -33,6 +34,7 @@ def save(self, filename: str, writer: Callable, mode: str = "w"):
+ TerminalColor.yellow("local disk")
+ TerminalColor.green(" ✓")
)
print(TerminalColor.blue(filepath))

def save_pickle(self, filename: str, obj: Any):
self.save(filename, lambda file: dump(obj, file), mode="wb")
Expand All @@ -41,6 +43,9 @@ def restore(self, filename: str, reader: Callable, mode: str = "r") -> Any:
filedir = self.get_load_dir()
filepath = os.path.join(filedir, filename)

self.logger.info("attempting to restore:")
print(TerminalColor.blue(filepath))

if not os.path.exists(filepath):
return None

Expand Down

0 comments on commit dea00ab

Please sign in to comment.