giotto-ai#179 Comments, model keys refactoring
Sburyachenko committed May 25, 2020
1 parent 99cd60a commit 976e48f
Showing 2 changed files with 78 additions and 56 deletions.
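In short, the commit reworks ``CVPipeline`` so that its candidate models live in a dict keyed by a human-readable string of the form ``ClassName: {params}`` instead of a plain list; cross-validation results, model selection, and refitting all address models through these keys. A minimal sketch of the keying scheme, assuming only scikit-learn's ``ParameterGrid`` (``DummyModel`` is a hypothetical stand-in for a gtime time series model, not part of the diff):

    from sklearn.model_selection import ParameterGrid

    class DummyModel:  # hypothetical stand-in for a gtime time series model
        def __init__(self, **params):
            self.params = params

    models_sets = {DummyModel: {'horizon': [5, 7], 'p': [2]}}  # illustrative grid
    model_list = {}
    for model, param_grid in models_sets.items():
        for params in ParameterGrid(param_grid):
            # key format matches the diff: "ClassName: {'param': value, ...}"
            model_list[model.__name__ + ': ' + str(params)] = model(**params)

    print(list(model_list))
    # ["DummyModel: {'horizon': 5, 'p': 2}", "DummyModel: {'horizon': 7, 'p': 2}"]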
122 changes: 72 additions & 50 deletions gtime/model_selection/cv_pipeline.py
@@ -11,31 +11,8 @@
 )
 from sklearn.utils.validation import check_is_fitted
 
-result_cols = ["Fit time", "Train score", "Test score"]
-
-
-def _default_selection(results: pd.DataFrame) -> RegressorMixin:
-    """
-    Selects a model with lowest test score according to the first of the provided metrics
-    Parameters
-    ----------
-    results: pd.DataFrame - cross-validation results
-    Returns
-    -------
-    best_model: RegressorMixin - selected model
-
-    """
-    if len(results) == 0:
-        return None
-    first_metric = results['Metric'].iloc[0]
-    scores = results[results['Metric'] == first_metric]['Test score']
-    best_model_index = scores.idxmin()
-    return results.loc[best_model_index, 'Model']
-
-
-def _models_are_equal(models, target):
-    return [(model.model == target.model) & (model.features == target.features) & (model.horizon == target.horizon) for model in models]
+result_cols = ["Fit time", "Train score", "Test score"]
 
 
 class CVPipeline(BaseEstimator, RegressorMixin):
@@ -62,27 +39,48 @@ def __init__(
     ):
 
         self.models_sets = models_sets
-        model_list = []
+        model_list = {}
         for model, param_grid in models_sets.items():
             param_iterator = ParameterGrid(param_grid)
             for params in param_iterator:
-                model_list.append(model(**params))
+                model_index = model.__name__ + ': ' + str(params)
+                model_list[model_index] = model(**params)
         self.model_list = model_list
         self.models = models_sets
         self.metrics = mse if metrics is None else metrics
-        self.selection = _default_selection if selection is None else selection
+        self.selection = self._default_selection if selection is None else selection
         self.cv = blocking_time_series_split if blocking else time_series_split
         self.n_splits = n_splits
         result_idx = pd.MultiIndex.from_product([self.model_list, self.metrics.keys()])
         result_idx.names = ['Model', 'Metric']
         self.cv_results_ = pd.DataFrame(
             0.0, index=result_idx, columns=result_cols
         ).reset_index()
 
+    @staticmethod
+    def _default_selection(results: pd.DataFrame) -> RegressorMixin:
+        """
+        Selects the model with the lowest test score according to the first of the provided metrics
+
+        Parameters
+        ----------
+        results: pd.DataFrame - cross-validation results
+
+        Returns
+        -------
+        best_model_index - key of the selected model in ``results``
+        """
+        if len(results) == 0:
+            return None
+        first_metric = results['Metric'].iloc[0]
+        scores = results[results['Metric'] == first_metric]['Test score']
+        best_model_index = scores.idxmin()
+        return best_model_index
+
+    def _models_are_equal(self, target):
+        for idx, model in self.model_list.items():
+            if (model.model == target.model) & (model.features == target.features) & (model.horizon == target.horizon):
+                return idx
+        return None
 
     def _fit_one_model(self, X_split: pd.DataFrame, model, results, only_model=False):
         start_time = time()
-        model_index = results[_models_are_equal(results['Model'], model)].index
+        model_index = self._models_are_equal(model)
         model.cache_features = True
         model.fit(X_split, only_model=only_model)
         scores = model.score(metrics=self.metrics)
@@ -91,6 +89,23 @@ def _fit_one_model(self, X_split: pd.DataFrame, model, results, only_model=False
         results.loc[model_index, "Fit time"] = fit_time
         return results
 
+    def _fit_ts_forecaster_model(self, model, params, X_split, results):
+
+        for feature in params['features']:
+            for horizon in params['horizon']:
+                submodel = model(features=feature, horizon=horizon, model=params['model'][0])
+                results = self._fit_one_model(X_split, submodel, results)
+                for next_model in params['model'][1:]:
+                    submodel.set_model(next_model)
+                    results = self._fit_one_model(X_split, submodel, results, only_model=True)
+        return results
+
+    def _fit_other_models(self, model, X_split, results):
+        model_list = list(filter(lambda x: isinstance(x, model), self.model_list.values()))
+        for submodel in model_list:
+            results = self._fit_one_model(X_split, submodel, results)
+        return results
 
     def _cv_fit_one_split(self, X_split: pd.DataFrame) -> pd.DataFrame:
         """
         Fits all models from ``self.model_list`` on the provided time series, splitting it into train and test and recording the fit time
@@ -108,20 +123,12 @@ def _cv_fit_one_split(self, X_split: pd.DataFrame) -> pd.DataFrame:
         results = self.cv_results_.copy()
         for model, params in self.models_sets.items():
             if model == TimeSeriesForecastingModel:
-                for feature in params['features']:
-                    for horizon in params['horizon']:
-                        submodel = model(features=feature, horizon=horizon, model=params['model'][0])
-                        results = self._fit_one_model(X_split, submodel, results)
-                        for next_model in params['model'][1:]:
-                            submodel.set_model(next_model)
-                            results = self._fit_one_model(X_split, submodel, results, only_model=True)
+                results = self._fit_ts_forecaster_model(model, params, X_split, results)
             else:
-                model_list = list(filter(lambda x: isinstance(x, model), self.model_list))
-                for submodel in model_list:
-                    results = self._fit_one_model(X_split, submodel, results)
+                results = self._fit_other_models(model, X_split, results)
         return results
 
-    def fit(self, X: pd.DataFrame, y: pd.DataFrame = None):
+    def fit(self, X: pd.DataFrame, y: pd.DataFrame = None, refit='best'):
         """
         Performs cross-validation, selecting the best model from ``self.model_list`` according to ``self.selection``,
         and refits models on all available data according to ``refit``: 'best' refits only the selected model,
         'all' refits every model, and an iterable of model keys refits exactly those models.
@@ -136,14 +143,28 @@ def fit(self, X: pd.DataFrame, y: pd.DataFrame = None):
         self: CVPipeline
         """
 
+        result_idx = pd.MultiIndex.from_product([self.model_list, self.metrics.keys()])
+        result_idx.names = ['Model', 'Metric']
+        self.cv_results_ = pd.DataFrame(
+            0.0, index=result_idx, columns=result_cols
+        ).reset_index().set_index('Model')
+
         for idx in self.cv(X, self.n_splits):
             X_split = X.loc[idx]
             self.cv_results_[result_cols] += self._cv_fit_one_split(X_split)[result_cols]
 
         self.cv_results_[result_cols] /= self.n_splits
-        self.best_model_ = self.selection(self.cv_results_.dropna())
-        for model in self.model_list:
-            model.fit(X)
+        self.best_model_ = self.model_list[self.selection(self.cv_results_.dropna())]
+
+        if refit == 'all':
+            for model in self.model_list.values():
+                model.fit(X)
+        elif refit == 'best':
+            self.best_model_.fit(X)
+        else:
+            for idx in refit:
+                self.model_list[idx].fit(X)
         return self

     def predict(self, X: pd.DataFrame = None) -> pd.DataFrame:
@@ -166,7 +187,6 @@ def predict(self, X: pd.DataFrame = None) -> pd.DataFrame:

 if __name__ == '__main__':
     from gtime.preprocessing import TimeSeriesPreparation
-    # from gtime.model_selection import CVPipeline
     from gtime.metrics import rmse, mape
     from gtime.time_series_models import Naive, AR, TimeSeriesForecastingModel
     from gtime.forecasting import NaiveForecaster, DriftForecaster
@@ -194,4 +214,6 @@ def predict(self, X: pd.DataFrame = None) -> pd.DataFrame:
     }
 
     c = CVPipeline(models_sets=models, metrics=scoring)
-    c.fit(period_index_time_series)
+    c.fit(period_index_time_series, refit='all')
+    print(c.predict())
+    print(c.model_list['AR: {\'horizon\': 7, \'p\': 2}'].predict())
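The ``refit`` argument added to ``fit`` is the other half of the keys refactoring: with models in a dict, callers can refit every candidate, only the selected one, or an explicit subset of keys. A self-contained sketch of that three-way dispatch (``DummyModel``, ``refit_models``, and the keys are illustrative stand-ins, not library code):

    class DummyModel:  # hypothetical; only fit() matters here
        def __init__(self):
            self.fitted = False

        def fit(self, X):
            self.fitted = True

    def refit_models(model_list, best_key, X, refit='best'):
        # mirrors the branch this commit adds to CVPipeline.fit
        if refit == 'all':
            for model in model_list.values():
                model.fit(X)
        elif refit == 'best':
            model_list[best_key].fit(X)
        else:  # anything else is treated as an iterable of model_list keys
            for idx in refit:
                model_list[idx].fit(X)

    models = {"Naive: {'horizon': 3}": DummyModel(), "Naive: {'horizon': 5}": DummyModel()}
    refit_models(models, "Naive: {'horizon': 3}", X=None, refit=["Naive: {'horizon': 5}"])
    print([key for key, m in models.items() if m.fitted])  # only the horizon-5 model was refitted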
12 changes: 6 additions & 6 deletions gtime/model_selection/tests/test_cv_pipeline.py
@@ -15,21 +15,21 @@ def draw_unique_subset(draw, lst):

 @st.composite
 def naive_model(draw):
-    horizon = draw(st.lists(st.integers(min_value=1, max_value=20), min_size=1, max_size=4))
+    horizon = draw(st.lists(st.integers(min_value=1, max_value=20), min_size=1, max_size=4, unique=True))
     return (Naive, {'horizon': horizon})


 @st.composite
 def seasonal_naive_model(draw):
-    horizon = draw(st.lists(st.integers(min_value=1, max_value=20), min_size=1, max_size=4))
-    seasonal_length = draw(st.lists(st.integers(min_value=1, max_value=10), min_size=1, max_size=4))
+    horizon = draw(st.lists(st.integers(min_value=1, max_value=20), min_size=1, max_size=4, unique=True))
+    seasonal_length = draw(st.lists(st.integers(min_value=1, max_value=10), min_size=1, max_size=4, unique=True))
     return (SeasonalNaive, {'horizon': horizon, 'seasonal_length': seasonal_length})


 @st.composite
 def ar_model(draw):
-    horizon = draw(st.lists(st.integers(min_value=1, max_value=20), min_size=1, max_size=4))
-    p = draw(st.lists(st.integers(min_value=1, max_value=20), min_size=1, max_size=4))
+    horizon = draw(st.lists(st.integers(min_value=1, max_value=20), min_size=1, max_size=4, unique=True))
+    p = draw(st.lists(st.integers(min_value=1, max_value=20), min_size=1, max_size=4, unique=True))
     explainer = draw(st.sampled_from([None, "lime", "shap"]))
     return (AR, {'horizon': horizon, 'p': p, 'explainer_type': [explainer]})

@@ -74,7 +74,7 @@ def test_fit_predict(self, models, n_splits, blocking, metrics, seed):
         idx = pd.period_range(start='2011-01-01', end='2012-01-01')
         df = pd.DataFrame(np.random.standard_normal((len(idx), 1)), index=idx, columns=['1'])
         cv_pipeline.fit(df)
-        assert cv_pipeline.cv_results_.shape == (len(cv_pipeline.model_list) * len(metrics), 5)
+        assert cv_pipeline.cv_results_.shape == (len(cv_pipeline.model_list) * len(metrics), 4)
         y_pred = cv_pipeline.predict()
         horizon = cv_pipeline.best_model_.horizon
         assert y_pred.shape == (horizon, horizon)
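Both test adjustments follow from the refactoring above. The expected ``cv_results_`` width drops from 5 to 4 because ``fit`` now ends the frame construction with ``.set_index('Model')``, turning the model key into the index rather than a column (leaving Metric, Fit time, Train score, and Test score). The new ``unique=True`` flags stop hypothesis from drawing duplicate parameter values; a plausible motivation (my reading, not stated in the commit) is that duplicate draws now collapse into a single dict key, silently shrinking the candidate set, as this small sketch shows:

    model_list = {}
    for h in [7, 7]:  # a duplicate horizon, as hypothesis could draw without unique=True
        model_list["Naive: {'horizon': %d}" % h] = object()

    assert len(model_list) == 1  # one candidate instead of two — duplicates silently vanish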
