Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for serializing detectors with scikit-learn backends and/or models #642

Merged
merged 28 commits into from
Oct 12, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
af3be06
Limit sphinx-autodoc-typehints upper bound
ascillitoe Sep 27, 2022
5800e79
Initial changes
ascillitoe Sep 27, 2022
5ceec2d
Remove ClassifierTF from test_saving
ascillitoe Sep 27, 2022
d2e9bab
Remove model from LARGE_ARTEFACTS
ascillitoe Sep 27, 2022
9c15b00
Revert incorrect changes
ascillitoe Sep 27, 2022
a3ccb66
Revert "Limit sphinx-autodoc-typehints upper bound"
ascillitoe Sep 27, 2022
2d8e89e
Remove changes to optional deps docstring
ascillitoe Sep 27, 2022
f632340
Make saving.tensorflow private
ascillitoe Sep 27, 2022
0edcac6
Fix missing _tensorflow update
ascillitoe Sep 27, 2022
05184e9
Merge branch 'feature/save_load_improvements' into feature/sklearn_sa…
ascillitoe Sep 27, 2022
0982465
Add __all__ to saving/_tensorflow/__init__.py
ascillitoe Sep 28, 2022
aa57bbe
Move incorrectly placed comment in saving.py
ascillitoe Sep 28, 2022
6394327
Merge branch 'feature/save_load_improvements' into feature/sklearn_sa…
ascillitoe Sep 28, 2022
d855a5a
WIP: Initial framework for sklearn save/load
ascillitoe Sep 29, 2022
a32cb89
Decouple model type and backend
ascillitoe Sep 29, 2022
c0b33b4
Simplify pydantic validation for model field
ascillitoe Sep 29, 2022
ae6e366
Remove config_spec
ascillitoe Sep 29, 2022
9e3bfd3
Merge branch 'feature/remove_config_spec' into feature/sklearn_saving
ascillitoe Sep 29, 2022
9261337
Partial refactor of save/load tests
ascillitoe Sep 30, 2022
70897cb
Fix typo in test_save_preprocess_nlp
ascillitoe Sep 30, 2022
362f808
Add xgboost model to sklearn save tests
ascillitoe Sep 30, 2022
13deafe
Update save/load docs
ascillitoe Sep 30, 2022
9a66a92
Update saving.md
ascillitoe Sep 30, 2022
4f304f1
Update saving.md
ascillitoe Sep 30, 2022
1a6cab8
Merge branch 'master' into feature/sklearn_saving
ascillitoe Sep 30, 2022
04b503e
Add backend-conditional validation back to SupportedModelsType
ascillitoe Oct 5, 2022
4d530e5
Changes to schemas.py as per comments
ascillitoe Oct 11, 2022
c3fd225
Remove load_dir from sklearn load_model
ascillitoe Oct 11, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions alibi_detect/saving/_sklearn/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from alibi_detect.saving._sklearn.saving import save_model_config as save_model_config_sk
from alibi_detect.saving._sklearn.loading import load_model as load_model_sk
Copy link
Contributor Author

@ascillitoe ascillitoe Oct 5, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added __init__.py here for consistency with the _tensorflow subpackage. Not particularly important since saving._sklearn is private anyway...


# Re-export the backend-specific save/load helpers under aliased names so the
# parent `saving` package can import them without clashing with the equivalent
# helpers from the _tensorflow subpackage.
__all__ = [
"save_model_config_sk",
"load_model_sk"
]
26 changes: 26 additions & 0 deletions alibi_detect/saving/_sklearn/loading.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import os
from pathlib import Path
from typing import Union

import joblib
from sklearn.base import BaseEstimator


def load_model(filepath: Union[str, os.PathLike],
               ) -> BaseEstimator:
    """
    Load a scikit-learn (or xgboost) model from a directory.

    The model is expected to be a subclass of :class:`~sklearn.base.BaseEstimator`,
    which includes xgboost models that follow the scikit-learn API
    (see https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn).

    Parameters
    ----------
    filepath
        Saved model directory.

    Returns
    -------
    Loaded model.
    """
    # The model artefact is always stored as 'model.joblib' inside the directory.
    return joblib.load(Path(filepath).joinpath('model.joblib'))
68 changes: 68 additions & 0 deletions alibi_detect/saving/_sklearn/saving.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import logging
import os
from pathlib import Path
from typing import Union

import joblib
from sklearn.base import BaseEstimator

logger = logging.getLogger(__name__)


def save_model_config(model: BaseEstimator,
                      base_path: Path,
                      local_path: Path = Path('.')) -> dict:
    """
    Save a scikit-learn (or xgboost) model to a config dictionary.
    Models are assumed to be a subclass of :class:`~sklearn.base.BaseEstimator`. This includes xgboost models
    following the scikit-learn API
    (see https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn).

    Parameters
    ----------
    model
        The model to save.
    base_path
        Base filepath to save to (the location of the `config.toml` file).
    local_path
        A local (relative) filepath to append to base_path.

    Returns
    -------
    The model config dict.
    """
    # Persist the model artefact under <base_path>/<local_path>/model/.
    save_model(model, filepath=base_path.joinpath(local_path), save_dir='model')
    # 'src' is stored relative to base_path so the config stays relocatable.
    return {
        'flavour': 'sklearn',
        'src': local_path.joinpath('model')
    }


def save_model(model: BaseEstimator,
               filepath: Union[str, os.PathLike],
               save_dir: Union[str, os.PathLike] = 'model') -> None:
    """
    Save scikit-learn (and xgboost) models. Models are assumed to be a subclass of :class:`~sklearn.base.BaseEstimator`.
    This includes xgboost models following the scikit-learn API
    (see https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn).

    Parameters
    ----------
    model
        The model to save.
    filepath
        Save directory.
    save_dir
        Name of folder to save to within the filepath directory.
    """
    # create folder to save model in
    model_path = Path(filepath).joinpath(save_dir)
    if not model_path.is_dir():
        logger.warning('Directory {} does not exist and is now created.'.format(model_path))
        model_path.mkdir(parents=True, exist_ok=True)

    # save model via joblib (the standard persistence mechanism for sklearn estimators)
    model_path = model_path.joinpath('model.joblib')
    joblib.dump(model, model_path)
32 changes: 32 additions & 0 deletions alibi_detect/saving/_sklearn/tests/test_saving_sk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from pytest_cases import param_fixture, parametrize, parametrize_with_cases

from alibi_detect.saving.tests.datasets import ContinuousData
from alibi_detect.saving.tests.models import classifier_model, xgb_classifier_model

from alibi_detect.saving.loading import _load_model_config
from alibi_detect.saving.saving import _path2str, _save_model_config
from alibi_detect.saving.schemas import ModelConfig

backend = param_fixture("backend", ['sklearn'])


@parametrize_with_cases("data", cases=ContinuousData.data_synthetic_nd, prefix='data_')
@parametrize('model', [classifier_model, xgb_classifier_model])
def test_save_model_sk(data, model, tmp_path):
    """
    Round-trip test of _save_model_config and _load_model_config with scikit-learn
    and xgboost models.
    """
    # Save the model and validate the resulting config via the pydantic schema
    model_cfg, _ = _save_model_config(model, base_path=tmp_path)
    model_cfg = ModelConfig(**_path2str(model_cfg)).dict()
    assert tmp_path.joinpath('model').is_dir()
    assert tmp_path.joinpath('model/model.joblib').is_file()

    # The saved 'src' is relative; point it at the absolute location before loading
    model_cfg['src'] = tmp_path.joinpath('model')

    # Reload and check the model type survived the round trip
    loaded_model = _load_model_config(model_cfg)
    assert isinstance(loaded_model, type(model))
22 changes: 7 additions & 15 deletions alibi_detect/saving/_tensorflow/loading.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def load_model(filepath: Union[str, os.PathLike],
return model


def prep_model_and_emb(model: Optional[Callable], emb: Optional[TransformerEmbedding]) -> Callable:
def prep_model_and_emb(model: Callable, emb: Optional[TransformerEmbedding]) -> Callable:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

model no longer Optional, since the logic here is changed so that prep_model_and_emb is not called if model is None.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe a distraction here, but what about the typing? Of course since this is tensorflow specific we wouldn't use the new SupportedModelsType, but that raises a question that maybe we should have a custom tensorflow type. Perhaps one for later...

Copy link
Contributor Author

@ascillitoe ascillitoe Oct 11, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure we can use SupportedModelsType as is, since it contains if-statements conditional on the installed optional deps, so I don't think a static type checker (mypy) will like it? On the other hand, there used to be a Union of forward refs in schemas.py for use with mypy, but I deleted it as pydantic doesn't like forwardrefs. I've realised that UAE and HiddenOutput etc. are all subclasses of tf.keras.Model so we could probably just simplify the type annotations anyway...

Will open a follow-up PR for this!

"""
Function to perform final preprocessing of model (and/or embedding) before it is passed to preprocess_drift.

Expand All @@ -78,25 +78,17 @@ def prep_model_and_emb(model: Optional[Callable], emb: Optional[TransformerEmbed
model
A compatible model.
emb
A text embedding model.
An optional text embedding model.

Returns
-------
The final model ready to passed to preprocess_drift.
"""
# If a model exists, process it (and embedding)
if model is not None:
model = model.encoder if isinstance(model, UAE) else model # This is to avoid nesting UAE's already a UAE
if emb is not None:
model = _Encoder(emb, mlp=model)
model = UAE(encoder_net=model)
# If no model exists, store embedding as model
else:
model = emb
if model is None:
raise ValueError("A 'model' and/or `embedding` must be specified when "
"preprocess_fn='preprocess_drift'")

# Process model (and embedding)
model = model.encoder if isinstance(model, UAE) else model # This is to avoid nesting UAE's already a UAE
if emb is not None:
model = _Encoder(emb, mlp=model)
model = UAE(encoder_net=model)
return model


Expand Down
13 changes: 10 additions & 3 deletions alibi_detect/saving/_tensorflow/saving.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,10 @@

def save_model_config(model: Callable,
base_path: Path,
input_shape: tuple,
input_shape: Optional[tuple],
local_path: Path = Path('.')) -> Tuple[dict, Optional[dict]]:
"""
Save a model to a config dictionary. When a model has a text embedding model contained within it,
Save a TensorFlow model to a config dictionary. When a model has a text embedding model contained within it,
this is extracted and saved separately.

Parameters
Expand All @@ -53,6 +53,9 @@ def save_model_config(model: Callable,
cfg_embed = None # type: Optional[Dict[str, Any]]
if isinstance(model, UAE):
if isinstance(model.encoder.layers[0], TransformerEmbedding): # if UAE contains embedding and encoder
if input_shape is None:
raise ValueError('Cannot save combined embedding and model when `input_shape` is None.')

# embedding
embed = model.encoder.layers[0]
cfg_embed = save_embedding_config(embed, base_path, local_path.joinpath('embedding'))
Expand All @@ -78,7 +81,10 @@ def save_model_config(model: Callable,
if model is not None:
filepath = base_path.joinpath(local_path)
save_model(model, filepath=filepath, save_dir='model')
cfg_model = {'src': local_path.joinpath('model')}
cfg_model = {
'flavour': 'tensorflow',
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Likely another distraction/follow-up... But I would say it would be more robust to define an enum once and then use its values everywhere to avoid possible typos, e.g. as in https://github.com/SeldonIO/alibi/blob/fa034a2ebf7cd69c1796cf81ac6a2862a15f03b7/alibi/utils/frameworks.py#L4-L6

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A sanity check would be that grepping the code base for a string like 'tensorflow' shouldn't bring up very many hits :)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will open a follow-up PR for this.

'src': local_path.joinpath('model')
}
return cfg_model, cfg_embed


Expand Down Expand Up @@ -142,6 +148,7 @@ def save_embedding_config(embed: TransformerEmbedding,
cfg_embed.update({'type': embed.emb_type})
cfg_embed.update({'layers': embed.hs_emb.keywords['layers']})
cfg_embed.update({'src': local_path})
cfg_embed.update({'flavour': 'tensorflow'})

# Save embedding model
logger.info('Saving embedding model to {}.'.format(filepath))
Expand Down
64 changes: 64 additions & 0 deletions alibi_detect/saving/_tensorflow/tests/test_saving_tf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from pytest_cases import param_fixture, parametrize, parametrize_with_cases

from alibi_detect.saving.tests.datasets import ContinuousData
from alibi_detect.saving.tests.models import encoder_model

from alibi_detect.cd.tensorflow import HiddenOutput as HiddenOutput_tf
from alibi_detect.saving.loading import _load_model_config, _load_optimizer_config
from alibi_detect.saving.saving import _path2str, _save_model_config
from alibi_detect.saving.schemas import ModelConfig

backend = param_fixture("backend", ['tensorflow'])


def test_load_optimizer_tf(backend):
    """Test the tensorflow _load_optimizer_config."""
    expected = {'learning_rate': 0.01, 'epsilon': 1e-7, 'amsgrad': False}

    # Build a keras-style optimizer config and load it
    cfg_opt = {
        'class_name': 'Adam',
        'config': {'name': 'Adam', **expected}
    }
    optimizer = _load_optimizer_config(cfg_opt, backend=backend)

    # The loaded optimizer should reflect the class and every hyperparameter
    assert type(optimizer).__name__ == 'Adam'
    assert optimizer.learning_rate == expected['learning_rate']
    assert optimizer.epsilon == expected['epsilon']
    assert optimizer.amsgrad == expected['amsgrad']


@parametrize_with_cases("data", cases=ContinuousData.data_synthetic_nd, prefix='data_')
@parametrize('model', [encoder_model])
@parametrize('layer', [None, -1])
def test_save_model_tf(data, model, layer, tmp_path):
    """
    Round-trip test of _save_model_config and _load_model_config with a tensorflow model.
    """
    # Save the model and validate the resulting config via the pydantic schema
    model_cfg, _ = _save_model_config(model, base_path=tmp_path, input_shape=(data[0].shape[1],))
    model_cfg = ModelConfig(**_path2str(model_cfg)).dict()
    assert tmp_path.joinpath('model').is_dir()
    assert tmp_path.joinpath('model/model.h5').is_file()

    # The saved 'src' is relative; point it at the absolute location before loading.
    # Optionally request a hidden layer to exercise the HiddenOutput path.
    model_cfg['src'] = tmp_path.joinpath('model')
    if layer is not None:
        model_cfg['layer'] = layer

    # Reload: either the original model type or a HiddenOutput wrapper is expected
    loaded_model = _load_model_config(model_cfg)
    if layer is None:
        assert isinstance(loaded_model, type(model))
    else:
        assert isinstance(loaded_model, HiddenOutput_tf)
Loading