-
Notifications
You must be signed in to change notification settings - Fork 225
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Support for serializing detectors with scikit-learn backends and/or models #642
Changes from all commits
af3be06
5800e79
5ceec2d
d2e9bab
9c15b00
a3ccb66
2d8e89e
f632340
0edcac6
05184e9
0982465
aa57bbe
6394327
d855a5a
a32cb89
c0b33b4
ae6e366
9e3bfd3
9261337
70897cb
362f808
13deafe
9a66a92
4f304f1
1a6cab8
04b503e
4d530e5
c3fd225
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
from alibi_detect.saving._sklearn.saving import save_model_config as save_model_config_sk
from alibi_detect.saving._sklearn.loading import load_model as load_model_sk

# Public API of the _sklearn saving subpackage.
__all__ = [
    "save_model_config_sk",
    "load_model_sk",
]
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
import os | ||
from pathlib import Path | ||
from typing import Union | ||
|
||
import joblib | ||
from sklearn.base import BaseEstimator | ||
|
||
|
||
def load_model(filepath: Union[str, os.PathLike],
               ) -> BaseEstimator:
    """
    Load a scikit-learn (or xgboost) model. Models are assumed to be a subclass of
    :class:`~sklearn.base.BaseEstimator`. This includes xgboost models following the scikit-learn API
    (see https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn).

    Parameters
    ----------
    filepath
        Saved model directory.

    Returns
    -------
    Loaded model.
    """
    # The model is expected at `<filepath>/model.joblib`, as written by the saving counterpart.
    return joblib.load(Path(filepath).joinpath('model.joblib'))
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
import logging | ||
import os | ||
from pathlib import Path | ||
from typing import Union | ||
|
||
import joblib | ||
from sklearn.base import BaseEstimator | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def save_model_config(model: BaseEstimator,
                      base_path: Path,
                      local_path: Path = Path('.')) -> dict:
    """
    Save a scikit-learn (or xgboost) model to a config dictionary.
    Models are assumed to be a subclass of :class:`~sklearn.base.BaseEstimator`. This includes xgboost models
    following the scikit-learn API
    (see https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn).

    Parameters
    ----------
    model
        The model to save.
    base_path
        Base filepath to save to (the location of the `config.toml` file).
    local_path
        A local (relative) filepath to append to base_path.

    Returns
    -------
    The model config dict.
    """
    # Persist the model under <base_path>/<local_path>/model, then record a
    # config entry pointing at the relative location.
    save_dir = 'model'
    save_model(model, filepath=base_path.joinpath(local_path), save_dir=save_dir)
    return {
        'flavour': 'sklearn',
        'src': local_path.joinpath(save_dir),
    }
|
||
|
||
def save_model(model: BaseEstimator,
               filepath: Union[str, os.PathLike],
               save_dir: Union[str, os.PathLike] = 'model') -> None:
    """
    Save scikit-learn (and xgboost) models. Models are assumed to be a subclass of :class:`~sklearn.base.BaseEstimator`.
    This includes xgboost models following the scikit-learn API
    (see https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn).

    Parameters
    ----------
    model
        The model to save.
    filepath
        Save directory.
    save_dir
        Name of folder to save to within the filepath directory.
    """
    # create folder to save model in
    model_path = Path(filepath).joinpath(save_dir)
    if not model_path.is_dir():
        logger.warning('Directory {} does not exist and is now created.'.format(model_path))
        model_path.mkdir(parents=True, exist_ok=True)

    # save model via joblib (the loading counterpart expects the file name 'model.joblib')
    model_path = model_path.joinpath('model.joblib')
    joblib.dump(model, model_path)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
from pytest_cases import param_fixture, parametrize, parametrize_with_cases | ||
|
||
from alibi_detect.saving.tests.datasets import ContinuousData | ||
from alibi_detect.saving.tests.models import classifier_model, xgb_classifier_model | ||
|
||
from alibi_detect.saving.loading import _load_model_config | ||
from alibi_detect.saving.saving import _path2str, _save_model_config | ||
from alibi_detect.saving.schemas import ModelConfig | ||
|
||
backend = param_fixture("backend", ['sklearn']) | ||
|
||
|
||
# NOTE(review): We arguably could be more specific about what tests we keep in [...].
# For now, tests that are very backend specific have been moved to the backend saving
# subpackages (e.g. [...]). Also note, our more usual convention would be to test [...]
@parametrize_with_cases("data", cases=ContinuousData.data_synthetic_nd, prefix='data_')
@parametrize('model', [classifier_model, xgb_classifier_model])
def test_save_model_sk(data, model, tmp_path):
    """
    Unit test for _save_model_config and _load_model_config with scikit-learn and xgboost model.
    """
    # Save model and build its config dict
    filepath = tmp_path
    cfg_model, _ = _save_model_config(model, base_path=filepath)
    cfg_model = _path2str(cfg_model)
    cfg_model = ModelConfig(**cfg_model).dict()
    assert tmp_path.joinpath('model').is_dir()
    assert tmp_path.joinpath('model/model.joblib').is_file()

    # Adjust config
    cfg_model['src'] = tmp_path.joinpath('model')  # Need to manually set to absolute path here

    # Load model and check it round-trips to the same type
    model_load = _load_model_config(cfg_model)
    assert isinstance(model_load, type(model))
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -69,7 +69,7 @@ def load_model(filepath: Union[str, os.PathLike], | |
return model | ||
|
||
|
||
def prep_model_and_emb(model: Optional[Callable], emb: Optional[TransformerEmbedding]) -> Callable: | ||
def prep_model_and_emb(model: Callable, emb: Optional[TransformerEmbedding]) -> Callable: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure we can use [...] here. Will open a follow-up PR for this! |
||
""" | ||
Function to perform final preprocessing of model (and/or embedding) before it is passed to preprocess_drift. | ||
|
||
|
@@ -78,25 +78,17 @@ def prep_model_and_emb(model: Optional[Callable], emb: Optional[TransformerEmbed | |
model | ||
A compatible model. | ||
emb | ||
A text embedding model. | ||
An optional text embedding model. | ||
|
||
Returns | ||
------- | ||
The final model ready to passed to preprocess_drift. | ||
""" | ||
# If a model exists, process it (and embedding) | ||
if model is not None: | ||
model = model.encoder if isinstance(model, UAE) else model # This is to avoid nesting UAE's already a UAE | ||
if emb is not None: | ||
model = _Encoder(emb, mlp=model) | ||
model = UAE(encoder_net=model) | ||
# If no model exists, store embedding as model | ||
else: | ||
model = emb | ||
if model is None: | ||
raise ValueError("A 'model' and/or `embedding` must be specified when " | ||
"preprocess_fn='preprocess_drift'") | ||
|
||
# Process model (and embedding) | ||
model = model.encoder if isinstance(model, UAE) else model # This is to avoid nesting UAE's already a UAE | ||
if emb is not None: | ||
model = _Encoder(emb, mlp=model) | ||
model = UAE(encoder_net=model) | ||
return model | ||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,10 +28,10 @@ | |
|
||
def save_model_config(model: Callable, | ||
base_path: Path, | ||
input_shape: tuple, | ||
input_shape: Optional[tuple], | ||
local_path: Path = Path('.')) -> Tuple[dict, Optional[dict]]: | ||
""" | ||
Save a model to a config dictionary. When a model has a text embedding model contained within it, | ||
Save a TensorFlow model to a config dictionary. When a model has a text embedding model contained within it, | ||
this is extracted and saved separately. | ||
|
||
Parameters | ||
|
@@ -53,6 +53,9 @@ def save_model_config(model: Callable, | |
cfg_embed = None # type: Optional[Dict[str, Any]] | ||
if isinstance(model, UAE): | ||
if isinstance(model.encoder.layers[0], TransformerEmbedding): # if UAE contains embedding and encoder | ||
if input_shape is None: | ||
raise ValueError('Cannot save combined embedding and model when `input_shape` is None.') | ||
|
||
# embedding | ||
embed = model.encoder.layers[0] | ||
cfg_embed = save_embedding_config(embed, base_path, local_path.joinpath('embedding')) | ||
|
@@ -78,7 +81,10 @@ def save_model_config(model: Callable, | |
if model is not None: | ||
filepath = base_path.joinpath(local_path) | ||
save_model(model, filepath=filepath, save_dir='model') | ||
cfg_model = {'src': local_path.joinpath('model')} | ||
cfg_model = { | ||
'flavour': 'tensorflow', | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Likely another distraction/follow-up... But I would say it would be more robust to define an enum once and then use its values everywhere to avoid possible typos, e.g. as in https://github.com/SeldonIO/alibi/blob/fa034a2ebf7cd69c1796cf81ac6a2862a15f03b7/alibi/utils/frameworks.py#L4-L6 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A sanity check would be that grepping the code base for a string like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will open a follow-up PR for this. |
||
'src': local_path.joinpath('model') | ||
} | ||
return cfg_model, cfg_embed | ||
|
||
|
||
|
@@ -142,6 +148,7 @@ def save_embedding_config(embed: TransformerEmbedding, | |
cfg_embed.update({'type': embed.emb_type}) | ||
cfg_embed.update({'layers': embed.hs_emb.keywords['layers']}) | ||
cfg_embed.update({'src': local_path}) | ||
cfg_embed.update({'flavour': 'tensorflow'}) | ||
|
||
# Save embedding model | ||
logger.info('Saving embedding model to {}.'.format(filepath)) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
from pytest_cases import param_fixture, parametrize, parametrize_with_cases | ||
|
||
from alibi_detect.saving.tests.datasets import ContinuousData | ||
from alibi_detect.saving.tests.models import encoder_model | ||
|
||
from alibi_detect.cd.tensorflow import HiddenOutput as HiddenOutput_tf | ||
from alibi_detect.saving.loading import _load_model_config, _load_optimizer_config | ||
from alibi_detect.saving.saving import _path2str, _save_model_config | ||
from alibi_detect.saving.schemas import ModelConfig | ||
|
||
backend = param_fixture("backend", ['tensorflow']) | ||
|
||
|
||
def test_load_optimizer_tf(backend):
    """Test the tensorflow _load_optimizer_config."""
    class_name = 'Adam'
    learning_rate = 0.01
    epsilon = 1e-7
    amsgrad = False

    # Build a keras-style optimizer config and load it
    cfg_opt = {
        'class_name': class_name,
        'config': {
            'name': class_name,
            'learning_rate': learning_rate,
            'epsilon': epsilon,
            'amsgrad': amsgrad
        }
    }
    optimizer = _load_optimizer_config(cfg_opt, backend=backend)

    # Loaded optimizer should reflect every field of the config
    assert type(optimizer).__name__ == class_name
    assert optimizer.learning_rate == learning_rate
    assert optimizer.epsilon == epsilon
    assert optimizer.amsgrad == amsgrad
|
||
|
||
@parametrize_with_cases("data", cases=ContinuousData.data_synthetic_nd, prefix='data_')
@parametrize('model', [encoder_model])
@parametrize('layer', [None, -1])
def test_save_model_tf(data, model, layer, tmp_path):
    """
    Unit test for _save_model_config and _load_model_config with tensorflow model.
    """
    # Save model and build its config dict
    filepath = tmp_path
    input_shape = (data[0].shape[1],)
    cfg_model, _ = _save_model_config(model, base_path=filepath, input_shape=input_shape)
    cfg_model = _path2str(cfg_model)
    cfg_model = ModelConfig(**cfg_model).dict()
    assert tmp_path.joinpath('model').is_dir()
    assert tmp_path.joinpath('model/model.h5').is_file()

    # Adjust config
    cfg_model['src'] = tmp_path.joinpath('model')  # Need to manually set to absolute path here
    if layer is not None:
        cfg_model['layer'] = layer

    # Load model: a plain model round-trips to its own type, a layer selection
    # yields a HiddenOutput wrapper
    model_load = _load_model_config(cfg_model)
    if layer is None:
        assert isinstance(model_load, type(model))
    else:
        assert isinstance(model_load, HiddenOutput_tf)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added `__init__.py` here for consistency with the `_tensorflow` subpackage. Not particularly important since `_saving._sklearn` is private anyway...