Support for serializing detectors with scikit-learn backends and/or models (#642)
SeldonIO · Oct 12, 2022 · 1898ad2 · 1898ad2
1 parent b915d63
commit 1898ad2
Show file tree

Hide file tree

Showing 14 changed files with 660 additions and 403 deletions.
diff --git a/alibi_detect/saving/_sklearn/__init__.py b/alibi_detect/saving/_sklearn/__init__.py
@@ -0,0 +1,7 @@
+from alibi_detect.saving._sklearn.saving import save_model_config as save_model_config_sk
+from alibi_detect.saving._sklearn.loading import load_model as load_model_sk
+
+__all__ = [
+    "save_model_config_sk",
+    "load_model_sk"
+]
diff --git a/alibi_detect/saving/_sklearn/loading.py b/alibi_detect/saving/_sklearn/loading.py
@@ -0,0 +1,26 @@
+import os
+from pathlib import Path
+from typing import Union
+
+import joblib
+from sklearn.base import BaseEstimator
+
+
+def load_model(filepath: Union[str, os.PathLike],
+               ) -> BaseEstimator:
+    """
+    Load scikit-learn (or xgboost) model. Models are assumed to be a subclass of :class:`~sklearn.base.BaseEstimator`.
+    This includes xgboost models following the scikit-learn API
+    (see https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn).
+
+    Parameters
+    ----------
+    filepath
+        Saved model directory.
+
+    Returns
+    -------
+    Loaded model.
+    """
+    model_dir = Path(filepath)
+    return joblib.load(model_dir.joinpath('model.joblib'))
diff --git a/alibi_detect/saving/_sklearn/saving.py b/alibi_detect/saving/_sklearn/saving.py
@@ -0,0 +1,68 @@
+import logging
+import os
+from pathlib import Path
+from typing import Union
+
+import joblib
+from sklearn.base import BaseEstimator
+
+logger = logging.getLogger(__name__)
+
+
+def save_model_config(model: BaseEstimator,
+                      base_path: Path,
+                      local_path: Path = Path('.')) -> dict:
+    """
+    Save a scikit-learn (or xgboost) model to a config dictionary.
+    Models are assumed to be a subclass of :class:`~sklearn.base.BaseEstimator`. This includes xgboost models
+    following the scikit-learn API
+    (see https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn).
+
+    Parameters
+    ----------
+    model
+        The model to save.
+    base_path
+        Base filepath to save to (the location of the `config.toml` file).
+    local_path
+        A local (relative) filepath to append to base_path.
+
+    Returns
+    -------
+    The model config dict.
+    """
+    filepath = base_path.joinpath(local_path)
+    save_model(model, filepath=filepath, save_dir='model')
+    cfg_model = {
+        'flavour': 'sklearn',
+        'src': local_path.joinpath('model')
+    }
+    return cfg_model
+
+
+def save_model(model: BaseEstimator,
+               filepath: Union[str, os.PathLike],
+               save_dir: Union[str, os.PathLike] = 'model') -> None:
+    """
+    Save scikit-learn (and xgboost) models. Models are assumed to be a subclass of :class:`~sklearn.base.BaseEstimator`.
+    This includes xgboost models following the scikit-learn API
+    (see https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn).
+
+    Parameters
+    ----------
+    model
+        The scikit-learn (or xgboost) model to save.
+    filepath
+        Save directory.
+    save_dir
+        Name of folder to save to within the filepath directory.
+    """
+    # create folder to save model in
+    model_path = Path(filepath).joinpath(save_dir)
+    if not model_path.is_dir():
+        logger.warning('Directory {} does not exist and is now created.'.format(model_path))
+        model_path.mkdir(parents=True, exist_ok=True)
+
+    # save model
+    model_path = model_path.joinpath('model.joblib')
+    joblib.dump(model, model_path)
diff --git a/alibi_detect/saving/_sklearn/tests/test_saving_sk.py b/alibi_detect/saving/_sklearn/tests/test_saving_sk.py
@@ -0,0 +1,32 @@
+from pytest_cases import param_fixture, parametrize, parametrize_with_cases
+
+from alibi_detect.saving.tests.datasets import ContinuousData
+from alibi_detect.saving.tests.models import classifier_model, xgb_classifier_model
+
+from alibi_detect.saving.loading import _load_model_config
+from alibi_detect.saving.saving import _path2str, _save_model_config
+from alibi_detect.saving.schemas import ModelConfig
+
+backend = param_fixture("backend", ['sklearn'])
+
+
+@parametrize_with_cases("data", cases=ContinuousData.data_synthetic_nd, prefix='data_')
+@parametrize('model', [classifier_model, xgb_classifier_model])
+def test_save_model_sk(data, model, tmp_path):
+    """
+    Unit test for _save_model_config and _load_model_config with scikit-learn and xgboost models.
+    """
+    # Save model
+    filepath = tmp_path
+    cfg_model, _ = _save_model_config(model, base_path=filepath)
+    cfg_model = _path2str(cfg_model)
+    cfg_model = ModelConfig(**cfg_model).dict()
+    assert tmp_path.joinpath('model').is_dir()
+    assert tmp_path.joinpath('model/model.joblib').is_file()
+
+    # Adjust config
+    cfg_model['src'] = tmp_path.joinpath('model')  # Need to manually set to absolute path here
+
+    # Load model
+    model_load = _load_model_config(cfg_model)
+    assert isinstance(model_load, type(model))
diff --git a/alibi_detect/saving/_tensorflow/loading.py b/alibi_detect/saving/_tensorflow/loading.py
@@ -69,7 +69,7 @@ def load_model(filepath: Union[str, os.PathLike],
     return model
 
 
-def prep_model_and_emb(model: Optional[Callable], emb: Optional[TransformerEmbedding]) -> Callable:
+def prep_model_and_emb(model: Callable, emb: Optional[TransformerEmbedding]) -> Callable:
     """
     Function to perform final preprocessing of model (and/or embedding) before it is passed to preprocess_drift.
 
@@ -78,25 +78,17 @@ def prep_model_and_emb(model: Optional[Callable], emb: Optional[TransformerEmbed
     model
         A compatible model.
     emb
-        A text embedding model.
+        An optional text embedding model.
 
     Returns
     -------
     The final model ready to passed to preprocess_drift.
     """
-    # If a model exists, process it (and embedding)
-    if model is not None:
-        model = model.encoder if isinstance(model, UAE) else model  # This is to avoid nesting UAE's already a UAE
-        if emb is not None:
-            model = _Encoder(emb, mlp=model)
-            model = UAE(encoder_net=model)
-    # If no model exists, store embedding as model
-    else:
-        model = emb
-    if model is None:
-        raise ValueError("A 'model'  and/or `embedding` must be specified when "
-                         "preprocess_fn='preprocess_drift'")
-
+    # Process model (and embedding)
+    model = model.encoder if isinstance(model, UAE) else model  # This is to avoid nesting UAE's already a UAE
+    if emb is not None:
+        model = _Encoder(emb, mlp=model)
+        model = UAE(encoder_net=model)
     return model
 
 

diff --git a/alibi_detect/saving/_tensorflow/saving.py b/alibi_detect/saving/_tensorflow/saving.py
@@ -28,10 +28,10 @@
 
 def save_model_config(model: Callable,
                       base_path: Path,
-                      input_shape: tuple,
+                      input_shape: Optional[tuple],
                       local_path: Path = Path('.')) -> Tuple[dict, Optional[dict]]:
     """
-    Save a model to a config dictionary. When a model has a text embedding model contained within it,
+    Save a TensorFlow model to a config dictionary. When a model has a text embedding model contained within it,
     this is extracted and saved separately.
 
     Parameters
@@ -53,6 +53,9 @@ def save_model_config(model: Callable,
     cfg_embed = None  # type: Optional[Dict[str, Any]]
     if isinstance(model, UAE):
         if isinstance(model.encoder.layers[0], TransformerEmbedding):  # if UAE contains embedding and encoder
+            if input_shape is None:
+                raise ValueError('Cannot save combined embedding and model when `input_shape` is None.')
+
             # embedding
             embed = model.encoder.layers[0]
             cfg_embed = save_embedding_config(embed, base_path, local_path.joinpath('embedding'))
@@ -78,7 +81,10 @@ def save_model_config(model: Callable,
     if model is not None:
         filepath = base_path.joinpath(local_path)
         save_model(model, filepath=filepath, save_dir='model')
-        cfg_model = {'src': local_path.joinpath('model')}
+        cfg_model = {
+            'flavour': 'tensorflow',
+            'src': local_path.joinpath('model')
+        }
     return cfg_model, cfg_embed
 
 
@@ -142,6 +148,7 @@ def save_embedding_config(embed: TransformerEmbedding,
     cfg_embed.update({'type': embed.emb_type})
     cfg_embed.update({'layers': embed.hs_emb.keywords['layers']})
     cfg_embed.update({'src': local_path})
+    cfg_embed.update({'flavour': 'tensorflow'})
 
     # Save embedding model
     logger.info('Saving embedding model to {}.'.format(filepath))

diff --git a/alibi_detect/saving/_tensorflow/tests/test_saving_tf.py b/alibi_detect/saving/_tensorflow/tests/test_saving_tf.py
@@ -0,0 +1,64 @@
+from pytest_cases import param_fixture, parametrize, parametrize_with_cases
+
+from alibi_detect.saving.tests.datasets import ContinuousData
+from alibi_detect.saving.tests.models import encoder_model
+
+from alibi_detect.cd.tensorflow import HiddenOutput as HiddenOutput_tf
+from alibi_detect.saving.loading import _load_model_config, _load_optimizer_config
+from alibi_detect.saving.saving import _path2str, _save_model_config
+from alibi_detect.saving.schemas import ModelConfig
+
+backend = param_fixture("backend", ['tensorflow'])
+
+
+def test_load_optimizer_tf(backend):
+    "Test the tensorflow _load_optimizer_config."
+    class_name = 'Adam'
+    learning_rate = 0.01
+    epsilon = 1e-7
+    amsgrad = False
+
+    # Load
+    cfg_opt = {
+        'class_name': class_name,
+        'config': {
+            'name': class_name,
+            'learning_rate': learning_rate,
+            'epsilon': epsilon,
+            'amsgrad': amsgrad
+        }
+    }
+    optimizer = _load_optimizer_config(cfg_opt, backend=backend)
+    assert type(optimizer).__name__ == class_name
+    assert optimizer.learning_rate == learning_rate
+    assert optimizer.epsilon == epsilon
+    assert optimizer.amsgrad == amsgrad
+
+
+@parametrize_with_cases("data", cases=ContinuousData.data_synthetic_nd, prefix='data_')
+@parametrize('model', [encoder_model])
+@parametrize('layer', [None, -1])
+def test_save_model_tf(data, model, layer, tmp_path):
+    """
+    Unit test for _save_model_config and _load_model_config with tensorflow model.
+    """
+    # Save model
+    filepath = tmp_path
+    input_shape = (data[0].shape[1],)
+    cfg_model, _ = _save_model_config(model, base_path=filepath, input_shape=input_shape)
+    cfg_model = _path2str(cfg_model)
+    cfg_model = ModelConfig(**cfg_model).dict()
+    assert tmp_path.joinpath('model').is_dir()
+    assert tmp_path.joinpath('model/model.h5').is_file()
+
+    # Adjust config
+    cfg_model['src'] = tmp_path.joinpath('model')  # Need to manually set to absolute path here
+    if layer is not None:
+        cfg_model['layer'] = layer
+
+    # Load model
+    model_load = _load_model_config(cfg_model)
+    if layer is None:
+        assert isinstance(model_load, type(model))
+    else:
+        assert isinstance(model_load, HiddenOutput_tf)