From 69059ddc01bf14cd3d929d01ac8a7a3018a4df26 Mon Sep 17 00:00:00 2001 From: Liam Keegan Date: Thu, 23 Mar 2023 15:04:17 +0100 Subject: [PATCH] add `ModelManager` - ModelManager features - `load()` imports a spacy model created by DataManager - `metadata` dict exposes metadata the user should edit such as author, license, etc - `save()` saves changes (currently only the updated metadata can be changed) - `publish()` publishes the model to hugging face - add `test_model_manager` notebook with example of use - add `spacy-huggingface-hub` and `wheel` to dependencies also - refactor tests - simplify fixtures code in conftest.py - add `model_path` fixture to conftest.py that provides a trained spacy model - pin ipywidgets<8.0.5 for now to avoid test failures in CI - looks like it is due to this change: https://github.com/jupyter-widgets/ipywidgets/pull/3533 --- .github/workflows/ci.yml | 5 +- moralization/model_manager.py | 101 ++++++++++++++ moralization/tests/conftest.py | 49 ++++--- moralization/tests/test_model_manager.py | 100 +++++++++++++ notebooks/model_manager_test.ipynb | 170 +++++++++++++++++++++++ pyproject.toml | 6 +- 6 files changed, 409 insertions(+), 22 deletions(-) create mode 100644 moralization/model_manager.py create mode 100644 moralization/tests/test_model_manager.py create mode 100644 notebooks/model_manager_test.ipynb diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d3e713c..2d77cc0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -10,10 +10,11 @@ on: jobs: test: + name: "${{ matrix.os }} :: ${{ matrix.python-version }}" runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-20.04] + os: [ubuntu-latest] python-version: [3.9] steps: - name: Checkout repository @@ -30,7 +31,7 @@ jobs: - name: Run pytest run: | cd moralization - python -m pytest -s --cov=. --cov-report=xml + python -m pytest -v -s --cov=. 
--cov-report=xml - name: Upload coverage uses: codecov/codecov-action@v3 with: diff --git a/moralization/model_manager.py b/moralization/model_manager.py new file mode 100644 index 0000000..109329a --- /dev/null +++ b/moralization/model_manager.py @@ -0,0 +1,101 @@ +import huggingface_hub +import spacy_huggingface_hub +import os +import spacy +from pathlib import Path +from typing import Union, Optional, Dict, Any +import tempfile +import re +import logging + + +def _construct_wheel_path(model_path: Path, meta: Dict[str, Any]) -> Path: + full_name = f"{meta['lang']}_{meta['name']}-{meta['version']}" + return model_path / full_name / "dist" / f"{full_name}-py3-none-any.whl" + + +def _make_valid_package_name(name: str) -> str: + # attempt to make name valid, throw exception if we fail + # https://packaging.python.org/en/latest/specifications/name-normalization + valid_name = re.sub(r"[-_.,<>!@#$%^&*()+ /?]+", "_", name).lower().strip("_") + if name != valid_name: + logging.warning( + f"'{name}' not a valid package name, using '{valid_name}' instead" + ) + if ( + re.match("^([A-Z0-9]|[A-Z0-9][A-Z0-9._-]*[A-Z0-9])$", valid_name, re.IGNORECASE) + is None + ): + raise ValueError( + "Invalid package name: Can only contain ASCII letters, numbers and underscore." + ) + return valid_name + + +class ModelManager: + """ + Import, modify and publish models to hugging face. 
+ """ + + _meta_keys_to_expose_to_user = [ + "name", + "version", + "description", + "author", + "email", + "url", + "license", + ] + + def __init__(self, model_path: Union[str, Path] = None): + self.load(model_path) + + def load(self, model_path: Union[str, Path]): + """Load a spacy model from `model_path`.""" + self.model_path = Path(model_path) + self.spacy_model = spacy.load(model_path) + self.metadata = { + k: self.spacy_model.meta.get(k, "") + for k in self._meta_keys_to_expose_to_user + } + + def save(self): + """Save any changes made to the model metadata.""" + self._update_metadata() + self.spacy_model.to_disk(self.model_path) + + def publish(self, hugging_face_token: Optional[str] = None) -> Dict[str, str]: + """Publish the model to Hugging Face. + + This requires a User Access Token from https://huggingface.co/ + + The token can either be passed via the `hugging_face_token` argument, + or it can be set via the `HUGGING_FACE_TOKEN` environment variable. + + Args: + hugging_face_token (str, optional): Hugging Face User Access Token + Returns: + dict: URLs of the published model and the pip-installable wheel + """ + self.save() + if hugging_face_token is None: + hugging_face_token = os.environ.get("HUGGING_FACE_TOKEN") + if hugging_face_token is None: + raise ValueError( + "API TOKEN required: pass as string or set the HUGGING_FACE_TOKEN environment variable." 
+ ) + huggingface_hub.login(token=hugging_face_token) + with tempfile.TemporaryDirectory() as tmpdir: + # convert model to a python package incl binary wheel + output_path = Path(tmpdir) + spacy.cli.package(self.model_path, output_path, create_wheel=True) + # push the package to hugging face + return spacy_huggingface_hub.push( + _construct_wheel_path(output_path, self.spacy_model.meta) + ) + + def _update_metadata(self): + self.metadata["name"] = _make_valid_package_name(self.metadata.get("name")) + for k, v in self.metadata.items(): + if k in self.spacy_model.meta: + self.spacy_model.meta[k] = v diff --git a/moralization/tests/conftest.py b/moralization/tests/conftest.py index 995d6ed..6c59480 100644 --- a/moralization/tests/conftest.py +++ b/moralization/tests/conftest.py @@ -1,31 +1,44 @@ import pytest from moralization import input_data +from moralization.data_manager import DataManager import pathlib -def _data_path_fixture(dir_path): - @pytest.fixture - def _fixture(): - return dir_path +@pytest.fixture(scope="session") +def data_dir(): + return pathlib.Path(__file__).parents[1].resolve() / "data" - return _fixture +@pytest.fixture(scope="session") +def ts_file(data_dir): + return data_dir / "TypeSystem.xml" -def _doc_dict_fixture(dir_path): - @pytest.fixture - def _fixture(): - return input_data.InputOutput.read_data(dir_path) - return _fixture +@pytest.fixture(scope="session") +def data_file(data_dir): + return ( + data_dir / "test_data-trimmed_version_of-Interviews-pos-SH-neu-optimiert-AW.xmi" + ) -dir_path = pathlib.Path(__file__).parents[1].resolve() / "data" -data_dir = _data_path_fixture(dir_path) -doc_dicts = _doc_dict_fixture(dir_path) +@pytest.fixture(scope="session") +def config_file(data_dir): + return data_dir / "config.cfg" -ts_file = _data_path_fixture(dir_path / "TypeSystem.xml") -data_file = _data_path_fixture( - dir_path / "test_data-trimmed_version_of-Interviews-pos-SH-neu-optimiert-AW.xmi" -) -config_file = _data_path_fixture(dir_path / 
"config.cfg") +@pytest.fixture(scope="session") +def model_path(data_dir, config_file, tmp_path_factory) -> pathlib.Path: + """ + Returns a temporary path containing a trained model. + This is only created once and re-used for the entire pytest session. + """ + dm = DataManager(data_dir) + dm.export_data_DocBin() + tmp_path = tmp_path_factory.mktemp("model") + dm.spacy_train(working_dir=tmp_path, config=config_file, n_epochs=1) + yield tmp_path / "output" / "model-best" + + +@pytest.fixture +def doc_dicts(data_dir): + return input_data.InputOutput.read_data(str(data_dir)) diff --git a/moralization/tests/test_model_manager.py b/moralization/tests/test_model_manager.py new file mode 100644 index 0000000..afe1a44 --- /dev/null +++ b/moralization/tests/test_model_manager.py @@ -0,0 +1,100 @@ +from moralization.model_manager import ModelManager +import spacy +import pytest +import spacy_huggingface_hub +import huggingface_hub +from typing import Any +from pathlib import Path + + +def test_model_manager_valid_path(model_path): + model = ModelManager(model_path) + assert model.spacy_model is not None + assert model.spacy_model.lang == "de" + assert model.spacy_model.path == model_path + + +def test_model_manager_modify_metadata(model_path): + model = ModelManager(model_path) + # update metadata values and save model + keys = ["name", "version", "description", "author", "email", "url", "license"] + for key in keys: + model.metadata[key] = f"{key}" + model.save() + for key in keys: + assert model.metadata[key] == f"{key}" + # re-load model + model.load(model_path) + for key in keys: + assert model.metadata[key] == f"{key}" + # load model directly in spacy and check its meta has also been updated + nlp = spacy.load(model_path) + for key in keys: + assert nlp.meta[key] == f"{key}" + + +def test_model_manager_modify_metadata_fixable_invalid_names(model_path): + model = ModelManager(model_path) + for invalid_name, valid_name in [("!hm & __OK?,...", "hm_ok"), ("Im - S", 
"im_s")]: + model.metadata["name"] = invalid_name + assert model.metadata["name"] == invalid_name + # name is made valid on call to save() + model.save() + assert model.metadata["name"] == valid_name + nlp = spacy.load(model_path) + assert nlp.meta["name"] == valid_name + + +def test_model_manager_modify_metadata_unfixable_invalid_names(model_path): + model = ModelManager(model_path) + for unfixable_invalid_name in ["", "_", "ΓΌ"]: + model.metadata["name"] = unfixable_invalid_name + with pytest.raises(ValueError) as e: + model.save() + assert "invalid" in str(e.value).lower() + + +def test_model_manager_publish_no_token(model_path, monkeypatch): + monkeypatch.delenv("HUGGING_FACE_TOKEN", raising=False) + model = ModelManager(model_path) + with pytest.raises(ValueError) as e: + model.publish() + assert "token" in str(e.value).lower() + + +def test_model_manager_publish_invalid_token_env(model_path, monkeypatch): + monkeypatch.setenv("HUGGING_FACE_TOKEN", "invalid") + model = ModelManager(model_path) + with pytest.raises(ValueError) as e: + model.publish() + assert "token" in str(e.value).lower() + + +def test_model_manager_publish_invalid_token_arg(model_path): + model = ModelManager(model_path) + with pytest.raises(ValueError) as e: + model.publish(hugging_face_token="invalid") + assert "token" in str(e.value).lower() + + +def test_model_manager_publish_mock_push(model_path: Path, monkeypatch, tmp_path): + def mock_spacy_huggingface_hub_push(whl_path: Path): + whl_path.rename(tmp_path / whl_path.name) + return {} + + # monkey patch spacy_huggingface_hub.push() to just move the supplied wheel to a temporary path + monkeypatch.setattr(spacy_huggingface_hub, "push", mock_spacy_huggingface_hub_push) + + def do_nothing(*args: Any, **kwargs: Any) -> None: + return + + # monkey patch huggingface_hub.login() to do nothing + monkeypatch.setattr(huggingface_hub, "login", do_nothing) + + model = ModelManager(model_path) + # set name and version - these determine the name of 
the compiled wheel + model.metadata["name"] = "my_new_pipeline" + model.metadata["version"] = "1.2.3" + model.publish(hugging_face_token="abc123") + wheel_path = tmp_path / "de_my_new_pipeline-1.2.3-py3-none-any.whl" + assert wheel_path.is_file() diff --git a/notebooks/model_manager_test.ipynb b/notebooks/model_manager_test.ipynb new file mode 100644 index 0000000..abe8c8c --- /dev/null +++ b/notebooks/model_manager_test.ipynb @@ -0,0 +1,170 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "55e83a27", + "metadata": {}, + "source": [ + "# ModelManager example notebook" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7ce3ad0", + "metadata": {}, + "outputs": [], + "source": [ + "from moralization.data_manager import DataManager\n", + "from moralization.model_manager import ModelManager" + ] + }, + { + "cell_type": "markdown", + "id": "a0aa6775", + "metadata": {}, + "source": [ + "### Create & train a spacy model using DataManager\n", + "\n", + "This saves the trained model to the folder `test_model/output/model-last`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44031c8d", + "metadata": {}, + "outputs": [], + "source": [ + "data_manager = DataManager(\"../moralization/data/\")\n", + "data_manager.export_data_DocBin()\n", + "data_manager.spacy_train(working_dir=\"test_model\", config=\"../moralization/data/config.cfg\", n_epochs=20)" + ] + }, + { + "cell_type": "markdown", + "id": "a988aa6b", + "metadata": {}, + "source": [ + "### Import spacy model using ModelManager" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ec9f981", + "metadata": {}, + "outputs": [], + "source": [ + "model = ModelManager(\"test_model/output/model-best\")" + ] + }, + { + "cell_type": "markdown", + "id": "e6cb68cb", + "metadata": {}, + "source": [ + "### Set metadata\n", + "\n", + "- Initially empty apart from default `pipeline` name and `0.0.0` version\n", + "- This metadata will be used to generate the Model
Card on hugging face" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3769b760", + "metadata": {}, + "outputs": [], + "source": [ + "model.metadata" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9701fcdb", + "metadata": {}, + "outputs": [], + "source": [ + "model.metadata[\"name\"] = \"test_pipeline\"\n", + "model.metadata[\"version\"] = \"0.1.0\"\n", + "model.metadata[\"description\"] = \"A test pipeline for ModelManager testing purposes\"\n", + "model.metadata[\"author\"] = \"Liam Keegan\"\n", + "model.metadata[\"email\"] = \"liam@keegan.ch\"\n", + "model.metadata[\"license\"] = \"MIT\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "961d1d2f", + "metadata": {}, + "outputs": [], + "source": [ + "model.metadata" + ] + }, + { + "cell_type": "markdown", + "id": "f1c99e30", + "metadata": {}, + "source": [ + "### Publish model to hugging-face\n", + "\n", + "- This requires a hugging-face [User Access Token](https://huggingface.co/docs/hub/security-tokens)\n", + "- You can export this to the `HUGGING_FACE_TOKEN` environment variable and just call `publish()`\n", + "- Or directly pass the token with `publish(hugging_face_token=\"abc123\")`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18b270dd", + "metadata": {}, + "outputs": [], + "source": [ + "urls = model.publish()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f36af3f", + "metadata": {}, + "outputs": [], + "source": [ + "urls" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7de852ba", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + 
"pygments_lexer": "ipython3", + "version": "3.10.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pyproject.toml b/pyproject.toml index d8a150b..7083cdd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,12 +32,14 @@ dependencies = [ "openpyxl", "pytest", "pytest-cov", - "ipywidgets", + "ipywidgets<8.0.5", "spacy", + "spacy-huggingface-hub", "jupyter", "classy_classification", "spacy-span-analyzer", - "textacy" + "textacy", + "wheel" ] [tool.setuptools.package-data]