Skip to content

Commit

Permalink
* ✨ Add a km.random_name resolver to enable auto-generated run names (#…
Browse files Browse the repository at this point in the history
…426)

* ✨ Add a km.random_name resolver to enable auto-generated names in configuration (#426)

* add template example, add tests, idempotency still fails

* add tests, idempotency still fails

* add doc, ignore idempotency test, remove unused argument

* add syntax with resolver in mlflow.yml

* fix typo in doc

* add changelog

* add nested key again in mlflow.yml

* fix typo in changelog
  • Loading branch information
Galileo-Galilei authored Feb 9, 2024
1 parent e886799 commit c98d2a3
Show file tree
Hide file tree
Showing 6 changed files with 117 additions and 2 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
### Fixed

- :bug: Add support for dataset factories in ``KedroPipelineModel`` ([#516, sebastiandro](https://github.com/Galileo-Galilei/kedro-mlflow/pull/516))
- :sparkles: Add a ``km.random_name`` resolver which enables using auto-generated names for kedro runs instead of the pipeline name in the ``mlflow.yml`` configuration file ([#426](https://github.com/Galileo-Galilei/kedro-mlflow/issues/426))


## [0.12.0] - 2023-12-19

Expand Down
11 changes: 11 additions & 0 deletions docs/source/04_experimentation_tracking/01_configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,17 @@ tracking:
nested: True # if `nested` is False, you won't be able to launch sub-runs inside your nodes
```
```{tip}
If you want to generate a random name for each run (like mlflow's default), you can use the built-in ``km.random_name`` resolver:

```yaml
tracking:
run:
name: ${km.random_name:} # don't forget the trailing ":" at the end !
```
```


- If you want to continue to log in an existing mlflow run, write its id in the `id` key.
- If you want to enable the creation of sub runs inside your nodes (for instance, for model comparison or hyperparameter tuning), set the `nested` key to `True`

Expand Down
6 changes: 6 additions & 0 deletions kedro_mlflow/config/resolvers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from mlflow.utils.name_utils import _generate_random_name


def resolve_random_name():
    """Return a random name produced by mlflow's name generator.

    The function itself takes no argument. When it is used as the
    ``km.random_name`` resolver in a configuration file, it still has to be
    referenced with a trailing colon (``${km.random_name:}``), see:
    https://github.com/omry/omegaconf/issues/1060
    """
    random_name = _generate_random_name()
    return random_name
8 changes: 8 additions & 0 deletions kedro_mlflow/framework/hooks/mlflow_hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,11 @@
from mlflow.models import infer_signature
from mlflow.tracking import MlflowClient
from mlflow.utils.validation import MAX_PARAM_VAL_LENGTH
from omegaconf import OmegaConf
from pydantic import __version__ as pydantic_version

from kedro_mlflow.config.kedro_mlflow_config import KedroMlflowConfig
from kedro_mlflow.config.resolvers import resolve_random_name
from kedro_mlflow.framework.hooks.utils import (
_assert_mlflow_enabled,
_flatten_dict,
Expand Down Expand Up @@ -60,6 +62,12 @@ def after_context_created(
context: The context that was created.
"""

LOGGER.info(r"Registering new custom resolver: 'km.random_name'")
if not OmegaConf.has_resolver("km.random_name"):
OmegaConf.register_new_resolver(
"km.random_name", resolve_random_name, use_cache=True
)

try:
if "mlflow" not in context.config_loader.config_patterns.keys():
context.config_loader.config_patterns.update(
Expand Down
3 changes: 1 addition & 2 deletions kedro_mlflow/template/project/mlflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,8 @@ tracking:

run:
id: null # if `id` is None, a new run will be created
name: null # if `name` is None, pipeline name will be used for the run name
name: null # if `name` is None, pipeline name will be used for the run name. You can use "${km.random_name:}" to generate a random name (mlflow's default)
nested: True # if `nested` is False, you won't be able to launch sub-runs inside your nodes

params:
dict_params:
flatten: False # if True, parameters which are dictionaries will be split into multiple parameters when logged in mlflow, one for each key.
Expand Down
89 changes: 89 additions & 0 deletions tests/config/test_resolvers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import re

import pytest
import yaml
from kedro.framework.session import KedroSession
from kedro.framework.startup import bootstrap_project
from mlflow.utils.name_utils import (
_GENERATOR_NOUNS,
_GENERATOR_PREDICATES,
)
from omegaconf import OmegaConf

from kedro_mlflow.config.resolvers import resolve_random_name


def _write_yaml(filepath, config):
    """Dump ``config`` as YAML and write the resulting text to ``filepath``."""
    filepath.write_text(yaml.dump(config))


def _is_mlflow_name(name: str) -> bool:
splitted_name = name.split("-")
flag1 = len(splitted_name) == 3 # noqa: PLR2004
flag2 = splitted_name[0] in _GENERATOR_PREDICATES
flag3 = splitted_name[1] in _GENERATOR_NOUNS
flag4 = re.search(pattern=r"^\d+$", string=splitted_name[2])
return all({flag1, flag2, flag3, flag4})


@pytest.fixture
def kedro_project_with_random_name(kedro_project):
    """Write a ``conf/local/mlflow.yml`` that uses the ``km.random_name`` resolver.

    The run name is set to ``${km.random_name:}`` so that tests can check how
    the resolver behaves inside a kedro project.

    Args:
        kedro_project: Path of the project (a pytest fixture from conftest).

    Returns:
        The same ``kedro_project`` path, once the configuration file is written.
    """
    # kedro_project is a pytest.fixture in conftest
    dict_config = {
        "server": {
            "mlflow_tracking_uri": "mlruns",
            "mlflow_registry_uri": None,
            "credentials": None,
            "request_header_provider": {
                "type": None,
                "pass_context": False,
                "init_kwargs": {},
            },
        },
        "tracking": {
            "disable_tracking": {"pipelines": ["my_disabled_pipeline"]},
            "experiment": {"name": "fake_package", "restore_if_deleted": True},
            "run": {"id": "123456789", "name": "${km.random_name:}", "nested": True},
            "params": {
                "dict_params": {
                    "flatten": True,
                    "recursive": False,
                    "sep": "-",
                },
                "long_params_strategy": "truncate",
            },
        },
        "ui": {"port": "5151", "host": "localhost"},
    }

    _write_yaml(kedro_project / "conf" / "local" / "mlflow.yml", dict_config)
    return kedro_project


def test_resolve_random_name_is_valid_mlflow_name():
    """The resolver output must pass the ``_is_mlflow_name`` check."""
    assert _is_mlflow_name(resolve_random_name())


def test_resolve_random_name_is_registered(kedro_project_with_random_name):
    """Loading the context must leave the ``km.random_name`` resolver registered."""
    project_path = kedro_project_with_random_name
    bootstrap_project(project_path)
    with KedroSession.create(project_path=project_path) as session:
        # Only the loading matters here; the returned context is not needed.
        session.load_context()
        assert OmegaConf.has_resolver("km.random_name")


def test_resolve_random_name_is_called_in_project(kedro_project_with_random_name):
    """The run name read from the project config must be a resolved mlflow-style name."""
    project_path = kedro_project_with_random_name
    bootstrap_project(project_path)
    with KedroSession.create(project_path=project_path) as session:
        run_name = session.load_context().mlflow.tracking.run.name
        assert _is_mlflow_name(run_name)


@pytest.mark.skip(reason="kedro 0.19.2 does not take use_cache into account")
def test_resolve_random_name_is_idempotent(kedro_project_with_random_name):
    """Reading the run name twice from the config loader must give the same value."""
    project_path = kedro_project_with_random_name
    bootstrap_project(project_path)
    with KedroSession.create(project_path=project_path) as session:
        context = session.load_context()
        first_name = context.config_loader["mlflow"]["tracking"]["run"]["name"]
        second_name = context.config_loader["mlflow"]["tracking"]["run"]["name"]
        # when read twice, the names would be different if there were no
        # use_cache, because the resolver is random
        assert first_name == second_name

0 comments on commit c98d2a3

Please sign in to comment.