From 52737cb1b2b817d145233dd0d566d76473ff5714 Mon Sep 17 00:00:00 2001
From: xq-yin
Date: Wed, 26 Jun 2024 20:25:23 +0800
Subject: [PATCH] add more tests

Signed-off-by: xq-yin
---
 mlflow/models/evaluation/default_evaluator.py |  2 +-
 tests/evaluate/test_default_evaluator.py      | 33 ++++++++++++++++++-
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py
index 7c10c0cbd1707..0fc2ffcd8a513 100644
--- a/mlflow/models/evaluation/default_evaluator.py
+++ b/mlflow/models/evaluation/default_evaluator.py
@@ -1948,7 +1948,7 @@ def _evaluate(
                 self.extra_metrics.remove(extra_metric)
             # When the field is present, the metric is created from either make_genai_metric
             # or make_genai_metric_from_prompt. We will log the metric definition.
-            if hasattr(extra_metric, "custom_metric_config"):
+            if extra_metric.custom_metric_config is not None:
                 genai_custom_metrics.append(extra_metric.custom_metric_config)
         self._generate_model_predictions(compute_latency=compute_latency)
         self._handle_builtin_metrics_by_model_type()
diff --git a/tests/evaluate/test_default_evaluator.py b/tests/evaluate/test_default_evaluator.py
index 44fac3af5a700..ad1af6380d2f0 100644
--- a/tests/evaluate/test_default_evaluator.py
+++ b/tests/evaluate/test_default_evaluator.py
@@ -27,7 +27,9 @@
 from mlflow.exceptions import MlflowException
 from mlflow.metrics import (
     MetricValue,
+    flesch_kincaid_grade_level,
     make_metric,
+    toxicity,
 )
 from mlflow.metrics.genai import model_utils
 from mlflow.metrics.genai.base import EvaluationExample
@@ -4134,6 +4136,34 @@ def word_count_eval(predictions):
     )
 
 
+def test_do_not_log_built_in_metrics_as_artifacts():
+    with mlflow.start_run() as run:
+        model_info = mlflow.pyfunc.log_model(
+            artifact_path="model", python_model=language_model, input_example=["a"]
+        )
+        data = pd.DataFrame(
+            {
+                "inputs": ["words random", "This is a sentence."],
+                "ground_truth": ["words random", "This is a sentence."],
+            }
+        )
+        evaluate(
+            model_info.model_uri,
+            data,
+            targets="ground_truth",
+            predictions="answer",
+            model_type="question-answering",
+            evaluators="default",
+            extra_metrics=[
+                toxicity(),
+                flesch_kincaid_grade_level(),
+            ],
+        )
+    client = mlflow.MlflowClient()
+    artifacts = [a.path for a in client.list_artifacts(run.info.run_id)]
+    assert _GENAI_CUSTOM_METRICS_FILE_NAME not in artifacts
+
+
 def test_log_llm_custom_metrics_as_artifacts():
     with mlflow.start_run() as run:
         model_info = mlflow.pyfunc.log_model(
@@ -4180,7 +4210,8 @@ def test_log_llm_custom_metrics_as_artifacts():
     table = result.tables[_GENAI_CUSTOM_METRICS_FILE_NAME.split(".", 1)[0]]
     assert table.loc[0, "name"] == "answer_similarity"
     assert table.loc[0, "version"] == "v1"
-    assert table.loc[0, "metric_config"] is not None
     assert table.loc[1, "name"] == "custom llm judge"
     assert table.loc[1, "version"] is None
+    # TODO(xq-yin) ML-41356: Validate metric_config value once we implement the deserialization function
+    assert table.loc[0, "metric_config"] is not None
+    assert table.loc[1, "metric_config"] is not None
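
Note on the one-line default_evaluator.py change above: switching from hasattr() to an
`is not None` check only matters if custom_metric_config exists as a default-None attribute
on every metric, in which case hasattr() is true for built-in metrics too and they would be
logged as GenAI metric definitions; that is exactly what the new
test_do_not_log_built_in_metrics_as_artifacts guards against. A minimal sketch of the
distinction, assuming such a default-None field; the EvaluationMetric stand-in below is a
hypothetical simplification, not MLflow's real class:

# Hypothetical stand-in for MLflow's EvaluationMetric; assumes custom_metric_config
# is a regular field defaulting to None, as the patched `is not None` check implies.
from dataclasses import dataclass
from typing import Optional

@dataclass
class EvaluationMetric:
    name: str
    custom_metric_config: Optional[dict] = None  # present on every instance

builtin = EvaluationMetric(name="toxicity")
genai = EvaluationMetric(name="answer_similarity", custom_metric_config={"version": "v1"})

# Old check: hasattr() is True for BOTH metrics, so built-ins would also be
# collected into genai_custom_metrics and logged as an artifact.
assert hasattr(builtin, "custom_metric_config")
assert hasattr(genai, "custom_metric_config")

# New check: only metrics that actually carry a GenAI definition pass.
assert builtin.custom_metric_config is None
assert genai.custom_metric_config is not None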