Add metric_metadata to pre-built genai metrics (mlflow#12662)
Signed-off-by: Chengzu Ou <[email protected]>
freemso authored Jul 15, 2024
1 parent f4b7ed8 commit 99933f8
Showing 2 changed files with 41 additions and 2 deletions.
27 changes: 26 additions & 1 deletion mlflow/metrics/genai/metric_definitions.py
@@ -1,4 +1,4 @@
from typing import List, Optional
from typing import Any, Dict, List, Optional

from mlflow.exceptions import MlflowException
from mlflow.metrics.genai.base import EvaluationExample
@@ -15,6 +15,7 @@ def answer_similarity(
model: Optional[str] = None,
metric_version: Optional[str] = None,
examples: Optional[List[EvaluationExample]] = None,
metric_metadata: Optional[Dict[str, Any]] = None,
) -> EvaluationMetric:
"""
This function will create a genai metric used to evaluate the answer similarity of an LLM
@@ -37,6 +38,9 @@ def answer_similarity(
examples: (Optional) Provide a list of examples to help the judge model evaluate the
answer similarity. It is highly recommended to add examples to be used as a reference to
evaluate the new results.
metric_metadata: (Optional) Dictionary of metadata to be attached to the
EvaluationMetric object. Useful for model evaluators that require additional
information to determine how to evaluate this metric.
Returns:
A metric object
@@ -75,6 +79,7 @@ def answer_similarity(
parameters=answer_similarity_class_module.parameters,
aggregations=["mean", "variance", "p90"],
greater_is_better=True,
metric_metadata=metric_metadata,
)


@@ -83,6 +88,7 @@ def answer_correctness(
model: Optional[str] = None,
metric_version: Optional[str] = None,
examples: Optional[List[EvaluationExample]] = None,
metric_metadata: Optional[Dict[str, Any]] = None,
) -> EvaluationMetric:
"""
This function will create a genai metric used to evaluate the answer correctness of an LLM
@@ -105,6 +111,9 @@ def answer_correctness(
examples: Provide a list of examples to help the judge model evaluate the
answer correctness. It is highly recommended to add examples to be used as a reference
to evaluate the new results.
metric_metadata: (Optional) Dictionary of metadata to be attached to the
EvaluationMetric object. Useful for model evaluators that require additional
information to determine how to evaluate this metric.
Returns:
A metric object
@@ -142,6 +151,7 @@ def answer_correctness(
parameters=answer_correctness_class_module.parameters,
aggregations=["mean", "variance", "p90"],
greater_is_better=True,
metric_metadata=metric_metadata,
)


@@ -150,6 +160,7 @@ def faithfulness(
model: Optional[str] = None,
metric_version: Optional[str] = _get_latest_metric_version(),
examples: Optional[List[EvaluationExample]] = None,
metric_metadata: Optional[Dict[str, Any]] = None,
) -> EvaluationMetric:
"""
This function will create a genai metric used to evaluate the faithfulness of an LLM using the
@@ -172,6 +183,9 @@ def faithfulness(
examples: Provide a list of examples to help the judge model evaluate the
faithfulness. It is highly recommended to add examples to be used as a reference to
evaluate the new results.
metric_metadata: (Optional) Dictionary of metadata to be attached to the
EvaluationMetric object. Useful for model evaluators that require additional
information to determine how to evaluate this metric.
Returns:
A metric object
@@ -208,6 +222,7 @@ def faithfulness(
parameters=faithfulness_class_module.parameters,
aggregations=["mean", "variance", "p90"],
greater_is_better=True,
metric_metadata=metric_metadata,
)


@@ -216,6 +231,7 @@ def answer_relevance(
model: Optional[str] = None,
metric_version: Optional[str] = _get_latest_metric_version(),
examples: Optional[List[EvaluationExample]] = None,
metric_metadata: Optional[Dict[str, Any]] = None,
) -> EvaluationMetric:
"""
This function will create a genai metric used to evaluate the answer relevance of an LLM
@@ -234,6 +250,9 @@ def answer_relevance(
examples: Provide a list of examples to help the judge model evaluate the
answer relevance. It is highly recommended to add examples to be used as a reference to
evaluate the new results.
metric_metadata: (Optional) Dictionary of metadata to be attached to the
EvaluationMetric object. Useful for model evaluators that require additional
information to determine how to evaluate this metric.
Returns:
A metric object
@@ -268,13 +287,15 @@ def answer_relevance(
parameters=answer_relevance_class_module.parameters,
aggregations=["mean", "variance", "p90"],
greater_is_better=True,
metric_metadata=metric_metadata,
)


def relevance(
model: Optional[str] = None,
metric_version: Optional[str] = None,
examples: Optional[List[EvaluationExample]] = None,
metric_metadata: Optional[Dict[str, Any]] = None,
) -> EvaluationMetric:
"""
This function will create a genai metric used to evaluate the relevance of an
@@ -297,6 +318,9 @@ def relevance(
examples: (Optional) Provide a list of examples to help the judge model evaluate the
relevance. It is highly recommended to add examples to be used as a reference to
evaluate the new results.
metric_metadata: (Optional) Dictionary of metadata to be attached to the
EvaluationMetric object. Useful for model evaluators that require additional
information to determine how to evaluate this metric.
Returns:
A metric object
@@ -334,4 +358,5 @@ def relevance(
parameters=relevance_class_module.parameters,
aggregations=["mean", "variance", "p90"],
greater_is_better=True,
metric_metadata=metric_metadata,
)
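
To illustrate what the change to metric_definitions.py enables, here is a minimal usage sketch (not part of the diff; the judge model URI and the metadata key are illustrative assumptions). The new metric_metadata argument is passed straight through to the returned EvaluationMetric, where evaluators can read it back from metric.metric_metadata.

# Minimal sketch, assuming the pre-built metrics are importable from
# mlflow.metrics.genai; "openai:/gpt-4" and the metadata key are placeholders.
from mlflow.metrics.genai import answer_similarity

# Attach arbitrary key/value metadata when building a pre-built genai metric.
similarity_metric = answer_similarity(
    model="openai:/gpt-4",
    metric_metadata={"assessment_type": "answer_similarity"},
)

# The metadata is carried on the returned EvaluationMetric object unchanged.
assert similarity_metric.metric_metadata == {"assessment_type": "answer_similarity"}
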
16 changes: 15 additions & 1 deletion tests/metrics/genai/test_genai_metrics.py
@@ -46,7 +46,6 @@
"justification": "{openai_justification1}"
}}"""


properly_formatted_openai_response2 = (
'{\n "score": 2,\n "justification": "The provided output gives a correct '
"and adequate explanation of what Apache Spark is, covering its main functions and "
@@ -1205,3 +1204,18 @@ def test_log_make_genai_metric_fn_args():
}

assert custom_metric.genai_metric_args == expected_genai_metric_args


@pytest.mark.parametrize(
"metric_fn",
[
answer_similarity,
answer_correctness,
faithfulness,
answer_relevance,
relevance,
],
)
def test_metric_metadata_on_prebuilt_genai_metrics(metric_fn):
metric = metric_fn(metric_metadata={"metadata_field": "metadata_value"})
assert metric.metric_metadata == {"metadata_field": "metadata_value"}
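
The parametrized test above asserts the same round-trip for all five pre-built metrics. As a hedged, consumer-side illustration (not part of this commit), an evaluator that receives one of these metrics could branch on the attached metadata; the key and fallback value below are hypothetical.

def resolve_assessment_type(metric) -> str:
    # metric is an MLflow EvaluationMetric; metric_metadata may be None when
    # the caller did not supply any metadata, so fall back to an empty dict.
    metadata = metric.metric_metadata or {}
    return metadata.get("assessment_type", "default")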
