Add metric_metadata to pre-built genai metrics (mlflow#12662)
Signed-off-by: Chengzu Ou <[email protected]>
freemso authored Jul 15, 2024
1 parent f4b7ed8 commit 99933f8
Showing 2 changed files with 41 additions and 2 deletions.
27 changes: 26 additions & 1 deletion mlflow/metrics/genai/metric_definitions.py
@@ -1,4 +1,4 @@
from typing import List, Optional
from typing import Any, Dict, List, Optional

from mlflow.exceptions import MlflowException
from mlflow.metrics.genai.base import EvaluationExample
@@ -15,6 +15,7 @@ def answer_similarity(
model: Optional[str] = None,
metric_version: Optional[str] = None,
examples: Optional[List[EvaluationExample]] = None,
metric_metadata: Optional[Dict[str, Any]] = None,
) -> EvaluationMetric:
"""
This function will create a genai metric used to evaluate the answer similarity of an LLM
@@ -37,6 +38,9 @@ def answer_similarity(
examples: (Optional) Provide a list of examples to help the judge model evaluate the
answer similarity. It is highly recommended to add examples to be used as a reference to
evaluate the new results.
metric_metadata: (Optional) Dictionary of metadata to be attached to the
EvaluationMetric object. Useful for model evaluators that require additional
information to determine how to evaluate this metric.
Returns:
A metric object
@@ -75,6 +79,7 @@ def answer_similarity(
parameters=answer_similarity_class_module.parameters,
aggregations=["mean", "variance", "p90"],
greater_is_better=True,
metric_metadata=metric_metadata,
)


@@ -83,6 +88,7 @@ def answer_correctness(
model: Optional[str] = None,
metric_version: Optional[str] = None,
examples: Optional[List[EvaluationExample]] = None,
metric_metadata: Optional[Dict[str, Any]] = None,
) -> EvaluationMetric:
"""
This function will create a genai metric used to evaluate the answer correctness of an LLM
@@ -105,6 +111,9 @@ def answer_correctness(
examples: Provide a list of examples to help the judge model evaluate the
answer correctness. It is highly recommended to add examples to be used as a reference
to evaluate the new results.
metric_metadata: (Optional) Dictionary of metadata to be attached to the
EvaluationMetric object. Useful for model evaluators that require additional
information to determine how to evaluate this metric.
Returns:
A metric object
@@ -142,6 +151,7 @@ def answer_correctness(
parameters=answer_correctness_class_module.parameters,
aggregations=["mean", "variance", "p90"],
greater_is_better=True,
metric_metadata=metric_metadata,
)


@@ -150,6 +160,7 @@ def faithfulness(
model: Optional[str] = None,
metric_version: Optional[str] = _get_latest_metric_version(),
examples: Optional[List[EvaluationExample]] = None,
metric_metadata: Optional[Dict[str, Any]] = None,
) -> EvaluationMetric:
"""
This function will create a genai metric used to evaluate the faithfulness of an LLM using the
@@ -172,6 +183,9 @@ def faithfulness(
examples: Provide a list of examples to help the judge model evaluate the
faithfulness. It is highly recommended to add examples to be used as a reference to
evaluate the new results.
metric_metadata: (Optional) Dictionary of metadata to be attached to the
EvaluationMetric object. Useful for model evaluators that require additional
information to determine how to evaluate this metric.
Returns:
A metric object
@@ -208,6 +222,7 @@ def faithfulness(
parameters=faithfulness_class_module.parameters,
aggregations=["mean", "variance", "p90"],
greater_is_better=True,
metric_metadata=metric_metadata,
)


@@ -216,6 +231,7 @@ def answer_relevance(
model: Optional[str] = None,
metric_version: Optional[str] = _get_latest_metric_version(),
examples: Optional[List[EvaluationExample]] = None,
metric_metadata: Optional[Dict[str, Any]] = None,
) -> EvaluationMetric:
"""
This function will create a genai metric used to evaluate the answer relevance of an LLM
@@ -234,6 +250,9 @@ def answer_relevance(
examples: Provide a list of examples to help the judge model evaluate the
answer relevance. It is highly recommended to add examples to be used as a reference to
evaluate the new results.
metric_metadata: (Optional) Dictionary of metadata to be attached to the
EvaluationMetric object. Useful for model evaluators that require additional
information to determine how to evaluate this metric.
Returns:
A metric object
@@ -268,13 +287,15 @@ def answer_relevance(
parameters=answer_relevance_class_module.parameters,
aggregations=["mean", "variance", "p90"],
greater_is_better=True,
metric_metadata=metric_metadata,
)


def relevance(
model: Optional[str] = None,
metric_version: Optional[str] = None,
examples: Optional[List[EvaluationExample]] = None,
metric_metadata: Optional[Dict[str, Any]] = None,
) -> EvaluationMetric:
"""
This function will create a genai metric used to evaluate the relevance of an
@@ -297,6 +318,9 @@ def relevance(
examples: (Optional) Provide a list of examples to help the judge model evaluate the
relevance. It is highly recommended to add examples to be used as a reference to
evaluate the new results.
metric_metadata: (Optional) Dictionary of metadata to be attached to the
EvaluationMetric object. Useful for model evaluators that require additional
information to determine how to evaluate this metric.
Returns:
A metric object
@@ -334,4 +358,5 @@ def relevance(
parameters=relevance_class_module.parameters,
aggregations=["mean", "variance", "p90"],
greater_is_better=True,
metric_metadata=metric_metadata,
)
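
To illustrate what the change to metric_definitions.py enables, here is a minimal usage sketch (not part of the diff; the judge model URI and the metadata key are illustrative assumptions). The new metric_metadata argument is passed straight through to the returned EvaluationMetric, where evaluators can read it back from metric.metric_metadata.

# Minimal sketch, assuming the pre-built metrics are importable from
# mlflow.metrics.genai; "openai:/gpt-4" and the metadata key are placeholders.
from mlflow.metrics.genai import answer_similarity

# Attach arbitrary key/value metadata when building a pre-built genai metric.
similarity_metric = answer_similarity(
    model="openai:/gpt-4",
    metric_metadata={"assessment_type": "answer_similarity"},
)

# The metadata is carried on the returned EvaluationMetric object unchanged.
assert similarity_metric.metric_metadata == {"assessment_type": "answer_similarity"}
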
16 changes: 15 additions & 1 deletion tests/metrics/genai/test_genai_metrics.py
@@ -46,7 +46,6 @@
"justification": "{openai_justification1}"
}}"""


properly_formatted_openai_response2 = (
'{\n "score": 2,\n "justification": "The provided output gives a correct '
"and adequate explanation of what Apache Spark is, covering its main functions and "
@@ -1205,3 +1204,18 @@ def test_log_make_genai_metric_fn_args():
}

assert custom_metric.genai_metric_args == expected_genai_metric_args


@pytest.mark.parametrize(
"metric_fn",
[
answer_similarity,
answer_correctness,
faithfulness,
answer_relevance,
relevance,
],
)
def test_metric_metadata_on_prebuilt_genai_metrics(metric_fn):
metric = metric_fn(metric_metadata={"metadata_field": "metadata_value"})
assert metric.metric_metadata == {"metadata_field": "metadata_value"}
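
The parametrized test above asserts the same round-trip for all five pre-built metrics. As a hedged, consumer-side illustration (not part of this commit), an evaluator that receives one of these metrics could branch on the attached metadata; the key and fallback value below are hypothetical.

def resolve_assessment_type(metric) -> str:
    # metric is an MLflow EvaluationMetric; metric_metadata may be None when
    # the caller did not supply any metadata, so fall back to an empty dict.
    metadata = metric.metric_metadata or {}
    return metadata.get("assessment_type", "default")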
