diff --git a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py
index 71dacd6c7..5c8613553 100644
--- a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py
+++ b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py
@@ -11,7 +11,6 @@
 from .metrics import (
     METRIC_DESCRIPTORS,
     InputConverters,
-    MetricParamsValidator,
     OutputConverters,
     RagasMetric,
 )
@@ -66,7 +65,7 @@ def __init__(
             on required parameters.
         """
         self.metric = metric if isinstance(metric, RagasMetric) else RagasMetric.from_str(metric)
-        self.metric_params = metric_params or {}
+        self.metric_params = metric_params
         self.descriptor = METRIC_DESCRIPTORS[self.metric]
 
         self._init_backend()
@@ -79,10 +78,24 @@ def _init_backend(self):
         self._backend_callable = RagasEvaluator._invoke_evaluate
 
     def _init_metric(self):
-        MetricParamsValidator.validate_metric_parameters(
-            self.metric, self.descriptor.init_parameters, self.metric_params
-        )
-        self._backend_metric = self.descriptor.backend(**self.metric_params)
+        if self.descriptor.init_parameters is not None:
+            if self.metric_params is None:
+                msg = f"Ragas metric '{self.metric}' expected init parameters but got none"
+                raise ValueError(msg)
+            elif not all(k in self.descriptor.init_parameters for k in self.metric_params.keys()):
+                msg = (
+                    f"Invalid init parameters for Ragas metric '{self.metric}'. "
+                    f"Expected: {self.descriptor.init_parameters}"
+                )
+                raise ValueError(msg)
+        elif self.metric_params is not None:
+            msg = (
+                f"Invalid init parameters for Ragas metric '{self.metric}'. "
+                f"None expected but {self.metric_params} given"
+            )
+            raise ValueError(msg)
+        metric_params = self.metric_params or {}
+        self._backend_metric = self.descriptor.backend(**metric_params)
 
     @staticmethod
     def _invoke_evaluate(dataset: Dataset, metric: Metric) -> Result:
diff --git a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/metrics.py b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/metrics.py
index 72f3e8a3b..ed807aa81 100644
--- a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/metrics.py
+++ b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/metrics.py
@@ -134,7 +134,7 @@ class MetricDescriptor:
     backend: Type[Metric]
     input_parameters: Dict[str, Type]
     input_converter: Callable[[Any], Iterable[Dict[str, str]]]
-    output_converter: Callable[[Result, RagasMetric, Dict[str, Any]], List[MetricResult]]
+    output_converter: Callable[[Result, RagasMetric, Optional[Dict[str, Any]]], List[MetricResult]]
     init_parameters: Optional[List[str]] = None
 
     @classmethod
@@ -143,7 +143,9 @@ def new(
         metric: RagasMetric,
         backend: Type[Metric],
         input_converter: Callable[[Any], Iterable[Dict[str, str]]],
-        output_converter: Optional[Callable[[Result, RagasMetric, Dict[str, Any]], List[MetricResult]]] = None,
+        output_converter: Optional[
+            Callable[[Result, RagasMetric, Optional[Dict[str, Any]]], List[MetricResult]]
+        ] = None,
         *,
         init_parameters: Optional[List[str]] = None,
     ) -> "MetricDescriptor":
@@ -166,24 +168,6 @@ def new(
         )
 
 
-class MetricParamsValidator:
-    """
-    Validates metric parameters.
-
-    Depending on the metric type, different metric parameters are allowed.
-    The validator functions are responsible for validating the parameters and raising an error if they are invalid.
-    """
-
-    @staticmethod
-    def validate_metric_parameters(metric: RagasMetric, allowed: List[str], received: Dict[str, Any]) -> None:
-        if not set(received).issubset(allowed):
-            msg = (
-                f"Invalid init parameters for Ragas metric '{metric}'. "
-                f"Allowed metric parameters {allowed} but got '{received}'"
-            )
-            raise ValueError(msg)
-
-
 class InputConverters:
     """
     Converters for input parameters.
@@ -292,12 +276,15 @@ def _extract_default_results(output: Result, metric_name: str) -> List[MetricRes
             raise ValueError(msg) from e
 
     @staticmethod
-    def default(output: Result, metric: RagasMetric, _: Dict) -> List[MetricResult]:
+    def default(output: Result, metric: RagasMetric, _: Optional[Dict]) -> List[MetricResult]:
         metric_name = metric.value
         return OutputConverters._extract_default_results(output, metric_name)
 
     @staticmethod
-    def aspect_critique(output: Result, _: RagasMetric, metric_params: Dict[str, Any]) -> List[MetricResult]:
+    def aspect_critique(output: Result, _: RagasMetric, metric_params: Optional[Dict[str, Any]]) -> List[MetricResult]:
+        if metric_params is None:
+            msg = "Aspect critique metric requires metric parameters"
+            raise ValueError(msg)
         metric_name = metric_params["name"]
         return OutputConverters._extract_default_results(output, metric_name)
 
@@ -307,55 +294,50 @@ def aspect_critique(output: Result, _: RagasMetric, metric_params: Dict[str, Any
         RagasMetric.ANSWER_CORRECTNESS,
         AnswerCorrectness,
         InputConverters.question_response_ground_truth,  # type: ignore
-        init_parameters=["name", "weights", "answer_similarity"],
+        init_parameters=["weights"],
     ),
     RagasMetric.FAITHFULNESS: MetricDescriptor.new(
         RagasMetric.FAITHFULNESS,
         Faithfulness,
         InputConverters.question_context_response,  # type: ignore
-        init_parameters=["name"],
     ),
     RagasMetric.ANSWER_SIMILARITY: MetricDescriptor.new(
         RagasMetric.ANSWER_SIMILARITY,
         AnswerSimilarity,
         InputConverters.response_ground_truth,  # type: ignore
-        init_parameters=["name", "model_name", "threshold"],
+        init_parameters=["threshold"],
     ),
     RagasMetric.CONTEXT_PRECISION: MetricDescriptor.new(
         RagasMetric.CONTEXT_PRECISION,
         ContextPrecision,
         InputConverters.question_context_ground_truth,  # type: ignore
-        init_parameters=["name"],
     ),
     RagasMetric.CONTEXT_UTILIZATION: MetricDescriptor.new(
         RagasMetric.CONTEXT_UTILIZATION,
         ContextUtilization,
         InputConverters.question_context_response,  # type: ignore
-        init_parameters=["name"],
     ),
     RagasMetric.CONTEXT_RECALL: MetricDescriptor.new(
         RagasMetric.CONTEXT_RECALL,
         ContextRecall,
         InputConverters.question_context_ground_truth,  # type: ignore
-        init_parameters=["name"],
    ),
     RagasMetric.ASPECT_CRITIQUE: MetricDescriptor.new(
         RagasMetric.ASPECT_CRITIQUE,
         AspectCritique,
         InputConverters.question_context_response,  # type: ignore
         OutputConverters.aspect_critique,
-        init_parameters=["name", "definition", "strictness", "llm"],
+        init_parameters=["name", "definition", "strictness"],
     ),
     RagasMetric.CONTEXT_RELEVANCY: MetricDescriptor.new(
         RagasMetric.CONTEXT_RELEVANCY,
         ContextRelevancy,
         InputConverters.question_context,  # type: ignore
-        init_parameters=["name"],
     ),
     RagasMetric.ANSWER_RELEVANCY: MetricDescriptor.new(
         RagasMetric.ANSWER_RELEVANCY,
         AnswerRelevancy,
         InputConverters.question_context_response,  # type: ignore
-        init_parameters=["name", "strictness", "embeddings"],
+        init_parameters=["strictness"],
     ),
 }
diff --git a/integrations/ragas/tests/test_evaluator.py b/integrations/ragas/tests/test_evaluator.py
index 09f8cdfcc..0decc96cd 100644
--- a/integrations/ragas/tests/test_evaluator.py
+++ b/integrations/ragas/tests/test_evaluator.py
@@ -5,11 +5,10 @@
 import pytest
 from datasets import Dataset
 from haystack import DeserializationError
+from haystack_integrations.components.evaluators.ragas import RagasEvaluator, RagasMetric
 from ragas.evaluation import Result
 from ragas.metrics.base import Metric
 
-from haystack_integrations.components.evaluators.ragas import RagasEvaluator, RagasMetric
-
 DEFAULT_QUESTIONS = [
     "Which is the most popular global sport?",
     "Who created the Python language?",
@@ -59,43 +58,58 @@ def evaluate(self, _, metric: Metric, **kwargs):
         return output_map[self.metric]
 
 
-def test_evaluator_metric_init_params():
+@pytest.mark.parametrize(
+    "metric, init_params",
+    [
+        (RagasMetric.ANSWER_CORRECTNESS, {"weights": [0.5, 0.5]}),
+        (RagasMetric.FAITHFULNESS, None),
+        (RagasMetric.ANSWER_SIMILARITY, {"threshold": 0.5}),
+        (RagasMetric.CONTEXT_PRECISION, None),
+        (RagasMetric.CONTEXT_UTILIZATION, None),
+        (RagasMetric.CONTEXT_RECALL, None),
+        (
+            RagasMetric.ASPECT_CRITIQUE,
+            {
+                "name": "harmfulness",
+                "definition": "Does the submission causes or has the potential to "
+                "cause harm to individuals, groups, or society at "
+                "large?",
+            },
+        ),
+        (RagasMetric.CONTEXT_RELEVANCY, None),
+        (RagasMetric.ANSWER_RELEVANCY, {"strictness": 2}),
+    ],
+)
+def test_evaluator_valid_metric_init_params(metric, init_params):
     eval = RagasEvaluator(
-        RagasMetric.ASPECT_CRITIQUE,
-        metric_params={
-            'name': 'harmfulness',
-            'definition': 'Does the submission causes or has the potential to cause '
-            'harm to individuals, groups, or society at large?',
-        },
+        metric,
+        metric_params=init_params,
     )
-    assert eval.metric_params == {
-        'definition': 'Does the submission causes or has the potential to cause harm to '
-        'individuals, groups, or society at large?',
-        'name': 'harmfulness',
-    }
+    assert eval.metric_params == init_params
 
-    with pytest.raises(ValueError, match="Expects a name"):
-        RagasEvaluator(RagasMetric.ASPECT_CRITIQUE, metric_params=None)
-
-    with pytest.raises(ValueError, match="Expects a name"):
-        RagasEvaluator(RagasMetric.ASPECT_CRITIQUE, metric_params={})
-
-    with pytest.raises(ValueError, match="Expects a name"):
+    msg = f"Invalid init parameters for Ragas metric '{metric}'. "
+    with pytest.raises(ValueError, match=msg):
         RagasEvaluator(
-            RagasMetric.ASPECT_CRITIQUE,
-            metric_params={"definition": "custom definition"},
+            metric,
+            metric_params={"invalid_param": "invalid_value"},
         )
 
-    with pytest.raises(ValueError, match="Expects definition"):
-        RagasEvaluator(
-            RagasMetric.ASPECT_CRITIQUE,
-            metric_params={"name": "custom name"},
-        )
 
-    with pytest.raises(ValueError, match="Invalid init parameters"):
+@pytest.mark.parametrize(
+    "metric",
+    [
+        RagasMetric.ANSWER_CORRECTNESS,
+        RagasMetric.ANSWER_SIMILARITY,
+        RagasMetric.ASPECT_CRITIQUE,
+        RagasMetric.ANSWER_RELEVANCY,
+    ],
+)
+def test_evaluator_fails_with_no_metric_init_params(metric):
+    msg = f"Ragas metric '{metric}' expected init parameters but got none"
+    with pytest.raises(ValueError, match=msg):
         RagasEvaluator(
-            RagasMetric.FAITHFULNESS,
-            metric_params={"check_numbers": True},
+            metric,
+            metric_params=None,
         )
 
 
@@ -103,10 +117,10 @@ def test_evaluator_serde():
     init_params = {
         "metric": RagasMetric.ASPECT_CRITIQUE,
         "metric_params": {
-            'name': 'harmfulness',
-            'definition': 'Does the submission causes or has the potential to '
-            'cause harm to individuals, groups, or society at '
-            'large?',
+            "name": "harmfulness",
+            "definition": "Does the submission causes or has the potential to "
+            "cause harm to individuals, groups, or society at "
+            "large?",
         },
     }
     eval = RagasEvaluator(**init_params)
@@ -126,9 +140,13 @@ def test_evaluator_serde():
 @pytest.mark.parametrize(
     "current_metric, inputs, params",
     [
-        (RagasMetric.ANSWER_CORRECTNESS, {"questions": [], "responses": [], "ground_truths": []}, None),
+        (
+            RagasMetric.ANSWER_CORRECTNESS,
+            {"questions": [], "responses": [], "ground_truths": []},
+            {"weights": [0.5, 0.5]},
+        ),
         (RagasMetric.FAITHFULNESS, {"questions": [], "contexts": [], "responses": []}, None),
-        (RagasMetric.ANSWER_SIMILARITY, {"responses": [], "ground_truths": []}, None),
+        (RagasMetric.ANSWER_SIMILARITY, {"responses": [], "ground_truths": []}, {"threshold": 0.5}),
         (RagasMetric.CONTEXT_PRECISION, {"questions": [], "contexts": [], "ground_truths": []}, None),
         (RagasMetric.CONTEXT_UTILIZATION, {"questions": [], "contexts": [], "responses": []}, None),
         (RagasMetric.CONTEXT_RECALL, {"questions": [], "contexts": [], "ground_truths": []}, None),
@@ -136,14 +154,14 @@ def test_evaluator_serde():
             RagasMetric.ASPECT_CRITIQUE,
             {"questions": [], "contexts": [], "responses": []},
             {
-                'name': 'harmfulness',
-                'definition': 'Does the submission causes or has the potential to '
-                'cause harm to individuals, groups, or society at '
-                'large?',
+                "name": "harmfulness",
+                "definition": "Does the submission causes or has the potential to "
+                "cause harm to individuals, groups, or society at "
+                "large?",
             },
         ),
         (RagasMetric.CONTEXT_RELEVANCY, {"questions": [], "contexts": []}, None),
-        (RagasMetric.ANSWER_RELEVANCY, {"questions": [], "contexts": [], "responses": []}, None),
+        (RagasMetric.ANSWER_RELEVANCY, {"questions": [], "contexts": [], "responses": []}, {"strictness": 2}),
     ],
 )
 def test_evaluator_valid_inputs(current_metric, inputs, params):
@@ -170,9 +188,9 @@
             RagasMetric.ANSWER_RELEVANCY,
             {"questions": [""], "responses": [], "contexts": []},
             "Mismatching counts ",
-            None,
+            {"strictness": 2},
         ),
-        (RagasMetric.ANSWER_RELEVANCY, {"responses": []}, "expected input parameter ", None),
+        (RagasMetric.ANSWER_RELEVANCY, {"responses": []}, "expected input parameter ", {"strictness": 2}),
     ],
 )
 def test_evaluator_invalid_inputs(current_metric, inputs, error_string, params):
@@ -195,7 +213,7 @@ def test_evaluator_invalid_inputs(current_metric, inputs, error_string, params):
             RagasMetric.ANSWER_CORRECTNESS,
             {"questions": ["q1"], "responses": ["r1"], "ground_truths": ["gt1"]},
             [[(None, 0.5)]],
-            None,
+            {"weights": [0.5, 0.5]},
         ),
         (
             RagasMetric.FAITHFULNESS,
@@ -203,7 +221,12 @@ def test_evaluator_invalid_inputs(current_metric, inputs, error_string, params):
             [[(None, 1.0)]],
             None,
         ),
-        (RagasMetric.ANSWER_SIMILARITY, {"responses": ["r3"], "ground_truths": ["gt3"]}, [[(None, 1.0)]], None),
+        (
+            RagasMetric.ANSWER_SIMILARITY,
+            {"responses": ["r3"], "ground_truths": ["gt3"]},
+            [[(None, 1.0)]],
+            {"threshold": 0.5},
+        ),
         (
             RagasMetric.CONTEXT_PRECISION,
             {"questions": ["q4"], "contexts": [["c4"]], "ground_truths": ["gt44"]},
@@ -227,10 +250,10 @@ def test_evaluator_invalid_inputs(current_metric, inputs, error_string, params):
             {"questions": ["q7"], "contexts": [["c7"]], "responses": ["r7"]},
             [[("harmfulness", 1.0)]],
             {
-                'name': 'harmfulness',
-                'definition': 'Does the submission causes or has the potential to '
-                'cause harm to individuals, groups, or society at '
-                'large?',
+                "name": "harmfulness",
+                "definition": "Does the submission causes or has the potential to "
+                "cause harm to individuals, groups, or society at "
+                "large?",
             },
         ),
         (
@@ -243,7 +266,7 @@ def test_evaluator_invalid_inputs(current_metric, inputs, error_string, params):
             RagasMetric.ANSWER_RELEVANCY,
             {"questions": ["q9"], "contexts": [["c9"]], "responses": ["r9"]},
             [[(None, 0.4)]],
-            None,
+            {"strictness": 2},
         ),
     ],
 )
@@ -277,14 +300,18 @@ def test_evaluator_outputs(current_metric, inputs, expected_outputs, metric_para
         (
             RagasMetric.ANSWER_CORRECTNESS,
             {"questions": DEFAULT_QUESTIONS, "responses": DEFAULT_RESPONSES, "ground_truths": DEFAULT_GROUND_TRUTHS},
-            None,
+            {"weights": [0.5, 0.5]},
         ),
         (
            RagasMetric.FAITHFULNESS,
             {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES},
             None,
         ),
-        (RagasMetric.ANSWER_SIMILARITY, {"responses": DEFAULT_QUESTIONS, "ground_truths": DEFAULT_GROUND_TRUTHS}, None),
+        (
+            RagasMetric.ANSWER_SIMILARITY,
+            {"responses": DEFAULT_QUESTIONS, "ground_truths": DEFAULT_GROUND_TRUTHS},
+            {"threshold": 0.5},
+        ),
         (
             RagasMetric.CONTEXT_PRECISION,
             {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "ground_truths": DEFAULT_GROUND_TRUTHS},
@@ -304,17 +331,17 @@ def test_evaluator_outputs(current_metric, inputs, expected_outputs, metric_para
             RagasMetric.ASPECT_CRITIQUE,
             {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES},
             {
-                'name': 'harmfulness',
-                'definition': 'Does the submission causes or has the potential to '
-                'cause harm to individuals, groups, or society at '
-                'large?',
+                "name": "harmfulness",
+                "definition": "Does the submission causes or has the potential to "
+                "cause harm to individuals, groups, or society at "
+                "large?",
             },
         ),
         (RagasMetric.CONTEXT_RELEVANCY, {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS}, None),
         (
             RagasMetric.ANSWER_RELEVANCY,
             {"questions": DEFAULT_QUESTIONS, "contexts": DEFAULT_CONTEXTS, "responses": DEFAULT_RESPONSES},
-            None,
+            {"strictness": 2},
         ),
     ],
 )
@@ -326,7 +353,7 @@ def test_integration_run(metric, inputs, metric_params):
     eval = RagasEvaluator(**init_params)
     output = eval.run(**inputs)
 
-    assert type(output) == dict
+    assert isinstance(output, dict)
     assert len(output) == 1
     assert "results" in output
     assert len(output["results"]) == len(next(iter(inputs.values())))
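
With MetricParamsValidator removed, _init_metric validates metric_params against the descriptor's init_parameters itself, so a missing or mismatched metric_params now surfaces as a ValueError when the component is constructed rather than when the backend metric is built. A minimal sketch of that behaviour (illustrative only; the metric choices and parameter values mirror the parametrized test cases above):

from haystack_integrations.components.evaluators.ragas import RagasEvaluator, RagasMetric

# A metric whose descriptor declares init_parameters now requires metric_params,
# and every key must be one of the declared parameter names.
evaluator = RagasEvaluator(
    metric=RagasMetric.ANSWER_RELEVANCY,
    metric_params={"strictness": 2},
)

# A metric without init_parameters rejects any metric_params
# ("None expected but ... given") instead of forwarding them to the backend.
try:
    RagasEvaluator(metric=RagasMetric.FAITHFULNESS, metric_params={"check_numbers": True})
except ValueError as exc:
    print(exc)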