diff --git a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py
index 7e088b47d..71dacd6c7 100644
--- a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py
+++ b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py
@@ -20,12 +20,30 @@
 @component
 class RagasEvaluator:
     """
-    A component that uses the Ragas framework to evaluate inputs against a specific metric.
-
-    The supported metrics are defined by `RagasMetric`.
-    Most of them require an OpenAI API key to be provided as an environment variable "OPENAI_API_KEY".
-    The inputs of the component are metric-dependent.
-    The output is a nested list of evaluation results where each inner list contains the results for a single input.
+    A component that uses the [Ragas framework](https://docs.ragas.io/) to evaluate
+    inputs against a specific metric. Supported metrics are defined by `RagasMetric`.
+
+    Usage example:
+    ```python
+    from haystack_integrations.components.evaluators.ragas import RagasEvaluator, RagasMetric
+
+    evaluator = RagasEvaluator(
+        metric=RagasMetric.CONTEXT_PRECISION,
+    )
+    output = evaluator.run(
+        questions=["Which is the most popular global sport?"],
+        contexts=[
+            [
+                "Football is undoubtedly the world's most popular sport with "
+                "major events like the FIFA World Cup and sports personalities "
+                "like Ronaldo and Messi, drawing a followership of more than 4 "
+                "billion people."
+            ]
+        ],
+        ground_truths=["Football is the most popular sport with around 4 billion followers worldwide"],
+    )
+    print(output["results"])
+    ```
     """

     # Wrapped for easy mocking.
@@ -44,6 +62,8 @@ def __init__(
             The metric to use for evaluation.
         :param metric_params:
             Parameters to pass to the metric's constructor.
+            Refer to the `RagasMetric` class for more details
+            on required parameters.
         """
         self.metric = metric if isinstance(metric, RagasMetric) else RagasMetric.from_str(metric)
         self.metric_params = metric_params or {}
@@ -56,9 +76,6 @@ def __init__(
         component.set_input_types(self, **expected_inputs)

     def _init_backend(self):
-        """
-        Initialize the Ragas backend and validate inputs.
-        """
         self._backend_callable = RagasEvaluator._invoke_evaluate

     def _init_metric(self):
@@ -74,29 +91,19 @@ def _invoke_evaluate(dataset: Dataset, metric: Metric) -> Result:
     @component.output_types(results=List[List[Dict[str, Any]]])
     def run(self, **inputs) -> Dict[str, Any]:
         """
-        Run the Ragas evaluator.
-
-        Example:
-        ```python
-        p = Pipeline()
-        evaluator = RagasEvaluator(
-            metric=RagasMetric.CONTEXT_PRECISION,
-        )
-        p.add_component("evaluator", evaluator)
-
-        results = p.run({"evaluator": {"questions": QUESTIONS, "contexts": CONTEXTS, "ground_truths": GROUND_TRUTHS}})
-        ```
+        Run the Ragas evaluator on the provided inputs.

         :param inputs:
             The inputs to evaluate. These are determined by the
-            metric being calculated. See :class:`RagasMetric` for more
+            metric being calculated. See `RagasMetric` for more
             information.
         :returns:
-            A nested list of metric results. Each input can have one or more
+            A dictionary with a single `results` entry that contains
+            a nested list of metric results. Each input can have one or more
             results, depending on the metric. Each result is a dictionary
             containing the following keys and values:
-                * `name` - The name of the metric.
-                * `score` - The score of the metric.
+            - `name` - The name of the metric.
+            - `score` - The score of the metric.
         """
         InputConverters.validate_input_parameters(self.metric, self.descriptor.input_parameters, inputs)
         converted_inputs: List[Dict[str, str]] = list(self.descriptor.input_converter(**inputs))  # type: ignore
@@ -113,7 +120,12 @@ def run(self, **inputs) -> Dict[str, Any]:

     def to_dict(self) -> Dict[str, Any]:
         """
-        Serialize this component to a dictionary.
+        Serializes the component to a dictionary.
+
+        :returns:
+            Dictionary with serialized data.
+        :raises DeserializationError:
+            If the component cannot be serialized.
         """

         def check_serializable(obj: Any):
@@ -136,9 +148,11 @@ def check_serializable(obj: Any):
     @classmethod
     def from_dict(cls, data: Dict[str, Any]) -> "RagasEvaluator":
         """
-        Deserialize a component from a dictionary.
+        Deserializes the component from a dictionary.

         :param data:
-            The dictionary to deserialize from.
+            Dictionary to deserialize from.
+        :returns:
+            Deserialized component.
         """
         return default_from_dict(cls, data)
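Note for reviewers: to make the `to_dict`/`from_dict` docstrings above concrete, here is a minimal sketch of the intended round trip. The parameter values are illustrative only, and the exact serialized field layout is assumed to follow Haystack's `default_to_dict`/`default_from_dict` helpers used by this component:

```python
from haystack_integrations.components.evaluators.ragas import RagasEvaluator, RagasMetric

# Illustrative: ANSWER_SIMILARITY accepts the documented `threshold` parameter.
evaluator = RagasEvaluator(
    metric=RagasMetric.ANSWER_SIMILARITY,
    metric_params={"threshold": 0.7},
)

# Round-trip through the dictionary representation; `to_dict` raises if a
# metric parameter is not serializable (see `check_serializable` above).
data = evaluator.to_dict()
restored = RagasEvaluator.from_dict(data)
assert restored.metric == RagasMetric.ANSWER_SIMILARITY
assert restored.metric_params == {"threshold": 0.7}
```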
""" InputConverters.validate_input_parameters(self.metric, self.descriptor.input_parameters, inputs) converted_inputs: List[Dict[str, str]] = list(self.descriptor.input_converter(**inputs)) # type: ignore @@ -113,7 +120,12 @@ def run(self, **inputs) -> Dict[str, Any]: def to_dict(self) -> Dict[str, Any]: """ - Serialize this component to a dictionary. + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. + :raises DeserializationError: + If the component cannot be serialized. """ def check_serializable(obj: Any): @@ -136,9 +148,11 @@ def check_serializable(obj: Any): @classmethod def from_dict(cls, data: Dict[str, Any]) -> "RagasEvaluator": """ - Deserialize a component from a dictionary. + Deserializes the component from a dictionary. :param data: - The dictionary to deserialize from. + Dictionary to deserialize from. + :returns: + Deserialized component. """ return default_from_dict(cls, data) diff --git a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/metrics.py b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/metrics.py index 8d1f53593..72f3e8a3b 100644 --- a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/metrics.py +++ b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/metrics.py @@ -50,40 +50,44 @@ class RagasMetric(RagasBaseEnum): Metrics supported by Ragas. """ - #: Answer correctness - #: Inputs - `questions: List[str], responses: List[str], ground_truths: List[str]` + #: Answer correctness.\ + #: Inputs - `questions: List[str], responses: List[str], ground_truths: List[str]`\ + #: Parameters - `weights: Tuple[float, float]` ANSWER_CORRECTNESS = "answer_correctness" - #: Faithfulness + #: Faithfulness.\ #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]` FAITHFULNESS = "faithfulness" - #: Answer similarity - #: Inputs - `responses: List[str], ground_truths: List[str]` + #: Answer similarity.\ + #: Inputs - `responses: List[str], ground_truths: List[str]`\ + #: Parameters - `threshold: float` ANSWER_SIMILARITY = "answer_similarity" - #: Context precision + #: Context precision.\ #: Inputs - `questions: List[str], contexts: List[List[str]], ground_truths: List[str]` CONTEXT_PRECISION = "context_precision" - #: Context utilization - #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]` + #: Context utilization. + #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]`\ CONTEXT_UTILIZATION = "context_utilization" - #: Context recall - #: Inputs - `questions: List[str], contexts: List[List[str]], ground_truths: List[str]` + #: Context recall. + #: Inputs - `questions: List[str], contexts: List[List[str]], ground_truths: List[str]`\ CONTEXT_RECALL = "context_recall" - #: Aspect critique - #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]` + #: Aspect critique. 
+    #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]`\
+    #: Parameters - `name: str, definition: str, strictness: int`
     ASPECT_CRITIQUE = "aspect_critique"
-    #: Context relevancy
+    #: Context relevancy.\
     #: Inputs - `questions: List[str], contexts: List[List[str]]`
     CONTEXT_RELEVANCY = "context_relevancy"
-    #: Answer relevancy
-    #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]`
+    #: Answer relevancy.\
+    #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]`\
+    #: Parameters - `strictness: int`
     ANSWER_RELEVANCY = "answer_relevancy"
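To make the newly documented `Parameters` lines concrete, a hedged sketch of configuring one of the parameterized metrics follows. The aspect name, definition, and strictness values are illustrative only, and actually running this requires `ragas` to be installed plus an `OPENAI_API_KEY` in the environment:

```python
from haystack_integrations.components.evaluators.ragas import RagasEvaluator, RagasMetric

# ASPECT_CRITIQUE takes all three documented parameters; values are illustrative.
evaluator = RagasEvaluator(
    metric=RagasMetric.ASPECT_CRITIQUE,
    metric_params={
        "name": "harmlessness",
        "definition": "Does the response avoid harmful or offensive content?",
        "strictness": 3,
    },
)
output = evaluator.run(
    questions=["Which is the most popular global sport?"],
    contexts=[["Football draws a followership of more than 4 billion people."]],
    responses=["Football is the most popular sport in the world."],
)
print(output["results"])  # e.g. [[{"name": "harmlessness", "score": 1}]]
```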