diff --git a/integrations/uptrain/src/haystack_integrations/components/evaluators/uptrain/evaluator.py b/integrations/uptrain/src/haystack_integrations/components/evaluators/uptrain/evaluator.py
index 3699d50f6..877706786 100644
--- a/integrations/uptrain/src/haystack_integrations/components/evaluators/uptrain/evaluator.py
+++ b/integrations/uptrain/src/haystack_integrations/components/evaluators/uptrain/evaluator.py
@@ -18,11 +18,33 @@
 @component
 class UpTrainEvaluator:
     """
-    A component that uses the UpTrain framework to evaluate inputs against a specific metric.
-
-    The supported metrics are defined by :class:`UpTrainMetric`. The inputs of the component
-    metric-dependent. The output is a nested list of evaluation results where each inner list
-    contains the results for a single input.
+    A component that uses the [UpTrain framework](https://docs.uptrain.ai/getting-started/introduction)
+    to evaluate inputs against a specific metric. Supported metrics are defined by `UpTrainMetric`.
+
+    Usage example:
+    ```python
+    from haystack_integrations.components.evaluators.uptrain import UpTrainEvaluator, UpTrainMetric
+    from haystack.utils import Secret
+
+    evaluator = UpTrainEvaluator(
+        metric=UpTrainMetric.FACTUAL_ACCURACY,
+        api="openai",
+        api_key=Secret.from_env_var("OPENAI_API_KEY"),
+    )
+    output = evaluator.run(
+        questions=["Which is the most popular global sport?"],
+        contexts=[
+            [
+                "Football is undoubtedly the world's most popular sport with "
+                "major events like the FIFA World Cup and sports personalities "
+                "like Ronaldo and Messi, drawing a followership of more than 4 "
+                "billion people."
+            ]
+        ],
+        responses=["Football is the most popular sport with around 4 billion " "followers worldwide"],
+    )
+    print(output["results"])
+    ```
     """
 
     _backend_metric: Union[Evals, ParametricEval]
@@ -44,15 +66,15 @@ def __init__(
             The metric to use for evaluation.
         :param metric_params:
             Parameters to pass to the metric's constructor.
+            Refer to the `UpTrainMetric` class for more details
+            on required parameters.
         :param api:
-            The API to use for evaluation.
-
-            Supported APIs: "openai", "uptrain".
+            The API to use for evaluation. Supported APIs:
+            `openai`, `uptrain`.
         :param api_key:
             The API key to use.
         :param api_params:
             Additional parameters to pass to the API client.
-
-            Required parameters for the UpTrain API: `project_name`.
         """
         self.metric = metric if isinstance(metric, UpTrainMetric) else UpTrainMetric.from_str(metric)
@@ -69,38 +91,20 @@ def __init__(
     @component.output_types(results=List[List[Dict[str, Any]]])
     def run(self, **inputs) -> Dict[str, Any]:
         """
-        Run the UpTrain evaluator.
-
-        Example:
-        ```python
-        pipeline = Pipeline()
-        evaluator = UpTrainEvaluator(
-            metric=UpTrainMetric.FACTUAL_ACCURACY,
-            api="openai",
-            api_key=Secret.from_env_var("OPENAI_API_KEY"),
-        )
-        pipeline.add_component("evaluator", evaluator)
-
-        # Each metric expects a specific set of parameters as input. Refer to the
-        # UpTrainMetric class' documentation for more details.
-        output = pipeline.run({"evaluator": {
-            "questions": ["question],
-            "contexts": [["context", "another context"]],
-            "responses": ["response"]
-        }})
-        ```
+        Run the UpTrain evaluator on the provided inputs.
 
         :param inputs:
             The inputs to evaluate. These are determined by the
             metric being calculated. See `UpTrainMetric` for more
             information.
         :returns:
-            A nested list of metric results. Each input can have one or more
+            A dictionary with a single `results` entry that contains
+            a nested list of metric results. Each input can have one or more
             results, depending on the metric. Each result is a dictionary
             containing the following keys and values:
-            * `name` - The name of the metric.
-            * `score` - The score of the metric.
-            * `explanation` - An optional explanation of the score.
+            - `name` - The name of the metric.
+            - `score` - The score of the metric.
+            - `explanation` - An optional explanation of the score.
         """
         # The backend requires random access to the data, so we can't stream it.
         InputConverters.validate_input_parameters(self.metric, self.descriptor.input_parameters, inputs)
@@ -125,7 +129,12 @@ def run(self, **inputs) -> Dict[str, Any]:
 
     def to_dict(self) -> Dict[str, Any]:
         """
-        Serialize this component to a dictionary.
+        Serializes the component to a dictionary.
+
+        :returns:
+            Dictionary with serialized data.
+        :raises DeserializationError:
+            If the component cannot be serialized.
         """
 
         def check_serializable(obj: Any):
@@ -151,18 +160,17 @@ def check_serializable(obj: Any):
     @classmethod
     def from_dict(cls, data: Dict[str, Any]) -> "UpTrainEvaluator":
         """
-        Deserialize a component from a dictionary.
+        Deserializes the component from a dictionary.
 
         :param data:
-            The dictionary to deserialize from.
+            Dictionary to deserialize from.
+        :returns:
+            Deserialized component.
         """
         deserialize_secrets_inplace(data["init_parameters"], ["api_key"])
         return default_from_dict(cls, data)
 
     def _init_backend(self):
-        """
-        Initialize the UpTrain backend.
-        """
         if isinstance(self.descriptor.backend, Evals):
             if self.metric_params is not None:
                 msg = (
diff --git a/integrations/uptrain/src/haystack_integrations/components/evaluators/uptrain/metrics.py b/integrations/uptrain/src/haystack_integrations/components/evaluators/uptrain/metrics.py
index 6f7854aee..a13843d4a 100644
--- a/integrations/uptrain/src/haystack_integrations/components/evaluators/uptrain/metrics.py
+++ b/integrations/uptrain/src/haystack_integrations/components/evaluators/uptrain/metrics.py
@@ -14,48 +14,51 @@ class UpTrainMetric(Enum):
     Metrics supported by UpTrain.
     """
 
-    #: Context relevance.
+    #: Context relevance.\
     #: Inputs - `questions: List[str], contexts: List[List[str]]`
     CONTEXT_RELEVANCE = "context_relevance"
 
-    #: Factual accuracy.
+    #: Factual accuracy.\
     #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]`
     FACTUAL_ACCURACY = "factual_accuracy"
 
-    #: Response relevance.
+    #: Response relevance.\
     #: Inputs - `questions: List[str], responses: List[str]`
     RESPONSE_RELEVANCE = "response_relevance"
 
-    #: Response completeness.
+    #: Response completeness.\
     #: Inputs - `questions: List[str], responses: List[str]`
     RESPONSE_COMPLETENESS = "response_completeness"
 
-    #: Response completeness with respect to context.
+    #: Response completeness with respect to context.\
     #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]`
     RESPONSE_COMPLETENESS_WRT_CONTEXT = "response_completeness_wrt_context"
 
-    #: Response consistency.
+    #: Response consistency.\
     #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]`
     RESPONSE_CONSISTENCY = "response_consistency"
 
-    #: Response conciseness.
+    #: Response conciseness.\
     #: Inputs - `questions: List[str], responses: List[str]`
     RESPONSE_CONCISENESS = "response_conciseness"
 
-    #: Language critique.
+    #: Language critique.\
     #: Inputs - `responses: List[str]`
     CRITIQUE_LANGUAGE = "critique_language"
 
-    #: Tone critique.
-    #: Inputs - `responses: List[str]`
+    #: Tone critique.\
+    #: Inputs - `responses: List[str]`\
+    #: Parameters - `llm_persona: str`
     CRITIQUE_TONE = "critique_tone"
 
-    #: Guideline adherence.
-    #: Inputs - `questions: List[str], responses: List[str]`
+    #: Guideline adherence.\
+    #: Inputs - `questions: List[str], responses: List[str]`\
+    #: Parameters - `guideline: str`, `guideline_name: str`, `response_schema: Optional[str]`
     GUIDELINE_ADHERENCE = "guideline_adherence"
 
-    #: Response matching.
-    #: Inputs - `responses: List[str], ground_truths: List[str]`
+    #: Response matching.\
+    #: Inputs - `responses: List[str], ground_truths: List[str]`\
+    #: Parameters - `method: str`
     RESPONSE_MATCHING = "response_matching"
 
     def __str__(self):
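
For context, a minimal sketch of how one of the parametric metrics documented above (e.g. `CRITIQUE_TONE`) might be configured: its constructor parameter is supplied through `metric_params`, as described in the `__init__` docstring in this change. The persona string and response text below are illustrative placeholders, not values taken from this diff.

```python
from haystack.utils import Secret
from haystack_integrations.components.evaluators.uptrain import UpTrainEvaluator, UpTrainMetric

# CRITIQUE_TONE takes a constructor parameter (`llm_persona`), passed via
# `metric_params`. The persona value below is an illustrative assumption.
evaluator = UpTrainEvaluator(
    metric=UpTrainMetric.CRITIQUE_TONE,
    metric_params={"llm_persona": "methodical teacher"},
    api="openai",
    api_key=Secret.from_env_var("OPENAI_API_KEY"),
)

# Per metrics.py above, CRITIQUE_TONE only expects `responses` as input.
output = evaluator.run(responses=["Football is the most popular sport."])
print(output["results"])
```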