diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
index 1940021861f5..312026361a5d 100644
--- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
+++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -8,13 +8,14 @@
 - The `parallel` parameter has been removed from composite evaluators: `QAEvaluator`, `ContentSafetyChatEvaluator`, and `ContentSafetyMultimodalEvaluator`. To control evaluator parallelism, you can now use the `_parallel` keyword argument, though please note that this private parameter may change in the future.
 
 ### Bugs Fixed
+- Fixed an issue where the `output_path` parameter in the `evaluate` API did not support relative path.
 - Output of adversarial simulators are of type `JsonLineList` and the helper function `to_eval_qr_json_lines` now outputs context from both user and assistant turns along with `category` if it exists in the conversation
-- Fixed an issue where during long-running simulations, API token expires causing "Forbidden" error. Instead, users can now set an environment variable `AZURE_TOKEN_REFRESH_INTERVAL` to refresh the token more frequently to prevent expiration and ensure continuous operation of the simulation. 
+- Fixed an issue where during long-running simulations, API token expires causing "Forbidden" error. Instead, users can now set an environment variable `AZURE_TOKEN_REFRESH_INTERVAL` to refresh the token more frequently to prevent expiration and ensure continuous operation of the simulation.
 
 ### Other Changes
 - Refined error messages for serviced-based evaluators and simulators.
 - Introduced environment variable `AI_EVALS_DISABLE_EXPERIMENTAL_WARNING` to disable the warning message for experimental features.
-- Changed the randomization pattern for `AdversarialSimulator` such that there is an almost equal number of Adversarial harm categories (e.g. Hate + Unfairness, Self-Harm, Violence, Sex) represented in the `AdversarialSimulator` outputs. Previously, for 200 `max_simulation_results` a user might see 140 results belonging to the 'Hate + Unfairness' category and 40 results belonging to the 'Self-Harm' category. Now, user will see 50 results for each of Hate + Unfairness, Self-Harm, Violence, and Sex. 
+- Changed the randomization pattern for `AdversarialSimulator` such that there is an almost equal number of Adversarial harm categories (e.g. Hate + Unfairness, Self-Harm, Violence, Sex) represented in the `AdversarialSimulator` outputs. Previously, for 200 `max_simulation_results` a user might see 140 results belonging to the 'Hate + Unfairness' category and 40 results belonging to the 'Self-Harm' category. Now, user will see 50 results for each of Hate + Unfairness, Self-Harm, Violence, and Sex.
 - For the `DirectAttackSimulator`, the prompt templates used to generate simulated outputs for each Adversarial harm category will no longer be in a randomized order by default. To override this behavior, pass `randomize_order=True` when you call the `DirectAttackSimulator`, for example:
 ```python
 adversarial_simulator = DirectAttackSimulator(azure_ai_project=azure_ai_project, credential=DefaultAzureCredential())
@@ -83,7 +84,7 @@ outputs = asyncio.run(custom_simulator(
   - `SimilarityEvaluator`
   - `RetrievalEvaluator`
 - The following evaluators will now have a new key in their result output including LLM reasoning behind the score. The new key will follow the pattern "<metric_name>_reason". The reasoning is the result of a more detailed prompt template being used to generate the LLM response. Note that this requires the maximum number of tokens used to run these evaluators to be increased.
- 
+
 | Evaluator | New `max_token` for Generation |
 | --- | --- |
 | `CoherenceEvaluator` | 800 |
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py
index 5eea27afbd8d..feb3e3b03d66 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py
@@ -36,8 +36,12 @@ def __init__(self, client: Union[CodeClient, ProxyClient]) -> None:
         self.client = client
         self._is_batch_timeout_set_by_system = False
         self._is_otel_timeout_set_by_system = False
+        self._original_cwd = os.getcwd()
 
     def __enter__(self) -> None:
+        # Preserve current working directory, as PF may change it without restoring it afterward
+        self._original_cwd = os.getcwd()
+
         if isinstance(self.client, CodeClient):
             ClientUserAgentUtil.append_user_agent(USER_AGENT)
             inject_openai_api()
@@ -64,6 +68,8 @@ def __exit__(
         exc_value: Optional[BaseException],
         exc_tb: Optional[types.TracebackType],
     ) -> None:
+        os.chdir(self._original_cwd)
+
         if isinstance(self.client, CodeClient):
             recover_openai_api()
 
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py
index 62a14aa75aa8..2dc843164552 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py
@@ -17,8 +17,12 @@ class TargetRunContext:
 
     def __init__(self, upload_snapshot: bool) -> None:
         self._upload_snapshot = upload_snapshot
+        self._original_cwd = os.getcwd()
 
     def __enter__(self) -> None:
+        # Preserve current working directory, as PF may change it without restoring it afterward
+        self._original_cwd = os.getcwd()
+
         # Address "[WinError 32] The process cannot access the file" error,
         # caused by conflicts when the venv and target function are in the same directory.
         # Setting PF_FLOW_ENTRY_IN_TMP to true uploads only the flex entry file (flow.flex.yaml).
@@ -31,5 +35,7 @@ def __exit__(
         exc_value: Optional[BaseException],
         exc_tb: Optional[types.TracebackType],
     ) -> None:
+        os.chdir(self._original_cwd)
+
         if not self._upload_snapshot:
             os.environ.pop(PF_FLOW_ENTRY_IN_TMP, None)
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
index 689ce162122a..5ae9ebca6548 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
@@ -391,7 +391,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
             )
 
         output_dir = output_path if os.path.isdir(output_path) else os.path.dirname(output_path)
-        if not os.path.exists(output_dir):
+        if output_dir and not os.path.exists(output_dir):
             msg = f"The output directory '{output_dir}' does not exist. Please create the directory manually."
             raise EvaluationException(
                 message=msg,
@@ -698,7 +698,7 @@ def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
     if output_dict:
         print("======= Combined Run Summary (Per Evaluator) =======\n")
         print(json.dumps(output_dict, indent=4))
-        print("\n====================================================")
+        print("\n====================================================\n")
 
 
 def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
@@ -888,9 +888,9 @@ def eval_batch_run(
     result_df_dict = result_df.to_dict("records")
     result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url}  # type: ignore
 
+    _print_summary(per_evaluator_results)
+
     if output_path:
         _write_output(output_path, result)
 
-    _print_summary(per_evaluator_results)
-
     return result
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_utils.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_utils.py
index 5e95eb904343..299685bf026c 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_utils.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_utils.py
@@ -211,6 +211,8 @@ def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
     with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
         json.dump(data_dict, f)
 
+    print(f'Evaluation results saved to "{p.resolve()}".\n')
+
 
 def _apply_column_mapping(
     source_df: pd.DataFrame, mapping_config: Optional[Dict[str, str]], inplace: bool = False
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
index ffd71a518c7c..9e26bf9a992b 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
@@ -396,14 +396,18 @@ def test_evaluate_output_dir_not_exist(self, mock_model_config, questions_file):
 
         assert "The output directory './not_exist_dir' does not exist." in exc_info.value.args[0]
 
-    @pytest.mark.parametrize("use_pf_client", [True, False])
-    def test_evaluate_output_path(self, evaluate_test_data_jsonl_file, tmpdir, use_pf_client):
-        output_path = os.path.join(tmpdir, "eval_test_results.jsonl")
+    @pytest.mark.parametrize("use_relative_path", [True, False])
+    def test_evaluate_output_path(self, evaluate_test_data_jsonl_file, tmpdir, use_relative_path):
+        # output_path is a file
+        if use_relative_path:
+            output_path = "eval_test_results.jsonl"
+        else:
+            output_path = os.path.join(tmpdir, "eval_test_results.jsonl")
+
         result = evaluate(
             data=evaluate_test_data_jsonl_file,
             evaluators={"g": F1ScoreEvaluator()},
             output_path=output_path,
-            _use_pf_client=use_pf_client,
         )
 
         assert result is not None
@@ -415,6 +419,9 @@ def test_evaluate_output_path(self, evaluate_test_data_jsonl_file, tmpdir, use_p
         data_from_file = json.loads(content)
         assert result["metrics"] == data_from_file["metrics"]
 
+        os.remove(output_path)
+
+        # output_path is a directory
        result = evaluate(
             data=evaluate_test_data_jsonl_file,
             evaluators={"g": F1ScoreEvaluator()},
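Note: both `EvalRunContext` and `TargetRunContext` apply the same working-directory fix shown in the hunks above: capture `os.getcwd()` on `__enter__` and `os.chdir()` back on `__exit__`, so a relative `output_path` still resolves correctly after promptflow changes the process's working directory. Below is a minimal, standalone sketch of that pattern; the class name `CwdPreservingContext` is illustrative only and not part of the SDK.

```python
import os
import types
from typing import Optional


class CwdPreservingContext:
    """Illustrative context manager mirroring the save/restore added in the diff above."""

    def __init__(self) -> None:
        self._original_cwd = os.getcwd()

    def __enter__(self) -> None:
        # Capture the working directory at entry; the wrapped work may chdir without restoring it.
        self._original_cwd = os.getcwd()

    def __exit__(
        self,
        exc_type: Optional[type],
        exc_value: Optional[BaseException],
        exc_tb: Optional[types.TracebackType],
    ) -> None:
        # Restore the working directory even if the body raised,
        # so later relative paths keep resolving against the original cwd.
        os.chdir(self._original_cwd)


if __name__ == "__main__":
    before = os.getcwd()
    with CwdPreservingContext():
        os.chdir("..")  # simulate a dependency changing the working directory
    assert os.getcwd() == before
```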