Evaluation: Fix the output_path parameter of evaluate API doesn't support relative path (Azure#38241)

* Fix output_path parameter doesn't support relative path

* add comments

* fix the test

* update

* minor update

* update
ninghu authored and allenkim0129 committed Nov 5, 2024
1 parent 9d2c7fc commit d9ea7a8
Showing 6 changed files with 33 additions and 11 deletions.
7 changes: 4 additions & 3 deletions sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -8,13 +8,14 @@
- The `parallel` parameter has been removed from composite evaluators: `QAEvaluator`, `ContentSafetyChatEvaluator`, and `ContentSafetyMultimodalEvaluator`. To control evaluator parallelism, you can now use the `_parallel` keyword argument, though please note that this private parameter may change in the future.
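Not part of the original changelog entry: an illustrative sketch of the replacement keyword. Assumptions are called out in the comments, namely the placeholder `model_config` values, the sample inputs, and the position of `_parallel` (the sketch assumes it is accepted where the removed `parallel` constructor parameter used to be).

```python
from azure.ai.evaluation import QAEvaluator

# Placeholder configuration; in real use this is your AzureOpenAIModelConfiguration.
model_config = {
    "azure_endpoint": "https://<your-endpoint>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

# Assumption: _parallel replaces the removed `parallel` constructor parameter.
# It is private and, as noted above, may change in the future.
qa_evaluator = QAEvaluator(model_config=model_config, _parallel=False)

result = qa_evaluator(
    query="What is the capital of France?",
    response="Paris is the capital of France.",
    context="France's capital city is Paris.",
    ground_truth="Paris",
)
```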

### Bugs Fixed
- Fixed an issue where the `output_path` parameter in the `evaluate` API did not support relative paths (see the sketch after this list).
- Outputs of adversarial simulators are of type `JsonLineList`, and the helper function `to_eval_qr_json_lines` now outputs context from both user and assistant turns, along with `category` if it exists in the conversation.
- Fixed an issue where the API token expired during long-running simulations, causing a "Forbidden" error. Users can now set the environment variable `AZURE_TOKEN_REFRESH_INTERVAL` to refresh the token more frequently, preventing expiration and ensuring continuous operation of the simulation.
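As a usage sketch of the `output_path` fix (illustrative, not taken from the repository): `output_path` may now be a path relative to the current working directory, either a file name or an existing directory. The data file name and evaluator mapping below are placeholders.

```python
from azure.ai.evaluation import evaluate, F1ScoreEvaluator

# "eval_data.jsonl" stands in for your own JSONL dataset containing the fields the
# chosen evaluator expects (for F1: response and ground_truth).
result = evaluate(
    data="eval_data.jsonl",
    evaluators={"f1_score": F1ScoreEvaluator()},
    # A bare relative file name now works; before this fix it failed because its empty
    # directory part ("") was treated as a non-existent output directory.
    output_path="eval_test_results.jsonl",
)

print(result["metrics"])
```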

### Other Changes
- Refined error messages for serviced-based evaluators and simulators.
- Introduced environment variable `AI_EVALS_DISABLE_EXPERIMENTAL_WARNING` to disable the warning message for experimental features.
- Changed the randomization pattern for `AdversarialSimulator` such that there is an almost equal number of Adversarial harm categories (e.g. Hate + Unfairness, Self-Harm, Violence, Sex) represented in the `AdversarialSimulator` outputs. Previously, for 200 `max_simulation_results` a user might see 140 results belonging to the 'Hate + Unfairness' category and 40 results belonging to the 'Self-Harm' category. Now, users will see 50 results for each of Hate + Unfairness, Self-Harm, Violence, and Sex.
- For the `DirectAttackSimulator`, the prompt templates used to generate simulated outputs for each Adversarial harm category will no longer be in a randomized order by default. To override this behavior, pass `randomize_order=True` when you call the `DirectAttackSimulator`, for example:
```python
adversarial_simulator = DirectAttackSimulator(azure_ai_project=azure_ai_project, credential=DefaultAzureCredential())
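# Illustrative continuation, not part of the original changelog: per the note above, pass
# randomize_order=True in the simulator call to restore randomized template ordering.
# The other argument names (scenario, target) and the callback are assumed placeholders.
outputs = asyncio.run(
    adversarial_simulator(
        scenario=scenario,        # e.g. an AdversarialScenario value
        target=target_callback,   # your async callback wrapping the target application
        randomize_order=True,
    )
)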
```

@@ -83,7 +84,7 @@ outputs = asyncio.run(custom_simulator(
- `SimilarityEvaluator`
- `RetrievalEvaluator`
- The following evaluators will now have a new key in their result output including LLM reasoning behind the score. The new key will follow the pattern "<metric_name>_reason". The reasoning is the result of a more detailed prompt template being used to generate the LLM response. Note that this requires the maximum number of tokens used to run these evaluators to be increased.

| Evaluator | New `max_token` for Generation |
| --- | --- |
| `CoherenceEvaluator` | 800 |
@@ -36,8 +36,12 @@ def __init__(self, client: Union[CodeClient, ProxyClient]) -> None:
        self.client = client
        self._is_batch_timeout_set_by_system = False
        self._is_otel_timeout_set_by_system = False
        self._original_cwd = os.getcwd()

    def __enter__(self) -> None:
        # Preserve current working directory, as PF may change it without restoring it afterward
        self._original_cwd = os.getcwd()

        if isinstance(self.client, CodeClient):
            ClientUserAgentUtil.append_user_agent(USER_AGENT)
            inject_openai_api()
@@ -64,6 +68,8 @@ def __exit__(
        exc_value: Optional[BaseException],
        exc_tb: Optional[types.TracebackType],
    ) -> None:
        os.chdir(self._original_cwd)

        if isinstance(self.client, CodeClient):
            recover_openai_api()

@@ -17,8 +17,12 @@ class TargetRunContext:

    def __init__(self, upload_snapshot: bool) -> None:
        self._upload_snapshot = upload_snapshot
        self._original_cwd = os.getcwd()

    def __enter__(self) -> None:
        # Preserve current working directory, as PF may change it without restoring it afterward
        self._original_cwd = os.getcwd()

        # Address "[WinError 32] The process cannot access the file" error,
        # caused by conflicts when the venv and target function are in the same directory.
        # Setting PF_FLOW_ENTRY_IN_TMP to true uploads only the flex entry file (flow.flex.yaml).
@@ -31,5 +35,7 @@ def __exit__(
        exc_value: Optional[BaseException],
        exc_tb: Optional[types.TracebackType],
    ) -> None:
        os.chdir(self._original_cwd)

        if not self._upload_snapshot:
            os.environ.pop(PF_FLOW_ENTRY_IN_TMP, None)
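Both context managers above apply the same pattern: capture the working directory on entry and `os.chdir` back to it on exit, so a promptflow run cannot permanently change the process's cwd. A minimal stand-alone sketch of that pattern (illustrative, not SDK code):

```python
import os
import tempfile
import types
from typing import Optional, Type


class PreserveCwd:
    """Illustrative context manager: restore the working directory when the block exits."""

    def __enter__(self) -> None:
        # Same idea as the SDK change: remember where we were before anything runs.
        self._original_cwd = os.getcwd()

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc_value: Optional[BaseException],
        exc_tb: Optional[types.TracebackType],
    ) -> None:
        # Whether or not the block raised, go back to the original directory.
        os.chdir(self._original_cwd)


with PreserveCwd():
    os.chdir(tempfile.gettempdir())  # simulate a dependency changing the working directory

print(os.getcwd())  # original directory again
```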
@@ -391,7 +391,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
            )

        output_dir = output_path if os.path.isdir(output_path) else os.path.dirname(output_path)
        if not os.path.exists(output_dir):
        if output_dir and not os.path.exists(output_dir):
            msg = f"The output directory '{output_dir}' does not exist. Please create the directory manually."
            raise EvaluationException(
                message=msg,
@@ -698,7 +698,7 @@ def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
    if output_dict:
        print("======= Combined Run Summary (Per Evaluator) =======\n")
        print(json.dumps(output_dict, indent=4))
        print("\n====================================================")
        print("\n====================================================\n")


def _evaluate( # pylint: disable=too-many-locals,too-many-statements
@@ -888,9 +888,9 @@ def eval_batch_run(
    result_df_dict = result_df.to_dict("records")
    result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url}  # type: ignore

    _print_summary(per_evaluator_results)

    if output_path:
        _write_output(output_path, result)

    _print_summary(per_evaluator_results)

    return result
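A small illustration (not SDK code) of why the `output_dir and ...` guard added in `_validate_and_load_data` matters: a bare relative file name has an empty directory part, and the old unconditional existence check rejected it.

```python
import os

# A bare relative file name has no directory component...
print(os.path.dirname("eval_test_results.jsonl"))  # ""
# ...and an empty string never "exists", which the old check mistook for a missing directory.
print(os.path.exists(""))  # False
# With an explicit directory part, the existence check still applies as before.
print(os.path.dirname(os.path.join("results", "eval_test_results.jsonl")))  # "results"
```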
@@ -211,6 +211,8 @@ def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
    with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
        json.dump(data_dict, f)

    print(f'Evaluation results saved to "{p.resolve()}".\n')


def _apply_column_mapping(
    source_df: pd.DataFrame, mapping_config: Optional[Dict[str, str]], inplace: bool = False
@@ -396,14 +396,18 @@ def test_evaluate_output_dir_not_exist(self, mock_model_config, questions_file):

assert "The output directory './not_exist_dir' does not exist." in exc_info.value.args[0]

@pytest.mark.parametrize("use_pf_client", [True, False])
def test_evaluate_output_path(self, evaluate_test_data_jsonl_file, tmpdir, use_pf_client):
output_path = os.path.join(tmpdir, "eval_test_results.jsonl")
@pytest.mark.parametrize("use_relative_path", [True, False])
def test_evaluate_output_path(self, evaluate_test_data_jsonl_file, tmpdir, use_relative_path):
# output_path is a file
if use_relative_path:
output_path = os.path.join(tmpdir, "eval_test_results.jsonl")
else:
output_path = "eval_test_results.jsonl"

        result = evaluate(
            data=evaluate_test_data_jsonl_file,
            evaluators={"g": F1ScoreEvaluator()},
            output_path=output_path,
            _use_pf_client=use_pf_client,
        )

        assert result is not None
@@ -415,6 +419,9 @@ def test_evaluate_output_path(self, evaluate_test_data_jsonl_file, tmpdir, use_p
        data_from_file = json.loads(content)
        assert result["metrics"] == data_from_file["metrics"]

        os.remove(output_path)

        # output_path is a directory
        result = evaluate(
            data=evaluate_test_data_jsonl_file,
            evaluators={"g": F1ScoreEvaluator()},
