Better async error message #1080

Merged 3 commits on Oct 10, 2024
12 changes: 12 additions & 0 deletions python/langsmith/evaluation/_runner.py
@@ -1,4 +1,4 @@
"""V2 Evaluation Interface."""

Check notice on line 1 in python/langsmith/evaluation/_runner.py (GitHub Actions / benchmark)

Benchmark results:

create_5_000_run_trees: Mean +- std dev: 581 ms +- 50 ms
create_10_000_run_trees: Mean +- std dev: 1.14 sec +- 0.06 sec
create_20_000_run_trees: Mean +- std dev: 1.14 sec +- 0.06 sec
dumps_class_nested_py_branch_and_leaf_200x400: Mean +- std dev: 764 us +- 11 us
dumps_class_nested_py_leaf_50x100: Mean +- std dev: 27.3 ms +- 0.7 ms
dumps_class_nested_py_leaf_100x200: Mean +- std dev: 112 ms +- 3 ms
dumps_dataclass_nested_50x100: Mean +- std dev: 27.4 ms +- 0.4 ms
dumps_pydantic_nested_50x100: Mean +- std dev: 58.8 ms +- 6.9 ms
dumps_pydanticv1_nested_50x100: Mean +- std dev: 215 ms +- 31 ms

WARNING: the dumps_pydantic_nested_50x100 (std dev is 12% of the mean) and dumps_pydanticv1_nested_50x100 (std dev is 14% of the mean) results may be unstable. Try rerunning the benchmark with more runs, values and/or loops, and run 'python -m pyperf system tune' to reduce system jitter. Use pyperf stats, pyperf dump and pyperf hist to analyze results; use --quiet to hide these warnings.

Check notice on line 1 in python/langsmith/evaluation/_runner.py (GitHub Actions / benchmark)

Comparison against main:

+-----------------------------------------------+----------+------------------------+
| Benchmark                                     | main     | changes                |
+===============================================+==========+========================+
| dumps_pydantic_nested_50x100                  | 63.7 ms  | 58.8 ms: 1.08x faster  |
+-----------------------------------------------+----------+------------------------+
| dumps_dataclass_nested_50x100                 | 27.9 ms  | 27.4 ms: 1.02x faster  |
+-----------------------------------------------+----------+------------------------+
| dumps_class_nested_py_branch_and_leaf_200x400 | 762 us   | 764 us: 1.00x slower   |
+-----------------------------------------------+----------+------------------------+
| create_10_000_run_trees                       | 1.12 sec | 1.14 sec: 1.02x slower |
+-----------------------------------------------+----------+------------------------+
| create_20_000_run_trees                       | 1.12 sec | 1.14 sec: 1.02x slower |
+-----------------------------------------------+----------+------------------------+
| Geometric mean                                | (ref)    | 1.01x faster           |
+-----------------------------------------------+----------+------------------------+

Benchmark hidden because not significant (4): dumps_pydanticv1_nested_50x100, dumps_class_nested_py_leaf_100x200, dumps_class_nested_py_leaf_50x100, create_5_000_run_trees

from __future__ import annotations

@@ -253,6 +253,18 @@
        ... ) # doctest: +ELLIPSIS
        View the evaluation results for experiment:...
    """ # noqa: E501
    if callable(target) and rh.is_async(target):
        raise ValueError(
            "Async functions are not supported by `evaluate`. "
            "Please use `aevaluate` instead:\n\n"
            "from langsmith import aevaluate\n\n"
            "await aevaluate(\n"
            "    async_target_function,\n"
            "    data=data,\n"
            "    evaluators=evaluators,\n"
            "    # ... other parameters\n"
            ")"
        )
    if experiment and experiment_prefix:
        raise ValueError(
            "Expected at most one of 'experiment' or 'experiment_prefix',"
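For illustration, here is a minimal sketch of the switch that the new error message points users toward. The names used (`my_async_target`, the dataset string, the empty evaluator list) are hypothetical placeholders, not part of this PR:

import asyncio

from langsmith import aevaluate


async def my_async_target(inputs: dict) -> dict:
    # Hypothetical async target; passing this to `evaluate` now raises the
    # ValueError added above.
    return {"answer": str(inputs)}


async def main() -> None:
    # `aevaluate` is the async counterpart of `evaluate` and accepts async targets.
    await aevaluate(
        my_async_target,
        data="my-dataset",  # hypothetical dataset name
        evaluators=[],      # evaluators omitted for brevity
    )


asyncio.run(main())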
32 changes: 32 additions & 0 deletions python/tests/unit_tests/evaluation/test_runner.py
@@ -1,6 +1,7 @@
"""Test the eval runner."""

import asyncio
import functools
import itertools
import json
import random
@@ -248,6 +249,37 @@ def score_value(run, example):
    assert not fake_request.should_fail


def test_evaluate_raises_for_async():
    async def my_func(inputs: dict):
        pass

    match = "Async functions are not supported by"
    with pytest.raises(ValueError, match=match):
        evaluate(my_func, data="foo")

    async def my_other_func(inputs: dict, other_val: int):
        pass

    with pytest.raises(ValueError, match=match):
        evaluate(functools.partial(my_other_func, other_val=3), data="foo")

    try:
        from langchain_core.runnables import RunnableLambda
    except ImportError:
        pytest.skip("langchain-core not installed.")

    @RunnableLambda
    def foo(inputs: dict):
        return "bar"

    with pytest.raises(ValueError, match=match):
        evaluate(foo.ainvoke, data="foo")
    if sys.version_info < (3, 10):
        return
    with pytest.raises(ValueError, match=match):
        evaluate(functools.partial(foo.ainvoke, inputs={"foo": "bar"}), data="foo")


@pytest.mark.skipif(sys.version_info < (3, 9), reason="requires python3.9 or higher")
@pytest.mark.parametrize("blocking", [False, True])
async def test_aevaluate_results(blocking: bool) -> None:
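As background, here is a rough sketch of the kind of async-callable detection that the `callable(target) and rh.is_async(target)` guard relies on, covering the cases the tests exercise (a plain async def, a functools.partial wrapper, and an async method such as RunnableLambda.ainvoke). This is an illustrative approximation, not the actual `rh.is_async` implementation used by langsmith:

import functools
import inspect


def looks_async(func) -> bool:
    """Best-effort detection of async callables (illustrative sketch only)."""
    # Unwrap functools.partial so partial(async_fn, ...) is still recognized.
    while isinstance(func, functools.partial):
        func = func.func
    if inspect.iscoroutinefunction(func):
        return True
    # Fall back to objects whose __call__ is itself a coroutine function.
    call = getattr(func, "__call__", None)
    return call is not None and inspect.iscoroutinefunction(call)

Note that the test above only exercises the `functools.partial(foo.ainvoke, ...)` case on Python 3.10 or newer, presumably because detecting a coroutine function through that particular wrapping is version-dependent.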