Skip to content

Commit

Permalink
New debug benchmark visualwebarena_tiny (#271)
Browse files Browse the repository at this point in the history
  • Loading branch information
gasse authored Nov 21, 2024
1 parent 0bf9f47 commit 3b5881a
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,24 @@
),
task_metadata=task_metadata("webarena"),
),
# Debug-sized variant of the visualwebarena benchmark: four hand-picked tasks,
# each run with a single fixed seed. It reuses the "visualwebarena" action-set
# arguments, backend, and task metadata. The value is a zero-argument lambda,
# so the Benchmark object is only constructed when the entry is called.
"visualwebarena_tiny": lambda: Benchmark(
name="visualwebarena_tiny",
high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["visualwebarena"],
is_multi_tab=True,
# NOTE(review): presumably disabled because tasks share backend state across
# seeds — confirm against the full "visualwebarena" entry.
supports_parallel_seeds=False,
backends=["visualwebarena"],
env_args_list=make_env_args_list_from_fixed_seeds(
# Four task ids; the accompanying test expects exactly 4 env args for this
# benchmark (4 tasks x 1 seed).
task_list=[
"visualwebarena.228",
"visualwebarena.263",
"visualwebarena.550",
"visualwebarena.784",
],
max_steps=30,
fixed_seeds=[0],
),
task_metadata=task_metadata("visualwebarena"),
),
"visualwebarena": lambda: Benchmark(
name="visualwebarena",
high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["visualwebarena"],
Expand Down
1 change: 1 addition & 0 deletions tests/experiments/test_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def test_build_benchmarks():
"webarena": 812,
"webarena_tiny": 6,
"visualwebarena": 910,
"visualwebarena_tiny": 4,
"workarena_l1": 33 * 10,
"workarena_l2_agent_curriculum_eval": 235,
"workarena_l3_agent_curriculum_eval": 235,
Expand Down

0 comments on commit 3b5881a

Please sign in to comment.