Skip to content

Commit

Permalink
[Evaluation]: Log openhands version in eval output folder, instead of agent version (#5394)
Browse files Browse the repository at this point in the history
  • Loading branch information
xingyaoww authored Dec 4, 2024
1 parent 793e142 commit 9908e1b
Show file tree
Hide file tree
Showing 22 changed files with 65 additions and 68 deletions.
9 changes: 3 additions & 6 deletions evaluation/benchmarks/EDA/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi

get_agent_version
get_openhands_version

if [ -z "$DATASET" ]; then
echo "Dataset not specified, use default 'things'"
Expand All @@ -34,12 +34,9 @@ if [ -z "$OPENAI_API_KEY" ]; then
exit 1
fi

# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenHands
# We need to track the version of Agent in the evaluation to make sure results are comparable
AGENT_VERSION=v$(poetry run python -c "import openhands.agenthub; from openhands.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "DATASET: $DATASET"

Expand All @@ -51,7 +48,7 @@ COMMAND="poetry run python evaluation/benchmarks/EDA/run_infer.py \
--max-iterations 20 \
--OPENAI_API_KEY $OPENAI_API_KEY \
--eval-num-workers $NUM_WORKERS \
--eval-note ${AGENT_VERSION}_${DATASET}"
--eval-note ${OPENHANDS_VERSION}_${DATASET}"

if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
Expand Down
6 changes: 3 additions & 3 deletions evaluation/benchmarks/agent_bench/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,18 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

COMMAND="export PYTHONPATH=evaluation/benchmarks/agent_bench:\$PYTHONPATH && poetry run python evaluation/benchmarks/agent_bench/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 30 \
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION"
--eval-note $OPENHANDS_VERSION"

if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
Expand Down
6 changes: 3 additions & 3 deletions evaluation/benchmarks/aider_bench/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,13 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

EVAL_NOTE=$AGENT_VERSION
EVAL_NOTE=$OPENHANDS_VERSION

# Default to NOT use unit tests.
if [ -z "$USE_UNIT_TESTS" ]; then
Expand Down
6 changes: 3 additions & 3 deletions evaluation/benchmarks/biocoder/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "DATASET: $DATASET"

Expand All @@ -33,7 +33,7 @@ COMMAND="poetry run python evaluation/benchmarks/biocoder/run_infer.py \
--llm-config $MODEL_CONFIG \
--max-iterations 10 \
--eval-num-workers $NUM_WORKERS \
--eval-note ${AGENT_VERSION}_${DATASET}"
--eval-note ${OPENHANDS_VERSION}_${DATASET}"

if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
Expand Down
6 changes: 3 additions & 3 deletions evaluation/benchmarks/bird/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,18 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

COMMAND="poetry run python evaluation/benchmarks/bird/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 5 \
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION" \
--eval-note $OPENHANDS_VERSION" \

if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
Expand Down
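6 changes: 3 additions & 3 deletions evaluation/benchmarks/browsing_delegation/scripts/run_infer.sh
Original file line number Diff line number Diff line change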
Expand Up @@ -20,13 +20,13 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

EVAL_NOTE="$AGENT_VERSION"
EVAL_NOTE="$OPENHANDS_VERSION"

COMMAND="poetry run python evaluation/benchmarks/browsing_delegation/run_infer.py \
--agent-cls $AGENT \
Expand Down
6 changes: 3 additions & 3 deletions evaluation/benchmarks/commit0_bench/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,10 @@ echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "DATASET: $DATASET"
echo "HF SPLIT: $SPLIT"
Expand All @@ -75,7 +75,7 @@ if [ -z "$USE_HINT_TEXT" ]; then
export USE_HINT_TEXT=false
fi
echo "USE_HINT_TEXT: $USE_HINT_TEXT"
EVAL_NOTE="$AGENT_VERSION"
EVAL_NOTE="$OPENHANDS_VERSION"
# if not using Hint, add -no-hint to the eval note
if [ "$USE_HINT_TEXT" = false ]; then
EVAL_NOTE="$EVAL_NOTE-no-hint"
Expand Down
6 changes: 3 additions & 3 deletions evaluation/benchmarks/discoverybench/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

COMMAND="poetry run python evaluation/benchmarks/discoverybench/run_infer.py \
Expand All @@ -35,7 +35,7 @@ COMMAND="poetry run python evaluation/benchmarks/discoverybench/run_infer.py \
--max-iterations 10 \
--max-chars 10000000 \
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION"
--eval-note $OPENHANDS_VERSION"

if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
Expand Down
8 changes: 4 additions & 4 deletions evaluation/benchmarks/gaia/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,17 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi

get_agent_version
get_openhands_version

if [ -z "$LEVELS" ]; then
LEVELS="2023_level1"
echo "Levels not specified, use default $LEVELS"
fi

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "LEVELS: $LEVELS"

Expand All @@ -42,7 +42,7 @@ COMMAND="poetry run python ./evaluation/benchmarks/gaia/run_infer.py \
--level $LEVELS \
--data-split validation \
--eval-num-workers $NUM_WORKERS \
--eval-note ${AGENT_VERSION}_${LEVELS}"
--eval-note ${OPENHANDS_VERSION}_${LEVELS}"

if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
Expand Down
6 changes: 3 additions & 3 deletions evaluation/benchmarks/gorilla/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,15 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi

get_agent_version
get_openhands_version

if [ -z "$HUBS" ]; then
HUBS="hf,torch,tf"
echo "Hubs not specified, use default $HUBS"
fi

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "HUBS: $HUBS"

Expand All @@ -40,7 +40,7 @@ COMMAND="poetry run python evaluation/benchmarks/gorilla/run_infer.py \
--hubs $HUBS \
--data-split validation \
--eval-num-workers $NUM_WORKERS \
--eval-note ${AGENT_VERSION}_${LEVELS}"
--eval-note ${OPENHANDS_VERSION}_${LEVELS}"

if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
Expand Down
6 changes: 3 additions & 3 deletions evaluation/benchmarks/gpqa/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,10 @@ if [ -z "$DATA_SPLIT" ]; then
DATA_SPLIT="gpqa_diamond"
fi

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

COMMAND="poetry run python evaluation/benchmarks/gpqa/run_infer.py \
Expand All @@ -39,7 +39,7 @@ COMMAND="poetry run python evaluation/benchmarks/gpqa/run_infer.py \
--max-iterations 10 \
--eval-num-workers $NUM_WORKERS \
--data-split $DATA_SPLIT \
--eval-note $AGENT_VERSION"
--eval-note $OPENHANDS_VERSION"

if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
Expand Down
6 changes: 3 additions & 3 deletions evaluation/benchmarks/humanevalfix/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,18 +58,18 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

COMMAND="poetry run python evaluation/benchmarks/humanevalfix/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 10 \
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION"
--eval-note $OPENHANDS_VERSION"

if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
Expand Down
6 changes: 3 additions & 3 deletions evaluation/benchmarks/logic_reasoning/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,10 @@ if [ -z "$DATASET" ]; then
DATASET="ProofWriter"
fi

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

COMMAND="poetry run python evaluation/benchmarks/logic_reasoning/run_infer.py \
Expand All @@ -40,7 +40,7 @@ COMMAND="poetry run python evaluation/benchmarks/logic_reasoning/run_infer.py \
--dataset $DATASET \
--max-iterations 10 \
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION"
--eval-note $OPENHANDS_VERSION"

if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
Expand Down
6 changes: 3 additions & 3 deletions evaluation/benchmarks/miniwob/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,13 @@ if [ -z "$AGENT" ]; then
AGENT="BrowsingAgent"
fi

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

EVAL_NOTE="${AGENT_VERSION}_${NOTE}"
EVAL_NOTE="${OPENHANDS_VERSION}_${NOTE}"

COMMAND="export PYTHONPATH=evaluation/benchmarks/miniwob:\$PYTHONPATH && poetry run python evaluation/benchmarks/miniwob/run_infer.py \
--agent-cls $AGENT \
Expand Down
4 changes: 2 additions & 2 deletions evaluation/benchmarks/mint/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@ checkout_eval_branch
# Only 'CodeActAgent' is supported for MINT now
AGENT="CodeActAgent"

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"

export PYTHONPATH=$(pwd)

Expand Down
6 changes: 3 additions & 3 deletions evaluation/benchmarks/ml_bench/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,18 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

COMMAND="poetry run python evaluation/benchmarks/ml_bench/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 10 \
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION"
--eval-note $OPENHANDS_VERSION"

if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
Expand Down
6 changes: 3 additions & 3 deletions evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,10 @@ if [ -z "$USE_KNOWLEDGE" ]; then
USE_KNOWLEDGE=false
fi

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

COMMAND="poetry run python evaluation/benchmarks/scienceagentbench/run_infer.py \
Expand All @@ -38,7 +38,7 @@ COMMAND="poetry run python evaluation/benchmarks/scienceagentbench/run_infer.py
--use_knowledge $USE_KNOWLEDGE \
--max-iterations 30 \
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION" \
--eval-note $OPENHANDS_VERSION" \

if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
Expand Down
6 changes: 3 additions & 3 deletions evaluation/benchmarks/swe_bench/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,10 @@ echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "DATASET: $DATASET"
echo "SPLIT: $SPLIT"
Expand All @@ -68,7 +68,7 @@ if [ -z "$USE_HINT_TEXT" ]; then
export USE_HINT_TEXT=false
fi
echo "USE_HINT_TEXT: $USE_HINT_TEXT"
EVAL_NOTE="$AGENT_VERSION"
EVAL_NOTE="$OPENHANDS_VERSION"
# if not using Hint, add -no-hint to the eval note
if [ "$USE_HINT_TEXT" = false ]; then
EVAL_NOTE="$EVAL_NOTE-no-hint"
Expand Down
Loading

0 comments on commit 9908e1b

Please sign in to comment.