Commit 4aacc70: update wbelo path

yuchenlin committed on Jul 3, 2024 (1 parent: 4350c6f)
Showing 14 changed files with 23,287 additions and 2,097 deletions. Large diffs are not rendered by default; the following files' diffs are collapsed:

587 changes: 339 additions & 248 deletions  leaderboard/data_dir/all_stat_wildbench.-1.json
455 changes: 273 additions & 182 deletions  leaderboard/data_dir/all_stat_wildbench.100.json
589 changes: 340 additions & 249 deletions  leaderboard/data_dir/all_stat_wildbench.1000.json
587 changes: 339 additions & 248 deletions  leaderboard/data_dir/all_stat_wildbench.1500.json
455 changes: 273 additions & 182 deletions  leaderboard/data_dir/all_stat_wildbench.2000.json
455 changes: 273 additions & 182 deletions  leaderboard/data_dir/all_stat_wildbench.300.json
455 changes: 273 additions & 182 deletions  leaderboard/data_dir/all_stat_wildbench.3000.json
591 changes: 341 additions & 250 deletions  leaderboard/data_dir/all_stat_wildbench.500.json
188 changes: 102 additions & 86 deletions  leaderboard/data_dir/score.json
564 changes: 282 additions & 282 deletions  leaderboard/data_dir/wb_elo_results.json
2 changes: 1 addition & 1 deletion  leaderboard/data_utils.py

@@ -15,7 +15,7 @@
 bench_data = None
 eval_results = None
 score_eval_results = None
-BASE_SCORE_RESULTS_PATH = "eval_results/v2.0522/score.v2/eval=gpt-4o-2024-05-13/"
+BASE_SCORE_RESULTS_PATH = "eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/"
 BASE_EVAL_RESULTS_PATH = "eval_results/v2.0522/pairwise.v2/eval=gpt-4-turbo-2024-04-09/"
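The substantive change here repoints the score loader from the v2.0522 results to the refreshed v2.0625 results. As a hypothetical illustration only (the loader itself is not shown in this diff, and the helper below is not repository code), a base-path constant like this is typically consumed by reading every per-model JSON file under the directory:

import json
import os

# Hypothetical helper, not from this repository: shows one common way a
# base-path constant like BASE_SCORE_RESULTS_PATH is consumed.
BASE_SCORE_RESULTS_PATH = "eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/"

def load_score_results(base_path: str = BASE_SCORE_RESULTS_PATH) -> dict:
    """Return {model_name: parsed_json} for every *.json under base_path."""
    results = {}
    for name in sorted(os.listdir(base_path)):
        if not name.endswith(".json"):
            continue
        with open(os.path.join(base_path, name)) as f:
            results[name[: -len(".json")]] = json.load(f)
    return results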
10 changes: 6 additions & 4 deletions  leaderboard/show_eval.sh

@@ -2,9 +2,6 @@ MODE=$1
 
 
 
-margin=3;tie_margin=2;K=4;dynamic=True;interval=16
-python -m leaderboard.wb_elo --K $K --margin $margin --tie_margin $tie_margin --num_rounds 100 --dynamic $dynamic --interval $interval --num_processes 4
-
 # if MODE is not score
 if [ "$MODE" != "score_only" ];
 then
@@ -31,7 +28,12 @@ python leaderboard/data_dir/_create_tables.py score
 
 python leaderboard/data_dir/_merge_results.py
 
-if [ "$MODE" != "score_only" ];
+margin=3;tie_margin=2;K=4;dynamic=True;interval=16
+python -m leaderboard.wb_elo --K $K --margin $margin --tie_margin $tie_margin --num_rounds 100 --dynamic $dynamic --interval $interval --num_processes 4
+
+python leaderboard/data_dir/_merge_results.py
+
+if [ "$MODE" == "score_only" ];
 then
 python leaderboard/show_table.py --mode taskwise_score
 else
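This reorder moves the WB-Elo computation from the top of the script to after the score tables are built and merged, then re-runs the merge so wb_elo_results.json feeds back into the leaderboard files. For orientation, here is a minimal sketch of one standard Elo update, which the script's K/margin/tie_margin flags suggest; the actual leaderboard.wb_elo implementation is not part of this diff, so treat this as an assumption rather than the repository's method:

# Minimal sketch of a textbook Elo update, assuming leaderboard.wb_elo
# follows the standard formulation. k is the step size passed as --K;
# margin/tie_margin would map a score gap to win/tie/loss before this
# update runs (that mapping is an assumption, not repository code).
def elo_update(r_a: float, r_b: float, s_a: float, k: float = 4.0):
    """Update ratings after one battle; s_a is A's outcome: 1.0 win, 0.5 tie, 0.0 loss."""
    e_a = 1.0 / (1.0 + 10.0 ** ((r_b - r_a) / 400.0))  # A's expected score
    r_a_new = r_a + k * (s_a - e_a)
    r_b_new = r_b + k * ((1.0 - s_a) - (1.0 - e_a))
    return r_a_new, r_b_new

# Example: two 1000-rated models, A wins, K=4.
print(elo_update(1000.0, 1000.0, s_a=1.0))  # -> (1002.0, 998.0)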
4 changes: 3 additions & 1 deletion  leaderboard/show_table.py

@@ -18,14 +18,16 @@ def show_table(K=-1, mode="main"):
         rank_column = "WB_Elo"
     elif mode == "taskwise_score":
         all_column_names_to_show = ["WB_Elo", "WB_score.task_macro", "WB_score.Creative Tasks", "WB_score.Planning & Reasoning", "WB_score.Math & Data Analysis", "WB_score.Information/Advice seeking", "WB_score.Coding & Debugging", "Length"]
-        rank_column = "WB_score.task_macro"
+        # rank_column = "WB_score.task_macro"
+        rank_column = "WB_Elo"
     elif mode == "taskwise_reward":
         all_column_names_to_show = ["WB_Elo", f"task_macro_reward.K={K}", f"mixture_of_rewards.Creative Tasks.K={K}", f"mixture_of_rewards.Planning & Reasoning.K={K}", f"mixture_of_rewards.Math & Data Analysis.K={K}", f"mixture_of_rewards.Information/Advice seeking.K={K}", f"mixture_of_rewards.Coding & Debugging.K={K}", "Length"]
         rank_column = f"task_macro_reward.K={K}"
     else:
         raise NotImplementedError
 
     # rank by rank_column
+    print(f"Ranking by {rank_column}")
     all_stat = {k: v for k, v in sorted(all_stat.items(), key=lambda item: item[1][rank_column], reverse=True)}
 
     rows = []
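The added print makes the chosen ranking key visible at run time, and taskwise_score tables now rank by WB_Elo instead of WB_score.task_macro. Below is a self-contained toy run of the sort shown above; the model names and ratings are invented for illustration:

# Toy reproduction of the ranking step above, with made-up data.
all_stat = {
    "model-a": {"WB_Elo": 1150.2},
    "model-b": {"WB_Elo": 1203.7},
    "model-c": {"WB_Elo": 1098.4},
}
rank_column = "WB_Elo"

print(f"Ranking by {rank_column}")
all_stat = {k: v for k, v in sorted(all_stat.items(), key=lambda item: item[1][rank_column], reverse=True)}
for rank, model in enumerate(all_stat, start=1):
    print(f"{rank}. {model}: {all_stat[model][rank_column]}")
# -> 1. model-b: 1203.7 / 2. model-a: 1150.2 / 3. model-c: 1098.4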
