
Commit

fix_wildbench
bittersweet1999 committed Sep 9, 2024
1 parent 42404a5 commit f62cfe0
Showing 3 changed files with 15 additions and 9 deletions.
configs/datasets/subjective/wildbench/wildbench_pair_judge.py (5 changes: 3 additions & 2 deletions)
@@ -3,7 +3,7 @@
 from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
 from opencompass.openicl.icl_evaluator import LMEvaluator
 from opencompass.datasets import WildBenchDataset
-
+from opencompass.summarizers import WildBenchPairSummarizer
 
 subjective_reader_cfg = dict(
     input_columns=['dialogue', 'prompt'],
@@ -61,5 +61,6 @@
         {'abbr': 'llama-2-70b-chat-vllm', 'path':'./data/subjective/WildBench/llama2-70b'}],
     mode='m2n',  # m models compete against n models
     infer_order='random',
-    base_models = [llama_2_70b, gpt4, claude]
+    base_models = [llama_2_70b, gpt4, claude],
+    summarizer = dict(type=WildBenchPairSummarizer),
 ))
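
Taken together, the two hunks make each WildBench pair-judge dataset entry carry its own base_models list (note the previously missing trailing comma) and name its summarizer directly, which is what the reworked summarizer below reads back out of cfg['datasets'][0]. A minimal sketch of the resulting entry; the wildbench_dataset name and placeholder model configs are illustrative, not from the repo:

    from opencompass.summarizers import WildBenchPairSummarizer

    # Illustrative stand-ins; the real config defines complete model configs.
    llama_2_70b = dict(abbr='llama-2-70b-chat-vllm')
    gpt4 = dict(abbr='gpt4')
    claude = dict(abbr='claude')

    wildbench_dataset = dict(
        # ...reader/infer/eval settings elided...
        mode='m2n',                # m models compete against n models
        infer_order='random',
        base_models=[llama_2_70b, gpt4, claude],        # trailing comma added
        summarizer=dict(type=WildBenchPairSummarizer),  # new per-dataset entry
    )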
[second changed file: 3 additions & 2 deletions; path not shown]

@@ -3,7 +3,7 @@
 from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
 from opencompass.openicl.icl_evaluator import LMEvaluator
 from opencompass.datasets import WildBenchDataset
-
+from opencompass.summarizers import WildBenchPairSummarizer
 
 subjective_reader_cfg = dict(
     input_columns=['dialogue', 'prompt'],
@@ -61,5 +61,6 @@
         {'abbr': 'llama-2-70b-chat-vllm', 'path':'./data/subjective/WildBench/llama2-70b'}],
     mode='m2n',  # m models compete against n models
     infer_order='random',
-    base_models = [llama_2_70b, gpt4, claude]
+    base_models = [llama_2_70b, gpt4, claude],
+    summarizer = dict(type=WildBenchPairSummarizer),
 ))
opencompass/summarizers/subjective/wildbench.py (14 changes: 9 additions & 5 deletions)
@@ -156,8 +156,8 @@ def __init__(self, config: ConfigDict, check_pos_bias=False) -> None:
         self.tasks = []
         self.cfg = config
 
-        self.base_models = self.cfg['eval']['partitioner']['base_models']
-        self.compare_models = self.cfg['eval']['partitioner']['compare_models']
+        self.base_models = self.cfg['datasets'][0]['base_models']
+        self.compare_models = self.cfg['eval']['partitioner']['models']
         self.judge_models = self.cfg.get('judge_models', None)
         self.meta_judge_model = self.cfg.eval.partitioner.get('meta_judge_model', None)
         self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0])
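
This hunk rewires where the summarizer finds its model lists: the base models now come from the dataset entry itself (matching the config change above), and the models under comparison come from the partitioner's models key rather than a compare_models key. A small sketch of the new lookups, assuming a config shaped like the one above (all abbrs are illustrative placeholders):

    cfg = dict(
        datasets=[dict(base_models=[dict(abbr='gpt4'), dict(abbr='claude')])],
        eval=dict(partitioner=dict(models=[dict(abbr='my-chat-model')])),
    )

    base_models = cfg['datasets'][0]['base_models']        # was cfg['eval']['partitioner']['base_models']
    compare_models = cfg['eval']['partitioner']['models']  # was cfg['eval']['partitioner']['compare_models']

    print([m['abbr'] for m in base_models])     # ['gpt4', 'claude']
    print([m['abbr'] for m in compare_models])  # ['my-chat-model']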
@@ -247,8 +247,10 @@ def summarize(
             pd.DataFrame: The summary results.
         """
         scores = self.get_score(time_str)
+        all_scores = {}
         output_dir, results_folder = get_outdir(self.cfg, time_str)
         for idx, judge_model in enumerate(self.judge_models):
+            score_by_judgemodel = {}
             judge_abbr = model_abbr_from_cfg(judge_model)
             for dataset in self.cfg['datasets']:
                 dataset_abbr = dataset_abbr_from_cfg(dataset)
@@ -258,7 +260,7 @@
                 row_headers = [dataset_abbr, 'position_bias'] + row_headers
 
                 table = []
-                for row_header in row_headers:
+                for idx, row_header in enumerate(row_headers):
                     row = [row_header]
                     headers = ['']
                     for model_cfg in self.compare_models:
@@ -276,12 +278,13 @@
                                 s = str(s)
                             row.append(s)
                         avg = avg/len(self.base_models)
+                        if idx == 0:
+                            score_by_judgemodel[model_abbr] = {'score': avg}
                         row.append(f'{avg:.2f}')
                         headers.append('Avg')
                     table.append(row)
 
                 txt = tabulate(table, headers=headers)
-                print(txt)
 
                 if idx == len(self.judge_models):
                     output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv')
@@ -292,4 +295,5 @@
                         f.write(','.join(headers) + '\n')
                         for line in table:
                             f.write(','.join(line) + '\n')
-                    print(output_filename)
+            all_scores[judge_abbr] = score_by_judgemodel
+        return {'Wildbench': all_scores}
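
Net effect on summarize: the debug print calls are gone, and besides writing CSV reports it now accumulates one score dict per judge model (filled from the first table row, i.e. each compared model's average against the base models) and returns everything under a 'Wildbench' key. An approximate sketch of the return value, with illustrative judge and model abbrs:

    result = {
        'Wildbench': {
            'gpt4-judge': {                        # one dict per judge model abbr
                'my-chat-model': {'score': 3.21},  # avg score vs. the base models
                'other-model': {'score': 2.87},
            },
        },
    }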
