
Commit

fix lint
bittersweet1999 committed Sep 10, 2024
1 parent b900f93 commit f53663a
Showing 2 changed files with 56 additions and 31 deletions.
34 changes: 26 additions & 8 deletions opencompass/datasets/subjective/followbench.py
@@ -2,19 +2,25 @@
import json
import os.path as osp
import re

from datasets import Dataset

from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET
from opencompass.utils import get_data_path

from ..base import BaseDataset


def check_match(target, generation):
pattern_str = target.replace('{{', '{').replace('}}', '}').replace('{answer}', '.*')
pattern_str = target.replace('{{',
'{').replace('}}',
'}').replace('{answer}', '.*')
pattern_str = re.escape(pattern_str).replace('\\.\\*', '.*')
match = re.fullmatch(pattern_str, generation)
return bool(match)


@LOAD_DATASET.register_module()
class FollowBenchDataset(BaseDataset):

@@ -28,11 +34,19 @@ def load(self, path: str, name: str, cate: str, *args, **kwargs):
data = json.load(f)
for item in data:
if cate == 'llm':
raw_data.append({'instruction': item['instruction'], 'judge_prompt': item['judge_prompt'], 'judge': item})
raw_data.append({
'instruction': item['instruction'],
'judge_prompt': item['judge_prompt'],
'judge': item
})
elif cate == 'rule':
raw_data.append({'instruction': item['instruction'], 'judge': item})
raw_data.append({
'instruction': item['instruction'],
'judge': item
})
else:
raise NotImplementedError(f"Category '{cate}' is not implemented.")
raise NotImplementedError(
f"Category '{cate}' is not implemented.")

dataset = Dataset.from_list(raw_data)
return dataset
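
For orientation, here is a minimal sketch of the two record shapes the loader builds; the concrete item is invented for illustration. Only the 'llm' category keeps a judge_prompt for LLM-based judging, while 'rule' items are scored later by the evaluator's rule-based matching.

```python
# Invented example item; real FollowBench entries carry more fields.
item = {
    'instruction': 'Name three prime numbers, separated by commas.',
    'judge_prompt': 'Check whether every constraint in the instruction is satisfied.',
    'level': 1,
}

# cate == 'llm': keep the judge prompt alongside the full item for the LLM judge.
llm_record = {
    'instruction': item['instruction'],
    'judge_prompt': item['judge_prompt'],
    'judge': item,
}

# cate == 'rule': no judge prompt; rule-based scoring happens in the evaluator.
rule_record = {'instruction': item['instruction'], 'judge': item}

print(sorted(llm_record.keys()), sorted(rule_record.keys()))
```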
@@ -126,20 +140,24 @@ def chdir_return(cwd, return_value):
return chdir_return(cwd, pass_flag)

def score(self, predictions, references):
results = {'example':{'accuracy':[0,0,0,0,0], 'num':0}}
results = {'example': {'accuracy': [0, 0, 0, 0, 0], 'num': 0}}
for prediction, reference in zip(predictions, references):
if reference['category'] == 'example':
results['example']['num'] += 1
template = reference['target'].replace('{instruction}\n', '')
match_result = check_match(template, prediction)
print(match_result)
results['example']['accuracy'][reference['level'] - 1] += match_result
results['example']['accuracy'] = [round(acc / (results['example']['num'] // 5), 2) for acc in results['example']['accuracy']]
results['example']['accuracy'][reference['level'] -
1] += match_result
results['example']['accuracy'] = [
round(acc / (results['example']['num'] // 5), 2)
for acc in results['example']['accuracy']
]
######## Still not finished for rule-based evaluation

# Each process changes cwd, need to use multi-processing
with ProcessPoolExecutor(self.num_workers) as executor:
passed = sum(
list(executor.map(self.score_single, predictions, references)))

return {'accuracy': passed / total}
return {'accuracy': passed / total}
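
As a quick illustration of the rule-based path above, the following standalone sketch mirrors check_match: the {answer} placeholder in the reference template becomes a .* wildcard, everything else is escaped literally, and the prediction must match the whole pattern. The template and generations below are made up for the example.

```python
import re


def check_match(target, generation):
    # Unescape doubled braces and turn the answer slot into a wildcard.
    pattern_str = target.replace('{{', '{').replace('}}', '}').replace('{answer}', '.*')
    # Escape everything else literally, then restore the wildcard.
    pattern_str = re.escape(pattern_str).replace('\\.\\*', '.*')
    return bool(re.fullmatch(pattern_str, generation))


# Hypothetical level-1 template and two candidate generations.
template = 'The answer is {answer}.'
print(check_match(template, 'The answer is 42.'))  # True: the wildcard absorbs "42"
print(check_match(template, 'Maybe it is 42.'))    # False: surrounding text must match exactly
```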
53 changes: 30 additions & 23 deletions opencompass/summarizers/subjective/followbench.py
@@ -3,9 +3,10 @@
import os
import os.path as osp
import re
import statistics
from collections import defaultdict
from datetime import datetime
import statistics

import numpy as np
from mmengine import ConfigDict

@@ -19,32 +20,36 @@
from .subjective_post_process import post_process_autoj, post_process_judgelm
from .utils import get_judgeanswer_and_reference_update, get_outdir


def post_process_followbench(item):
generation, level = item['prediction'], item['gold']['level']
try:
satisify = generation.strip("```").strip().split('\n')[-1]
satisfy = generation.strip('```').strip().split('\n')[-1]

if level == 1:
if 'YES' in satisify:
if 'YES' in satisfy:
return 1, 1
elif 'NO' in satisify:
elif 'NO' in satisfy:
return 0, 0
else:
raise Exception('Invalid evaluation for level 1.')
else:
satisify_list = re.search(r'\[.*\]', satisify)
if satisify_list:
satisify_list = eval(satisify_list.group())
if len(satisify_list) == level:
satisfy_list = re.search(r'\[.*\]', satisfy)
if satisfy_list:
satisfy_list = eval(satisfy_list.group())
if len(satisfy_list) == level:
num_true = 0
for i in satisify_list:
for i in satisfy_list:
if i == 'YES' or i == 'True':
num_true += 1
elif i in ['NO', 'False', 'PARTIAL', 'MAYBE', 'UNKNOWN', 'N/A']:
elif i in [
'NO', 'False', 'PARTIAL', 'MAYBE', 'UNKNOWN',
'N/A'
]:
num_true += 0
else:
raise Exception('Invalid element in the list.')
return int(num_true==level), num_true/level
return int(num_true == level), num_true / level
else:
raise Exception('Invalid number of elements in the list.')
else:
Expand All @@ -53,33 +58,37 @@ def post_process_followbench(item):
except Exception as e:
return -1, -1


def get_scores(judged_answers, references):
results = [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]
n_group = len(judged_answers) // 5
n_groups = [n_group]*5
n_groups = [n_group] * 5

for judged_answer, reference in zip(judged_answers, references):
if judged_answer[0] == -1:
n_groups[reference['level']-1] -= 1
n_groups[reference['level'] - 1] -= 1
else:
results[0][reference['level']-1] += judged_answer[0]
results[1][reference['level']-1] += judged_answer[1]
results[0][reference['level'] - 1] += judged_answer[0]
results[1][reference['level'] - 1] += judged_answer[1]

for i in range(len(results)):
for j in range(len(results[i])):
if n_groups[j] != 0:
results[i][j] = results[i][j] / n_groups[j]
else:
results[i][j] = 0
temp_dict = {"HSR_AVG": statistics.mean(results[0]), "SSR_AVG": statistics.mean(results[1])}
temp_dict = {
'HSR_AVG': statistics.mean(results[0]),
'SSR_AVG': statistics.mean(results[1])
}
for idx, s in enumerate(results[0]):
temp_dict[f'HSR_L{idx+1}'] = s
for idx, s in enumerate(results[1]):
temp_dict[f'SSR_L{idx+1}'] = s

return temp_dict


class FollowBenchSummarizer:
"""Do the subjectivity analyze based on evaluation results.
@@ -99,7 +108,6 @@ def __init__(self, config: ConfigDict) -> None:

self.judge_function = post_process_followbench


def summarize(self,
time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
"""Summarize the subjectivity analysis based on evaluation results.
@@ -118,9 +126,8 @@ def summarize(self,
dataset = dataset_cfgs[0] # Alignbench has only one subfile
output_dir, results_folder = get_outdir(self.cfg, time_str)

fout = osp.join(
output_dir,
'followbench-judged-by--' + judge_abbr + '.csv')
fout = osp.join(output_dir,
'followbench-judged-by--' + judge_abbr + '.csv')

for eval_model_abbr in self.eval_model_abbrs:
subdir = eval_model_abbr + '_judged-by--' + judge_abbr
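
To make the judge-output parsing in this file concrete, here is a small self-contained sketch of what post_process_followbench does for a multi-constraint (level > 1) reply: the last line of the reply is scanned for a bracketed verdict list, the hard score is 1 only when every constraint is satisfied, and the soft score is the satisfied fraction. The reply text and level are invented; the real helper also handles level 1 and returns (-1, -1) on malformed output, which get_scores then excludes from the per-level averages.

```python
import re

# Hypothetical judge reply for a level-3 (three-constraint) sample.
reply = "Constraint-by-constraint reasoning...\n['YES', 'NO', 'YES']"
level = 3

# Take the last line of the reply and look for the bracketed verdict list.
last_line = reply.strip('`').strip().split('\n')[-1]
verdicts = eval(re.search(r'\[.*\]', last_line).group())  # ['YES', 'NO', 'YES']

num_true = sum(1 for v in verdicts if v in ('YES', 'True'))
hard = int(num_true == level)  # HSR contribution: 1 only if all constraints pass
soft = num_true / level        # SSR contribution: fraction of constraints passed
print(hard, soft)              # -> 0 0.666...
```

These per-sample pairs are what get_scores averages into the HSR_L1..L5 / SSR_L1..L5 entries and the HSR_AVG / SSR_AVG values written to the summarizer's CSV.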

