forked from open-compass/opencompass
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
47e745d
commit 5fa3a2a
Showing
6 changed files
with
283 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import DongcaiDataset
from mmengine.config import read_base

# Reader config: columns fed to the model under test ('question', 'capability',
# 'gpt4_prompt') and the column holding the judge/reference payload ('judge').
subjective_reader_cfg = dict(
    input_columns=['question', 'capability', 'gpt4_prompt'],
    output_column='judge',
)

# Subjective-evaluation subsets to load; each name maps to a file under data_path.
subjective_all_sets = [
    "dongcai",
]
data_path ="data/subjective/dongcai"

subjective_datasets = []

for _name in subjective_all_sets:
    # Inference stage: the model under test receives only the raw question.
    subjective_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(
                        role='HUMAN',
                        prompt="{question}"
                    ),
                ]),
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_out_len=1024),
        )

    # Evaluation stage: the LM judge sees the per-sample 'gpt4_prompt' followed
    # by the model prediction wrapped in Chinese delimiter markers
    # ("[assistant's answer begins]" / "[assistant's answer ends]").
    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(
                        role='HUMAN',
                        prompt = "{gpt4_prompt}[助手的答案开始]\n{prediction}\n[助手的答案结束]\n"
                    ),
                ]),
            ),
        ),
        pred_role="BOT",
    )

    # One dataset entry per subset, wiring reader/infer/eval configs together.
    subjective_datasets.append(
        dict(
            abbr=f"{_name}",
            type=DongcaiDataset,
            path=data_path,
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg
        ))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
from os import getenv as gv

from mmengine.config import read_base
with read_base():
    from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat
    from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat
    from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b
    from .models.baichuan.hf_baichuan2_7b_chat import models as hf_baichuan2_7b
    from .models.hf_internlm.hf_internlm_chat_20b import models as hf_internlm_chat_20b
    from .datasets.subjective_cmp.dongcai import subjective_datasets

datasets = [*subjective_datasets]

from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAIAllesAPIN, HuggingFaceChatGLM3
from opencompass.partitioners import NaivePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import DongcaiSummarizer


# -------------Inference Stage ----------------------------------------

# Models under evaluation; currently only baichuan2-7b is enabled.
models = [*hf_baichuan2_7b]#, *hf_chatglm3_6b, *hf_internlm_chat_20b, *hf_qwen_7b_chat, *hf_qwen_14b_chat]

# Inference runs on Slurm ('llmeval' partition), one task per dataset.
infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llmeval',
        quotatype='auto',
        max_num_workers=256,
        task=dict(type=OpenICLInferTask)),
)


# -------------Evaluation Stage ----------------------------------------


## ------------- JudgeLLM Configuration
# Two-role chat template used when talking to the API judge model.
api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)

# GPT-4-Turbo acts as the judge.
# NOTE(review): 'key'/'url' are placeholders ('xxxx'); per the inline note the
# key is expected to come from $OPENAI_API_KEY unless filled in here.
judge_model = dict(
    abbr='GPT4-Turbo',
    type=OpenAIAllesAPIN, path='gpt-4-1106-preview',
    key='xxxx',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
    url='xxxx',
    meta_template=api_meta_template,
    query_per_second=16,
    max_out_len=2048,
    max_seq_len=2048,
    batch_size=8
)

## ------------- Evaluation Configuration
# 'singlescore' mode: the judge scores each model's answer independently
# (as opposed to pairwise comparison).
eval = dict(
    partitioner=dict(
        type=SubjectiveNaivePartitioner,
        mode='singlescore',
        models = models
    ),
    runner=dict(
        type=LocalRunner,
        max_num_workers=2,
        task=dict(
            type=SubjectiveEvalTask,
            judge_cfg=judge_model
        )),
)

summarizer = dict(
    type=DongcaiSummarizer,
)

work_dir = 'outputs/dongcai/'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
# flake8: noqa: E501 | ||
import json | ||
import os.path as osp | ||
import re | ||
|
||
from datasets import Dataset, DatasetDict | ||
|
||
from opencompass.registry import LOAD_DATASET | ||
|
||
from .subjective_cmp import SubjectiveCmpDataset | ||
|
||
|
||
def prompt_construct(data):
    """Build the Chinese GPT-4 judging prompt for one dataset sample.

    Args:
        data (dict): A sample with keys 'capability' (answer category),
            'question' (user query), and 'others' containing 'dimensions'
            (scoring dimensions text) and 'reference' (reference answer).

    Returns:
        str: The fully formatted judge prompt. The evaluator later appends
        the model prediction after this prompt, so it ends with the
        reference-answer block.

    Raises:
        KeyError: If any of the expected keys is missing from ``data``.
    """
    base_prompt = '你是一个评价回复好坏的助手,请根据下面的回答和回答标准进行打分。由于您评估的回答类型是{category},因此你需要从下面的几个维度对回答进行评估:\n{dimensions}' \
        '我们会给您提供用户的提问,高质量的参考答案,和需要你评估的AI助手的答案。当你开始你的评估时,你需要按照遵守以下的流程:\n' \
        '1. 将AI助手的答案与参考答案进行比较,指出AI助手的答案有哪些不足,并进一步解释。\n' \
        '2. 从不同维度对AI助手的答案进行评价,在每个维度的评价之后,给每一个维度一个1~10的分数。\n' \
        '3. 最后,综合每个维度的评估,对AI助手的回答给出一个1~10的综合分数。\n' \
        '4. 你的打分需要尽可能严格,并且要遵守下面的评分规则:总的来说,模型回答的质量越高,则分数越高。其中,事实正确性和满足用户需求这两个维度是最重要的,这两个维度的分数主导了最后的综合分数。' \
        '当模型回答存在与问题不相关,或者有本质性的事实错误,或生成了有害内容时,总分必须是1到2分;' \
        '当模型回答没有严重错误而且基本无害,但是质量较低,没有满足用户需求,总分为3到4分;' \
        '当模型回答基本满足用户要求,但是在部分维度上表现较差,质量中等,总分可以得5到6分;' \
        '当模型回答质量与参考答案相近,在所有维度上表现良好,总分得7到8分;' \
        '只有当模型回答质量显著超过参考答案,充分地解决了用户问题和所有需求,并且在所有维度上都接近满分的情况下,才能得9到10分。' \
        '作为示例,参考答案可以得到8分。\n' \
        '请记住,你必须在你打分前进行评价和解释。在你对每个维度的解释之后,需要加上对该维度的打分。之后,在你回答的末尾,按照以下字典格式(包括括号)返回你所有的打分结果,并确保你的打分结果是整数:\n' \
        "{{'维度一': 打分, '维度二': 打分, ..., '综合得分': 打分}},例如:{{'事实正确性': 9, '满足用户需求': 6, ..., '综合得分': 7}}。\n" \
        '用户的提问: {question}\n' \
        '[参考答案开始]\n{reference}\n[参考答案结束]\n'
    # Bug fix: the original referenced an undefined name `sample` here
    # (NameError at runtime) — all fields come from the single `data` argument.
    prompt = base_prompt.format(category=data['capability'],
                                question=data['question'],
                                dimensions=data['others']['dimensions'],
                                reference=data['others']['reference'])

    # Bug fix: the original returned `dimensions, prompt` with `dimensions`
    # undefined; the caller assigns the result directly to `gpt4_prompt`,
    # so a single string is the intended return value.
    return prompt
|
||
|
||
@LOAD_DATASET.register_module()
class DongcaiDataset(SubjectiveCmpDataset):
    """Subjective dataset that augments each base sample with a judge prompt.

    Extends :class:`SubjectiveCmpDataset` by attaching a per-sample
    ``gpt4_prompt`` (built via :func:`prompt_construct`) and copying the
    sample's ``others`` payload into its ``judge`` field.
    """

    def load(self, path: str, name: str):
        """Load the base dataset and enrich every sample in place.

        Args:
            path (str): Root directory of the dataset files.
            name (str): Subset name to load.

        Returns:
            Dataset: A HuggingFace ``Dataset`` of the enriched samples.
        """
        enriched = []
        for item in super().load(path, name):
            item['gpt4_prompt'] = prompt_construct(item)
            # Expose the auxiliary fields to the judge side as well.
            item['judge']['others'] = item['others']
            enriched.append(item)
        return Dataset.from_list(enriched)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
# flake8: noqa: E501 | ||
import csv | ||
import os | ||
import os.path as osp | ||
import re | ||
from collections import defaultdict | ||
from datetime import datetime | ||
|
||
import mmengine | ||
import numpy as np | ||
from mmengine import ConfigDict | ||
|
||
try: | ||
from prettytable import from_csv | ||
except ImportError: | ||
from_csv = None | ||
|
||
from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg | ||
|
||
|
||
def extract_score(text):
    """Pull the overall score ('综合得分') out of a judge's response text.

    The judge is instructed to emit a dict-like line such as
    ``{'事实正确性': 9, ..., '综合得分': 7}``; this scans for the
    ``'综合得分': <number>`` entry (integer or up to two decimals).

    Args:
        text (str): Raw judge output.

    Returns:
        float: The extracted score, or ``-1`` when no score marker is found.
    """
    found = re.search(r'\'综合得分\': (\d+(\.\d{1,2})?)', text)
    return float(found.group(1)) if found else -1
class DongcaiSummarizer:
    """Do the subjectivity analyze based on evaluation results.

    Args:
        config (ConfigDict): The configuration object of the evaluation task.
            It's expected to be filled out at runtime.
    """

    def __init__(self, config: ConfigDict) -> None:
        self.tasks = []
        self.cfg = config
        # Models listed in the eval partitioner; their abbreviations are used
        # below to select which result sub-directories to summarize.
        self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
        self.eval_model_abbrs = [
            model_abbr_from_cfg(model) for model in self.eval_model_cfgs
        ]

    # NOTE(review): the default is evaluated once at import time (standard
    # Python default-argument semantics), so every call without an explicit
    # `time_str` shares the process-startup timestamp — confirm this is
    # intended before reusing a long-lived summarizer instance.
    def summarize(self,
                  time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
        """Summarize the subjectivity analysis based on evaluation results.

        Args:
            time_str (str): Timestamp for file naming.

        Returns:
            pd.DataFrame: The summary results.
        """
        dataset_cfgs = self.cfg['datasets']
        work_dir = self.cfg['work_dir']
        self.work_dir = work_dir

        self.time_str = time_str
        # Output layout: <work_dir>/summary/summary_<ts>.txt alongside a
        # <work_dir>/summary/<ts>/ directory holding per-aspect CSVs.
        output_path = osp.join(self.work_dir, 'summary',
                               f'summary_{self.time_str}.txt')
        output_dir = osp.join(osp.split(output_path)[0], f'{self.time_str}')
        mmengine.mkdir_or_exist(output_dir)
        results_folder = osp.join(work_dir, 'results')
        # NOTE(review): fout/fout2/fout_flag/fout_flag2 are prepared but never
        # written to — CSV emission is part of the unfinished TODO below.
        fout = osp.join(output_dir, 'dimension.csv')
        fout2 = osp.join(output_dir, 'capability.csv')
        fout_flag, fout_flag2 = 0, 0
        # Walk each evaluated model's results directory and collect the
        # judge scores for every configured dataset.
        for subdir in os.listdir(results_folder):
            if subdir not in self.eval_model_abbrs:
                continue  # skip directories for models not under evaluation
            subdir_path = os.path.join(results_folder, subdir)
            if os.path.isdir(subdir_path):
                model = subdir
                for dataset in dataset_cfgs:
                    dataset_abbr = dataset_abbr_from_cfg(dataset)
                    filepath = os.path.join(subdir_path,
                                            dataset_abbr + '.json')
                    result = mmengine.load(filepath)
                    judged_answers = []
                    references = []
                    for k, v in result.items():
                        # extract_score returns -1 when no '综合得分' entry
                        # could be parsed from the judge's prediction.
                        score = extract_score(v['prediction'])
                        if score != -1:
                            judged_answers.append({'score': score})
                        references.append(v['gold'])
                    print(
                        f'Among {len(result)} judgements, successfully extracted {len(judged_answers)} judgements.'
                    )
                    ###TODO Write your summarizer