From 5fa3a2adc35de2a2b88a0e407ad81d8a34999cfd Mon Sep 17 00:00:00 2001
From: bittersweet1999 <1487910649@qq.com>
Date: Wed, 20 Dec 2023 16:25:33 +0800
Subject: [PATCH] dongcai

---
 configs/datasets/subjective_cmp/dongcai.py | 60 +++++++++++++++
 configs/eval_subjective_dongcai.py         | 82 ++++++++++++++++++++
 opencompass/datasets/__init__.py           |  1 +
 opencompass/datasets/subject_dongcai.py    | 50 ++++++++++++
 opencompass/summarizers/__init__.py        |  1 +
 opencompass/summarizers/dongcai.py         | 89 ++++++++++++++++++++++
 6 files changed, 283 insertions(+)
 create mode 100644 configs/datasets/subjective_cmp/dongcai.py
 create mode 100644 configs/eval_subjective_dongcai.py
 create mode 100644 opencompass/datasets/subject_dongcai.py
 create mode 100644 opencompass/summarizers/dongcai.py

diff --git a/configs/datasets/subjective_cmp/dongcai.py b/configs/datasets/subjective_cmp/dongcai.py
new file mode 100644
index 000000000..d72eae619
--- /dev/null
+++ b/configs/datasets/subjective_cmp/dongcai.py
@@ -0,0 +1,60 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import LMEvaluator
+from opencompass.datasets import DongcaiDataset
+from mmengine.config import read_base
+
+subjective_reader_cfg = dict(
+    input_columns=['question', 'capability', 'gpt4_prompt'],
+    output_column='judge',
+    )
+
+subjective_all_sets = [
+    "dongcai",
+]
+data_path = "data/subjective/dongcai"
+
+subjective_datasets = []
+
+for _name in subjective_all_sets:
+    subjective_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt="{question}"
+                ),
+            ]),
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer, max_out_len=1024),
+    )
+
+    subjective_eval_cfg = dict(
+        evaluator=dict(
+            type=LMEvaluator,
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(round=[
+                    dict(
+                        role='HUMAN',
+                        prompt = "{gpt4_prompt}[助手的答案开始]\n{prediction}\n[助手的答案结束]\n"
+                    ),
+                ]),
+            ),
+        ),
+        pred_role="BOT",
+    )
+
+    subjective_datasets.append(
+        dict(
+            abbr=f"{_name}",
+            type=DongcaiDataset,
+            path=data_path,
+            name=_name,
+            reader_cfg=subjective_reader_cfg,
+            infer_cfg=subjective_infer_cfg,
+            eval_cfg=subjective_eval_cfg
+        ))
diff --git a/configs/eval_subjective_dongcai.py b/configs/eval_subjective_dongcai.py
new file mode 100644
index 000000000..b7bd488ad
--- /dev/null
+++ b/configs/eval_subjective_dongcai.py
@@ -0,0 +1,82 @@
+from os import getenv as gv
+
+from mmengine.config import read_base
+with read_base():
+    from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat
+    from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat
+    from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b
+    from .models.baichuan.hf_baichuan2_7b_chat import models as hf_baichuan2_7b
+    from .models.hf_internlm.hf_internlm_chat_20b import models as hf_internlm_chat_20b
+    from .datasets.subjective_cmp.dongcai import subjective_datasets
+
+datasets = [*subjective_datasets]
+
+from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAIAllesAPIN, HuggingFaceChatGLM3
+from opencompass.partitioners import NaivePartitioner
+from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
+from opencompass.runners import LocalRunner
+from opencompass.runners import SlurmSequentialRunner
+from opencompass.tasks import OpenICLInferTask
+from opencompass.tasks.subjective_eval import SubjectiveEvalTask
+from opencompass.summarizers import DongcaiSummarizer
+
+
+# -------------Inference Stage ----------------------------------------
+
+models = [*hf_baichuan2_7b]  # , *hf_chatglm3_6b, *hf_internlm_chat_20b, *hf_qwen_7b_chat, *hf_qwen_14b_chat]
+
+infer = dict(
+    partitioner=dict(type=NaivePartitioner),
+    runner=dict(
+        type=SlurmSequentialRunner,
+        partition='llmeval',
+        quotatype='auto',
+        max_num_workers=256,
+        task=dict(type=OpenICLInferTask)),
+)
+
+
+# -------------Evaluation Stage ----------------------------------------
+
+
+## ------------- JudgeLLM Configuration
+api_meta_template = dict(
+    round=[
+        dict(role='HUMAN', api_role='HUMAN'),
+        dict(role='BOT', api_role='BOT', generate=True),
+    ]
+)
+
+judge_model = dict(
+    abbr='GPT4-Turbo',
+    type=OpenAIAllesAPIN, path='gpt-4-1106-preview',
+    key='xxxx',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
+    url='xxxx',
+    meta_template=api_meta_template,
+    query_per_second=16,
+    max_out_len=2048,
+    max_seq_len=2048,
+    batch_size=8
+)
+
+## ------------- Evaluation Configuration
+eval = dict(
+    partitioner=dict(
+        type=SubjectiveNaivePartitioner,
+        mode='singlescore',
+        models = models
+    ),
+    runner=dict(
+        type=LocalRunner,
+        max_num_workers=2,
+        task=dict(
+            type=SubjectiveEvalTask,
+            judge_cfg=judge_model
+        )),
+)
+
+summarizer = dict(
+    type=DongcaiSummarizer,
+)
+
+work_dir = 'outputs/dongcai/'
diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py
index 2764a191f..822419f00 100644
--- a/opencompass/datasets/__init__.py
+++ b/opencompass/datasets/__init__.py
@@ -78,6 +78,7 @@
 from .subject_alignmentbench import AlignmentBenchDataset  # noqa: F401, F403
 from .subject_corev2 import Corev2Dataset  # noqa: F401, F403
 from .subject_creationv01 import Creationv01Dataset  # noqa: F401, F403
+from .subject_dongcai import DongcaiDataset  # noqa: F401, F403
 from .subjective_cmp import SubjectiveCmpDataset  # noqa: F401, F403
 from .summedits import *  # noqa: F401, F403
 from .summscreen import *  # noqa: F401, F403
diff --git a/opencompass/datasets/subject_dongcai.py b/opencompass/datasets/subject_dongcai.py
new file mode 100644
index 000000000..f68ca4b65
--- /dev/null
+++ b/opencompass/datasets/subject_dongcai.py
@@ -0,0 +1,50 @@
+# flake8: noqa: E501
+import json
+import os.path as osp
+import re
+
+from datasets import Dataset, DatasetDict
+
+from opencompass.registry import LOAD_DATASET
+
+from .subjective_cmp import SubjectiveCmpDataset
+
+
+def prompt_construct(data):
+    base_prompt = '你是一个评价回复好坏的助手,请根据下面的回答和回答标准进行打分。由于您评估的回答类型是{category},因此你需要从下面的几个维度对回答进行评估:\n{dimensions}' \
+        '我们会给您提供用户的提问,高质量的参考答案,和需要你评估的AI助手的答案。当你开始你的评估时,你需要按照遵守以下的流程:\n' \
+        '1. 将AI助手的答案与参考答案进行比较,指出AI助手的答案有哪些不足,并进一步解释。\n' \
+        '2. 从不同维度对AI助手的答案进行评价,在每个维度的评价之后,给每一个维度一个1~10的分数。\n' \
+        '3. 最后,综合每个维度的评估,对AI助手的回答给出一个1~10的综合分数。\n' \
+        '4. 你的打分需要尽可能严格,并且要遵守下面的评分规则:总的来说,模型回答的质量越高,则分数越高。其中,事实正确性和满足用户需求这两个维度是最重要的,这两个维度的分数主导了最后的综合分数。' \
+        '当模型回答存在与问题不相关,或者有本质性的事实错误,或生成了有害内容时,总分必须是1到2分;' \
+        '当模型回答没有严重错误而且基本无害,但是质量较低,没有满足用户需求,总分为3到4分;' \
+        '当模型回答基本满足用户要求,但是在部分维度上表现较差,质量中等,总分可以得5到6分;' \
+        '当模型回答质量与参考答案相近,在所有维度上表现良好,总分得7到8分;' \
+        '只有当模型回答质量显著超过参考答案,充分地解决了用户问题和所有需求,并且在所有维度上都接近满分的情况下,才能得9到10分。' \
+        '作为示例,参考答案可以得到8分。\n' \
+        '请记住,你必须在你打分前进行评价和解释。在你对每个维度的解释之后,需要加上对该维度的打分。之后,在你回答的末尾,按照以下字典格式(包括括号)返回你所有的打分结果,并确保你的打分结果是整数:\n' \
+        "{{'维度一': 打分, '维度二': 打分, ..., '综合得分': 打分}},例如:{{'事实正确性': 9, '满足用户需求': 6, ..., '综合得分': 7}}。\n" \
+        '用户的提问: {question}\n' \
+        '[参考答案开始]\n{reference}\n[参考答案结束]\n'
+    prompt = base_prompt.format(category=data['capability'],
+                                question=data['question'],
+                                dimensions=data['others']['dimensions'],
+                                reference=data['others']['reference'])
+
+    return prompt
+
+
+@LOAD_DATASET.register_module()
+class DongcaiDataset(SubjectiveCmpDataset):
+
+    def load(self, path: str, name: str):
+        dataset = list(super().load(path, name))
+        new_dataset = []
+        for data in dataset:
+            gpt4_prompt = prompt_construct(data)
+            data['gpt4_prompt'] = gpt4_prompt
+            data['judge']['others'] = data['others']
+            new_dataset.append(data)
+        dataset = Dataset.from_list(new_dataset)
+        return dataset
diff --git a/opencompass/summarizers/__init__.py b/opencompass/summarizers/__init__.py
index 1a190adcb..052d9aa3a 100644
--- a/opencompass/summarizers/__init__.py
+++ b/opencompass/summarizers/__init__.py
@@ -3,4 +3,5 @@
 from .corev2 import Corev2Summarizer  # noqa: F401
 from .creationv01 import Creationv01Summarizer  # noqa: F401
 from .default import DefaultSummarizer  # noqa: F401
+from .dongcai import DongcaiSummarizer  # noqa: F401
 from .subjective import SubjectiveSummarizer  # noqa: F401
diff --git a/opencompass/summarizers/dongcai.py b/opencompass/summarizers/dongcai.py
new file mode 100644
index 000000000..e0f459a3d
--- /dev/null
+++ b/opencompass/summarizers/dongcai.py
@@ -0,0 +1,89 @@
+# flake8: noqa: E501
+import csv
+import os
+import os.path as osp
+import re
+from collections import defaultdict
+from datetime import datetime
+
+import mmengine
+import numpy as np
+from mmengine import ConfigDict
+
+try:
+    from prettytable import from_csv
+except ImportError:
+    from_csv = None
+
+from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg
+
+
+def extract_score(text):
+    pattern = r'\'综合得分\': (\d+(\.\d{1,2})?)'
+    match = re.search(pattern, text)
+    if match:
+        return float(match.group(1))
+    return -1
+
+
+class DongcaiSummarizer:
+    """Do the subjectivity analysis based on evaluation results.
+
+    Args:
+        config (ConfigDict): The configuration object of the evaluation task.
+            It's expected to be filled out at runtime.
+    """
+
+    def __init__(self, config: ConfigDict) -> None:
+        self.tasks = []
+        self.cfg = config
+        self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
+        self.eval_model_abbrs = [
+            model_abbr_from_cfg(model) for model in self.eval_model_cfgs
+        ]
+
+    def summarize(self,
+                  time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
+        """Summarize the subjectivity analysis based on evaluation results.
+
+        Args:
+            time_str (str): Timestamp for file naming.
+
+        Returns:
+            pd.DataFrame: The summary results.
+        """
+        dataset_cfgs = self.cfg['datasets']
+        work_dir = self.cfg['work_dir']
+        self.work_dir = work_dir
+
+        self.time_str = time_str
+        output_path = osp.join(self.work_dir, 'summary',
+                               f'summary_{self.time_str}.txt')
+        output_dir = osp.join(osp.split(output_path)[0], f'{self.time_str}')
+        mmengine.mkdir_or_exist(output_dir)
+        results_folder = osp.join(work_dir, 'results')
+        fout = osp.join(output_dir, 'dimension.csv')
+        fout2 = osp.join(output_dir, 'capability.csv')
+        fout_flag, fout_flag2 = 0, 0
+        for subdir in os.listdir(results_folder):
+            if subdir not in self.eval_model_abbrs:
+                continue
+            subdir_path = os.path.join(results_folder, subdir)
+            if os.path.isdir(subdir_path):
+                model = subdir
+                for dataset in dataset_cfgs:
+                    dataset_abbr = dataset_abbr_from_cfg(dataset)
+                    filepath = os.path.join(subdir_path,
+                                            dataset_abbr + '.json')
+                    result = mmengine.load(filepath)
+                    judged_answers = []
+                    references = []
+                    for k, v in result.items():
+                        score = extract_score(v['prediction'])
+                        if score != -1:
+                            judged_answers.append({'score': score})
+                            references.append(v['gold'])
+                    print(
+                        f'Among {len(result)} judgements, successfully extracted {len(judged_answers)} judgements.'
+                    )
+                    # TODO: Write your summarizer
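
The summarize() method above stops at its TODO after collecting judged_answers and references. Below is a minimal, hypothetical sketch of one way that TODO could be filled in for capability.csv; it is not the author's implementation. It reuses only names already present in dongcai.py (csv, defaultdict, np, judged_answers, references, fout2, fout_flag2, model) and assumes each reference dict carries a 'capability' field, which is not guaranteed by the dataset code above.

# Hypothetical helper, not part of the patch: turns the judged_answers /
# references collected above into one capability.csv row per model,
# mirroring the pattern used by the other subjective summarizers.
def write_capability_row(model, judged_answers, references, csv_path, write_header):
    # Group the extracted '综合得分' values by capability.
    # 'capability' on the reference dict is an assumption; adjust the key
    # if the judge column stores it elsewhere.
    capability_scores = defaultdict(list)
    for ans, ref in zip(judged_answers, references):
        capability_scores[ref.get('capability', 'unknown')].append(ans['score'])

    capability_avg = {
        cap: round(float(np.mean(scores)), 2)
        for cap, scores in capability_scores.items()
    }
    if judged_answers:
        capability_avg['总分'] = round(
            float(np.mean([ans['score'] for ans in judged_answers])), 2)

    columns = list(capability_avg.keys())
    with open(csv_path, 'a+', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if write_header:
            writer.writerow(['model'] + columns)
        writer.writerow([model] + [capability_avg[col] for col in columns])

At the TODO site this could be called as write_capability_row(model, judged_answers, references, fout2, fout_flag2 == 0), followed by fout_flag2 = 1. Because extract_score() only captures the '综合得分' value, dimension.csv (fout) cannot be populated the same way until the judge output is also parsed for the per-dimension scores.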