Commit

dongcai
bittersweet1999 committed Dec 20, 2023
1 parent 47e745d commit 5fa3a2a
Showing 6 changed files with 283 additions and 0 deletions.
60 changes: 60 additions & 0 deletions configs/datasets/subjective_cmp/dongcai.py
@@ -0,0 +1,60 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import DongcaiDataset
from mmengine.config import read_base

subjective_reader_cfg = dict(
    input_columns=['question', 'capability', 'gpt4_prompt'],
    output_column='judge',
)

subjective_all_sets = [
    "dongcai",
]
data_path = "data/subjective/dongcai"

subjective_datasets = []

for _name in subjective_all_sets:
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt="{question}"
                ),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=1024),
    )

    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(
                        role='HUMAN',
                        prompt="{gpt4_prompt}[助手的答案开始]\n{prediction}\n[助手的答案结束]\n"
                    ),
                ]),
            ),
        ),
        pred_role="BOT",
    )

    subjective_datasets.append(
        dict(
            abbr=f"{_name}",
            type=DongcaiDataset,
            path=data_path,
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg
        ))
82 changes: 82 additions & 0 deletions configs/eval_subjective_dongcai.py
@@ -0,0 +1,82 @@
from os import getenv as gv

from mmengine.config import read_base
with read_base():
    from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat
    from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat
    from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b
    from .models.baichuan.hf_baichuan2_7b_chat import models as hf_baichuan2_7b
    from .models.hf_internlm.hf_internlm_chat_20b import models as hf_internlm_chat_20b
    from .datasets.subjective_cmp.dongcai import subjective_datasets

datasets = [*subjective_datasets]

from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAIAllesAPIN, HuggingFaceChatGLM3
from opencompass.partitioners import NaivePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import DongcaiSummarizer


# -------------Inference Stage ----------------------------------------

models = [*hf_baichuan2_7b]  # , *hf_chatglm3_6b, *hf_internlm_chat_20b, *hf_qwen_7b_chat, *hf_qwen_14b_chat

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llmeval',
        quotatype='auto',
        max_num_workers=256,
        task=dict(type=OpenICLInferTask)),
)


# -------------Evaluation Stage ----------------------------------------


## ------------- JudgeLLM Configuration
api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)

judge_model = dict(
    abbr='GPT4-Turbo',
    type=OpenAIAllesAPIN,
    path='gpt-4-1106-preview',
    key='xxxx',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
    url='xxxx',
    meta_template=api_meta_template,
    query_per_second=16,
    max_out_len=2048,
    max_seq_len=2048,
    batch_size=8
)

## ------------- Evaluation Configuration
eval = dict(
    partitioner=dict(
        type=SubjectiveNaivePartitioner,
        mode='singlescore',
        models=models,
    ),
    runner=dict(
        type=LocalRunner,
        max_num_workers=2,
        task=dict(
            type=SubjectiveEvalTask,
            judge_cfg=judge_model
        )),
)

summarizer = dict(
    type=DongcaiSummarizer,
)

work_dir = 'outputs/dongcai/'
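
With this config in place, the full pipeline (inference, GPT-4 judging, and summarizing) can be launched through OpenCompass's standard entry point, e.g.:

python run.py configs/eval_subjective_dongcai.py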
1 change: 1 addition & 0 deletions opencompass/datasets/__init__.py
@@ -78,6 +78,7 @@
from .subject_alignmentbench import AlignmentBenchDataset # noqa: F401, F403
from .subject_corev2 import Corev2Dataset # noqa: F401, F403
from .subject_creationv01 import Creationv01Dataset # noqa: F401, F403
from .subject_dongcai import DongcaiDataset # noqa: F401, F403
from .subjective_cmp import SubjectiveCmpDataset # noqa: F401, F403
from .summedits import * # noqa: F401, F403
from .summscreen import * # noqa: F401, F403
50 changes: 50 additions & 0 deletions opencompass/datasets/subject_dongcai.py
@@ -0,0 +1,50 @@
# flake8: noqa: E501
import json
import os.path as osp
import re

from datasets import Dataset, DatasetDict

from opencompass.registry import LOAD_DATASET

from .subjective_cmp import SubjectiveCmpDataset


def prompt_construct(data):
    base_prompt = '你是一个评价回复好坏的助手,请根据下面的回答和回答标准进行打分。由于您评估的回答类型是{category},因此你需要从下面的几个维度对回答进行评估:\n{dimensions}' \
                  '我们会给您提供用户的提问,高质量的参考答案,和需要你评估的AI助手的答案。当你开始你的评估时,你需要按照遵守以下的流程:\n' \
                  '1. 将AI助手的答案与参考答案进行比较,指出AI助手的答案有哪些不足,并进一步解释。\n' \
                  '2. 从不同维度对AI助手的答案进行评价,在每个维度的评价之后,给每一个维度一个1~10的分数。\n' \
                  '3. 最后,综合每个维度的评估,对AI助手的回答给出一个1~10的综合分数。\n' \
                  '4. 你的打分需要尽可能严格,并且要遵守下面的评分规则:总的来说,模型回答的质量越高,则分数越高。其中,事实正确性和满足用户需求这两个维度是最重要的,这两个维度的分数主导了最后的综合分数。' \
                  '当模型回答存在与问题不相关,或者有本质性的事实错误,或生成了有害内容时,总分必须是1到2分;' \
                  '当模型回答没有严重错误而且基本无害,但是质量较低,没有满足用户需求,总分为3到4分;' \
                  '当模型回答基本满足用户要求,但是在部分维度上表现较差,质量中等,总分可以得5到6分;' \
                  '当模型回答质量与参考答案相近,在所有维度上表现良好,总分得7到8分;' \
                  '只有当模型回答质量显著超过参考答案,充分地解决了用户问题和所有需求,并且在所有维度上都接近满分的情况下,才能得9到10分。' \
                  '作为示例,参考答案可以得到8分。\n' \
                  '请记住,你必须在你打分前进行评价和解释。在你对每个维度的解释之后,需要加上对该维度的打分。之后,在你回答的末尾,按照以下字典格式(包括括号)返回你所有的打分结果,并确保你的打分结果是整数:\n' \
                  "{{'维度一': 打分, '维度二': 打分, ..., '综合得分': 打分}},例如:{{'事实正确性': 9, '满足用户需求': 6, ..., '综合得分': 7}}。\n" \
                  '用户的提问: {question}\n' \
                  '[参考答案开始]\n{reference}\n[参考答案结束]\n'
    prompt = base_prompt.format(category=data['capability'],
                                question=data['question'],
                                dimensions=data['others']['dimensions'],
                                reference=data['others']['reference'])

    return prompt


@LOAD_DATASET.register_module()
class DongcaiDataset(SubjectiveCmpDataset):

    def load(self, path: str, name: str):
        dataset = list(super().load(path, name))
        new_dataset = []
        for data in dataset:
            gpt4_prompt = prompt_construct(data)
            data['gpt4_prompt'] = gpt4_prompt
            data['judge']['others'] = data['others']
            new_dataset.append(data)
        dataset = Dataset.from_list(new_dataset)
        return dataset
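
For reference, the keys read above imply that each raw sample carries roughly the following layout. This is a hypothetical illustration inferred from the accessed fields, not an actual item from the dataset:

# Hypothetical sample layout inferred from prompt_construct() and load().
sample = {
    'question': '<用户的提问>',
    'capability': '<回答类型>',
    'others': {
        'dimensions': '<该类型下需要评估的维度说明>',
        'reference': '<高质量的参考答案>',
    },
    'judge': {},  # output column; load() copies `others` into it for the judge stage
}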
1 change: 1 addition & 0 deletions opencompass/summarizers/__init__.py
@@ -3,4 +3,5 @@
from .corev2 import Corev2Summarizer # noqa: F401
from .creationv01 import Creationv01Summarizer # noqa: F401
from .default import DefaultSummarizer # noqa: F401
from .dongcai import DongcaiSummarizer # noqa: F401
from .subjective import SubjectiveSummarizer # noqa: F401
89 changes: 89 additions & 0 deletions opencompass/summarizers/dongcai.py
@@ -0,0 +1,89 @@
# flake8: noqa: E501
import csv
import os
import os.path as osp
import re
from collections import defaultdict
from datetime import datetime

import mmengine
import numpy as np
from mmengine import ConfigDict

try:
from prettytable import from_csv
except ImportError:
from_csv = None

from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg


def extract_score(text):
    pattern = r'\'综合得分\': (\d+(\.\d{1,2})?)'
    match = re.search(pattern, text)
    if match:
        return float(match.group(1))
    return -1
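
For reference, this helper is meant to pull the overall score out of the judge's dictionary-style verdict, e.g. (illustrative string only):

extract_score("{'事实正确性': 9, '满足用户需求': 6, '综合得分': 7}")  # -> 7.0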


class DongcaiSummarizer:
    """Do the subjectivity analysis based on evaluation results.

    Args:
        config (ConfigDict): The configuration object of the evaluation task.
            It's expected to be filled out at runtime.
    """

    def __init__(self, config: ConfigDict) -> None:
        self.tasks = []
        self.cfg = config
        self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
        self.eval_model_abbrs = [
            model_abbr_from_cfg(model) for model in self.eval_model_cfgs
        ]

    def summarize(self,
                  time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
        """Summarize the subjectivity analysis based on evaluation results.

        Args:
            time_str (str): Timestamp for file naming.

        Returns:
            pd.DataFrame: The summary results.
        """
        dataset_cfgs = self.cfg['datasets']
        work_dir = self.cfg['work_dir']
        self.work_dir = work_dir

        self.time_str = time_str
        output_path = osp.join(self.work_dir, 'summary',
                               f'summary_{self.time_str}.txt')
        output_dir = osp.join(osp.split(output_path)[0], f'{self.time_str}')
        mmengine.mkdir_or_exist(output_dir)
        results_folder = osp.join(work_dir, 'results')
        fout = osp.join(output_dir, 'dimension.csv')
        fout2 = osp.join(output_dir, 'capability.csv')
        fout_flag, fout_flag2 = 0, 0
        for subdir in os.listdir(results_folder):
            if subdir not in self.eval_model_abbrs:
                continue
            subdir_path = os.path.join(results_folder, subdir)
            if os.path.isdir(subdir_path):
                model = subdir
                for dataset in dataset_cfgs:
                    dataset_abbr = dataset_abbr_from_cfg(dataset)
                    filepath = os.path.join(subdir_path,
                                            dataset_abbr + '.json')
                    result = mmengine.load(filepath)
                    judged_answers = []
                    references = []
                    for k, v in result.items():
                        score = extract_score(v['prediction'])
                        if score != -1:
                            judged_answers.append({'score': score})
                            references.append(v['gold'])
                    print(
                        f'Among {len(result)} judgements, successfully extracted {len(judged_answers)} judgements.'
                    )
                    # TODO: Write your summarizer
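
A minimal sketch of one way to finish the TODO, assuming only the mean overall score per (model, dataset) pair needs to land in capability.csv. This is a hypothetical helper, not part of this commit; write_header would be driven by fout_flag2:

# Hypothetical sketch for the TODO above -- not part of this commit.
import csv  # already imported at the top of this module

def write_capability_row(fout2, model, dataset_abbr, judged_answers, write_header):
    """Append one row with the mean overall score for a (model, dataset) pair."""
    scores = [ans['score'] for ans in judged_answers]
    avg_score = round(sum(scores) / len(scores), 2) if scores else float('nan')
    with open(fout2, 'a+', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        if write_header:
            writer.writerow(['model', 'dataset', 'avg_score'])
        writer.writerow([model, dataset_abbr, avg_score])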
