From 83eeb52b0946a2151db71adeaec629a0f13f0a7a Mon Sep 17 00:00:00 2001 From: liushz Date: Wed, 25 Sep 2024 11:26:36 +0800 Subject: [PATCH 01/20] [Feature] Update WikiBench base model config (#1553) * Update MathBench & WikiBench for FullBench * Update MathBench & WikiBench for FullBench * Update GPQA & MMLU_Pro * Update MathBench & WikiBench for FullBench * Update MathBench & WikiBench for FullBench * Update MathBench & WikiBench for FullBench * Update MathBench & Math base config * Update WikiBench base model config --------- Co-authored-by: liushz --- .../wikibench_few_shot_ppl_c23d79.py | 73 +++++++++++++++++++ .../wikibench_few_shot_ppl_c23d79.py | 73 +++++++++++++++++++ 2 files changed, 146 insertions(+) create mode 100644 configs/datasets/wikibench/wikibench_few_shot_ppl_c23d79.py create mode 100644 opencompass/configs/datasets/wikibench/wikibench_few_shot_ppl_c23d79.py diff --git a/configs/datasets/wikibench/wikibench_few_shot_ppl_c23d79.py b/configs/datasets/wikibench/wikibench_few_shot_ppl_c23d79.py new file mode 100644 index 000000000..0669bd7b9 --- /dev/null +++ b/configs/datasets/wikibench/wikibench_few_shot_ppl_c23d79.py @@ -0,0 +1,73 @@ +import copy + +from opencompass.datasets import WikiBenchDataset +from opencompass.openicl.icl_evaluator import AccEvaluator, CircularEvaluator +from opencompass.openicl.icl_inferencer import PPLInferencer +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever + +single_choice_prompts = { + 'single_choice_cn': [ + dict(role='HUMAN', + prompt='问题: 白色念珠菌常被用作哪种生物的研究模式?\nA. 病毒\nB. 细菌\nC. 真菌\nD. 寄生虫'), + dict(role='BOT', prompt='回答: C'), + dict( + role='HUMAN', + prompt='问题: 星期五广场(荷兰语:Vrijdagmarkt;荷兰语发音: )是比利时根特老城的一个城市广场。 星期五广场下方有一个什么设施?\nA. 游乐场\nB. 地下停车场\nC. 公园\nD. 地下商场' # noqa: E501 + ), + dict(role='BOT', prompt='回答: B'), + dict( + role='HUMAN', + prompt='问题: 尔迪雷·巴斯杜克代表土耳其国家队出场的次数?\nA. 60次\nB. 35次\nC. 49次\nD. 20次' + ), + dict(role='BOT', prompt='回答: C'), + dict( + role='HUMAN', + prompt='问题: 陈酆被任命为漳州刺史是因为什么原因?\nA. 朝廷认为他有能力担任该职务\nB. 漳州人怀念陈元光、陈伯珙的政绩\nC. 他是陈伯珙的儿子\nD. 他是陈元光的孙子' # noqa: E501 + ), + dict(role='BOT', prompt='回答: B'), + dict(role='HUMAN', + prompt='问题: 丹徒县在1928年改名为什么?\nA. 苏州市\nB. 润州县\nC. 镇江县\nD. 
丹阳县'), + dict(role='BOT', prompt='回答: C'), + dict(role='HUMAN', prompt='问题: {question}'), + dict(role='BOT', prompt='回答: {answer}'), + ] +} + +wikibench_sets = { + 'wiki': ['single_choice_cn'], +} + +do_circular = True + +wikibench_datasets = [] + +for _split in list(wikibench_sets.keys()): + for _name in wikibench_sets[_split]: + template = {} + for answer in ['A', 'B', 'C', 'D']: + one_template_round = copy.deepcopy(single_choice_prompts[_name]) + one_template_round[-1]['prompt'] = one_template_round[-1][ + 'prompt'].format(answer=answer) + template[answer] = dict(round=one_template_round) + wikibench_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=template), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer), + ) + wikibench_eval_cfg = dict(evaluator=dict( + type=CircularEvaluator if do_circular else AccEvaluator), ) + wikibench_datasets.append( + dict( + type=WikiBenchDataset, + path=f'./data/WikiBench/{_name}.jsonl', + name='circular_' + _name if do_circular else _name, + abbr='wikibench-' + _split + '-' + _name + + 'circular' if do_circular else '', + reader_cfg=dict( + input_columns=['question'], + output_column='answer', + ), + infer_cfg=wikibench_infer_cfg, + eval_cfg=wikibench_eval_cfg, + )) diff --git a/opencompass/configs/datasets/wikibench/wikibench_few_shot_ppl_c23d79.py b/opencompass/configs/datasets/wikibench/wikibench_few_shot_ppl_c23d79.py new file mode 100644 index 000000000..0669bd7b9 --- /dev/null +++ b/opencompass/configs/datasets/wikibench/wikibench_few_shot_ppl_c23d79.py @@ -0,0 +1,73 @@ +import copy + +from opencompass.datasets import WikiBenchDataset +from opencompass.openicl.icl_evaluator import AccEvaluator, CircularEvaluator +from opencompass.openicl.icl_inferencer import PPLInferencer +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever + +single_choice_prompts = { + 'single_choice_cn': [ + dict(role='HUMAN', + prompt='问题: 白色念珠菌常被用作哪种生物的研究模式?\nA. 病毒\nB. 细菌\nC. 真菌\nD. 寄生虫'), + dict(role='BOT', prompt='回答: C'), + dict( + role='HUMAN', + prompt='问题: 星期五广场(荷兰语:Vrijdagmarkt;荷兰语发音: )是比利时根特老城的一个城市广场。 星期五广场下方有一个什么设施?\nA. 游乐场\nB. 地下停车场\nC. 公园\nD. 地下商场' # noqa: E501 + ), + dict(role='BOT', prompt='回答: B'), + dict( + role='HUMAN', + prompt='问题: 尔迪雷·巴斯杜克代表土耳其国家队出场的次数?\nA. 60次\nB. 35次\nC. 49次\nD. 20次' + ), + dict(role='BOT', prompt='回答: C'), + dict( + role='HUMAN', + prompt='问题: 陈酆被任命为漳州刺史是因为什么原因?\nA. 朝廷认为他有能力担任该职务\nB. 漳州人怀念陈元光、陈伯珙的政绩\nC. 他是陈伯珙的儿子\nD. 他是陈元光的孙子' # noqa: E501 + ), + dict(role='BOT', prompt='回答: B'), + dict(role='HUMAN', + prompt='问题: 丹徒县在1928年改名为什么?\nA. 苏州市\nB. 润州县\nC. 镇江县\nD. 
丹阳县'), + dict(role='BOT', prompt='回答: C'), + dict(role='HUMAN', prompt='问题: {question}'), + dict(role='BOT', prompt='回答: {answer}'), + ] +} + +wikibench_sets = { + 'wiki': ['single_choice_cn'], +} + +do_circular = True + +wikibench_datasets = [] + +for _split in list(wikibench_sets.keys()): + for _name in wikibench_sets[_split]: + template = {} + for answer in ['A', 'B', 'C', 'D']: + one_template_round = copy.deepcopy(single_choice_prompts[_name]) + one_template_round[-1]['prompt'] = one_template_round[-1][ + 'prompt'].format(answer=answer) + template[answer] = dict(round=one_template_round) + wikibench_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=template), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer), + ) + wikibench_eval_cfg = dict(evaluator=dict( + type=CircularEvaluator if do_circular else AccEvaluator), ) + wikibench_datasets.append( + dict( + type=WikiBenchDataset, + path=f'./data/WikiBench/{_name}.jsonl', + name='circular_' + _name if do_circular else _name, + abbr='wikibench-' + _split + '-' + _name + + 'circular' if do_circular else '', + reader_cfg=dict( + input_columns=['question'], + output_column='answer', + ), + infer_cfg=wikibench_infer_cfg, + eval_cfg=wikibench_eval_cfg, + )) From 17eefc0e1e90c4cd669d1cb840456d1aa7ffb48d Mon Sep 17 00:00:00 2001 From: Chuanyang Jin <68135125+chuanyangjin@users.noreply.github.com> Date: Tue, 24 Sep 2024 23:27:17 -0400 Subject: [PATCH 02/20] [Fix] Correct typos (#1561) --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 98f640694..2da411958 100644 --- a/README.md +++ b/README.md @@ -594,7 +594,7 @@ OpenCompass is a one-stop platform for large model evaluation, aiming to provide ## 🔜 Roadmap - [x] Subjective Evaluation - - [x] Release CompassAreana + - [x] Release CompassAreana. - [x] Subjective evaluation. - [x] Long-context - [x] Long-context evaluation with extensive datasets. @@ -603,10 +603,10 @@ OpenCompass is a one-stop platform for large model evaluation, aiming to provide - [ ] Coding evaluation leaderboard. - [x] Non-python language evaluation service. - [x] Agent - - [ ] Support various agenet framework. + - [ ] Support various agent frameworks. - [x] Evaluation of tool use of the LLMs. - [x] Robustness - - [x] Support various attack method + - [x] Support various attack methods. 
## 👷‍♂️ Contributing From fe84bbd9a048f26bd8dd19dc3236566758b7135b Mon Sep 17 00:00:00 2001 From: Songyang Zhang Date: Wed, 25 Sep 2024 11:36:43 +0800 Subject: [PATCH 03/20] [Feature] Add Config for CoreBench (#1547) * [Feature] Add Config for CoreBench * Update --- configs/eval_corebench_2409_longcontext.py | 138 ++++++++++++++ configs/eval_corebench_2409_objective.py | 208 +++++++++++++++++++++ configs/eval_corebench_2409_subjective.py | 134 +++++++++++++ 3 files changed, 480 insertions(+) create mode 100644 configs/eval_corebench_2409_longcontext.py create mode 100644 configs/eval_corebench_2409_objective.py create mode 100644 configs/eval_corebench_2409_subjective.py diff --git a/configs/eval_corebench_2409_longcontext.py b/configs/eval_corebench_2409_longcontext.py new file mode 100644 index 000000000..718044d2a --- /dev/null +++ b/configs/eval_corebench_2409_longcontext.py @@ -0,0 +1,138 @@ +import os.path as osp +from copy import deepcopy + +from mmengine.config import read_base +from opencompass.models import (HuggingFacewithChatTemplate, + TurboMindModelwithChatTemplate) +from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner +from opencompass.runners import DLCRunner, LocalRunner +from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask + + +####################################################################### +# PART 0 Essential Configs # +####################################################################### +with read_base(): + from opencompass.configs.datasets.longbench.longbench import \ + longbench_datasets + from opencompass.configs.datasets.needlebench.needlebench_8k.needlebench_8k import \ + needlebench_datasets as needlebench_8k_datasets + from opencompass.configs.datasets.needlebench.needlebench_32k.needlebench_32k import \ + needlebench_datasets as needlebench_32k_datasets + from opencompass.configs.datasets.needlebench.needlebench_128k.needlebench_128k import \ + needlebench_datasets as needlebench_128k_datasets + from opencompass.configs.datasets.ruler.ruler_8k_gen import \ + ruler_datasets as ruler_8k_datasets + from opencompass.configs.datasets.ruler.ruler_32k_gen import \ + ruler_datasets as ruler_32k_datasets + from opencompass.configs.datasets.ruler.ruler_128k_gen import \ + ruler_datasets as ruler_128k_datasets + # Summary Groups + from opencompass.configs.summarizers.groups.longbench import \ + longbench_summary_groups + from opencompass.configs.summarizers.groups.ruler import \ + ruler_summary_groups + from opencompass.configs.summarizers.needlebench import ( + needlebench_8k_summarizer, needlebench_32k_summarizer, + needlebench_128k_summarizer) + + # Instruct models + from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \ + models as lmdeploy_qwen2_7b_instruct_model + + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat_1m import \ + models as lmdeploy_internlm2_5_7b_1m_chat_model + from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \ + models as llama3_1_8b_instruct_model + + +####################################################################### +# PART 1 Datasets List # +####################################################################### +# datasets list for evaluation +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + + +####################################################################### +# PART 2 Datset Summarizer # +####################################################################### 
+needlebench_8k_summary_groups = needlebench_8k_summarizer['summary_groups'] +needlebench_32k_summary_groups = needlebench_32k_summarizer['summary_groups'] +needlebench_128k_summary_groups = needlebench_128k_summarizer['summary_groups'] + +# Instruct models summarizer +summarizer = dict( + dataset_abbrs=[ + ['ruler_8k', 'naive_average'], + ['ruler_32k', 'naive_average'], + ['ruler_128k', 'naive_average'], + ['NeedleBench-Overall-Score-8K', 'weighted_average'], + ['NeedleBench-Overall-Score-32K', 'weighted_average'], + ['NeedleBench-Overall-Score-128K', 'weighted_average'], + ['longbench', 'naive_average'], + ['longbench_zh', 'naive_average'], + ['longbench_en', 'naive_average'], + '', + 'longbench_single-document-qa', + 'longbench_multi-document-qa', + 'longbench_summarization', + 'longbench_few-shot-learning', + 'longbench_synthetic-tasks', + 'longbench_code-completion', + ], + summary_groups=sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], []), +) + + +####################################################################### +# PART 3 Models List # +####################################################################### + +lmdeploy_qwen2_7b_instruct_model[0]['max_seq_len'] = 1048576 +lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['session_len'] = 1048576 +lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['tp'] = 4 +lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['rope_scaling_factor'] = 4 +lmdeploy_qwen2_7b_instruct_model[0]['run_cfg']['num_gpus'] = 4 + +llama3_1_8b_instruct_model[0]['max_seq_len'] = 1048576 +llama3_1_8b_instruct_model[0]['engine_config']['session_len'] = 1048576 +llama3_1_8b_instruct_model[0]['engine_config']['tp'] = 4 +llama3_1_8b_instruct_model[0]['engine_config']['rope_scaling_factor'] = 4 +llama3_1_8b_instruct_model[0]['run_cfg']['num_gpus'] = 4 + +models = sum([v for k, v in locals().items() if k.endswith('_model')], []) + +####################################################################### +# PART 4 Inference/Evaluation Configuaration # +####################################################################### + +# Local Runner +infer = dict( + partitioner=dict( + type=NumWorkerPartitioner, + num_worker=8 + ), + runner=dict( + type=LocalRunner, + max_num_workers=16, + retry=0, # Modify if needed + task=dict(type=OpenICLInferTask) + ), +) + +# eval with local runner +eval = dict( + partitioner=dict(type=NaivePartitioner, n=10), + runner=dict( + type=LocalRunner, + max_num_workers=16, + task=dict(type=OpenICLEvalTask)), +) + + +####################################################################### +# PART 5 Utils Configuaration # +####################################################################### +base_exp_dir = 'outputs/corebench/' +work_dir = osp.join(base_exp_dir, 'long_context') diff --git a/configs/eval_corebench_2409_objective.py b/configs/eval_corebench_2409_objective.py new file mode 100644 index 000000000..e14c52472 --- /dev/null +++ b/configs/eval_corebench_2409_objective.py @@ -0,0 +1,208 @@ +from mmengine.config import read_base +import os.path as osp +from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner +from opencompass.runners import LocalRunner +from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask + + +####################################################################### +# PART 0 Essential Configs # +####################################################################### +with read_base(): + # Datasets Part + ## Core Set + # ## Examination + from 
opencompass.configs.datasets.mmlu.mmlu_openai_simple_evals_gen_b618ea import mmlu_datasets + from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import mmlu_pro_datasets + from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import cmmlu_datasets + + # ## Reasoning + from opencompass.configs.datasets.bbh.bbh_gen_4a31fa import bbh_datasets + # TODO: Add HellaSwag + # TODO: Add DROP + + # ## Math + from opencompass.configs.datasets.math.math_0shot_gen_393424 import math_datasets + # TODO: Add GSM8K + # TODO: Add MathBench + + # ## Scientific + from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets + + # ## Coding + from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets + # TODO: Add MBPP + # TODO: Add LiveCodeBench + + # ## Instruction Following + from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets + + # Summarizer + from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups + from opencompass.configs.summarizers.groups.mmlu_pro import mmlu_pro_summary_groups + from opencompass.configs.summarizers.groups.cmmlu import cmmlu_summary_groups + from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups + + + # Model List + # from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model + # from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model + # from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import models as hf_minicpm_2b_sft_bf16_model + # from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as hf_yi_1_5_6b_chat_model + # from opencompass.configs.models.gemma.hf_gemma_2b_it import models as hf_gemma_2b_it_model + # from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as hf_yi_1_5_34b_chat_model + +####################################################################### +# PART 1 Datasets List # +####################################################################### +# datasets list for evaluation +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + + +####################################################################### +# PART 2 Datset Summarizer # +####################################################################### +# with read_base(): + +core_summary_groups = [ + { + 'name': 'core_average', + 'subsets': [ + ['mmlu', 'accuracy'], + ['mmlu_pro', 'accuracy'], + # ['cmmlu', 'naive_average'], + ['cmmlu', 'accuracy'], + ['bbh', 'score'], + ['math', 'accuracy'], + ['openai_humaneval', 'humaneval_pass@1'], + ['GPQA_diamond', 'accuracy'], + ['IFEval', 'Prompt-level-strict-accuracy'], + ], + }, +] + +summarizer = dict( + dataset_abbrs=[ + ['core_average', 'naive_average'], + ['mmlu', 'accuracy'], + ['mmlu_pro', 'accuracy'], + ['cmmlu', 'accuracy'], + ['bbh', 'score'], + ['math', 'accuracy'], + ['openai_humaneval', 'humaneval_pass@1'], + ['GPQA_diamond', 'accuracy'], + ['IFEval', 'Prompt-level-strict-accuracy'], + '', + + ['mmlu', 'accuracy'], + ['mmlu-stem', 'accuracy'], + ['mmlu-social-science', 'accuracy'], + ['mmlu-humanities', 'accuracy'], + ['mmlu-other', 'accuracy'], + + '', + ['mmlu_pro', 'accuracy'], + ['mmlu_pro_math','accuracy'], + ['mmlu_pro_physics', 'accuracy'], + ['mmlu_pro_chemistry', 'accuracy'], + ['mmlu_pro_law', 'accuracy'], + ['mmlu_pro_engineering', 'accuracy'], + ['mmlu_pro_other', 'accuracy'], + 
['mmlu_pro_economics', 'accuracy'], + ['mmlu_pro_health', 'accuracy'], + ['mmlu_pro_psychology', 'accuracy'], + ['mmlu_pro_business', 'accuracy'], + ['mmlu_pro_biology', 'accuracy'], + ['mmlu_pro_philosophy', 'accuracy'], + ['mmlu_pro_computer_science','accuracy'], + ['mmlu_pro_history', 'accuracy'], + '', + ['cmmlu', 'accuracy'], + ['cmmlu-stem', 'accuracy'], + ['cmmlu-social-science', 'accuracy'], + ['cmmlu-humanities', 'accuracy'], + ['cmmlu-other', 'accuracy'], + ['cmmlu-china-specific', 'accuracy'], + '', + ['bbh', 'extract_rate'], + ['math', 'extract_rate'], + # ['openai_humaneval', 'extract_rate'], + ['GPQA_diamond', 'extract_rate'], + # ['IFEval', 'extract_rate'], + '', + ['mmlu', 'extract_rate'], + ['mmlu-stem', 'extract_rate'], + ['mmlu-social-science', 'extract_rate'], + ['mmlu-humanities', 'extract_rate'], + ['mmlu-other', 'extract_rate'], + '', + ['mmlu_pro', 'extract_rate'], + ['mmlu_pro_math', 'extract_rate'], + ['mmlu_pro_physics', 'extract_rate'], + ['mmlu_pro_chemistry', 'extract_rate'], + ['mmlu_pro_law', 'extract_rate'], + ['mmlu_pro_engineering', 'extract_rate'], + ['mmlu_pro_other', 'extract_rate'], + ['mmlu_pro_economics', 'extract_rate'], + ['mmlu_pro_health', 'extract_rate'], + ['mmlu_pro_psychology', 'extract_rate'], + ['mmlu_pro_business', 'extract_rate'], + ['mmlu_pro_biology', 'extract_rate'], + ['mmlu_pro_philosophy', 'extract_rate'], + ['mmlu_pro_computer_science', 'extract_rate'], + ['mmlu_pro_history', 'extract_rate'], + '', + ['cmmlu', 'extract_rate'], + ['cmmlu-stem', 'extract_rate'], + ['cmmlu-social-science', 'extract_rate'], + ['cmmlu-humanities', 'extract_rate'], + ['cmmlu-other', 'extract_rate'], + ['cmmlu-china-specific', 'extract_rate'], + + ], + summary_groups=sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], []), +) + + +####################################################################### +# PART 3 Models List # +####################################################################### + +models = sum([v for k, v in locals().items() if k.endswith('_model')], []) + + + +####################################################################### +# PART 4 Inference/Evaluation Configuaration # +####################################################################### + +# Local Runner +infer = dict( + partitioner=dict( + type=NumWorkerPartitioner, + num_worker=8 + ), + runner=dict( + type=LocalRunner, + max_num_workers=16, + retry=0, # Modify if needed + task=dict(type=OpenICLInferTask) + ), +) + +# eval with local runner +eval = dict( + partitioner=dict(type=NaivePartitioner, n=10), + runner=dict( + type=LocalRunner, + max_num_workers=16, + task=dict(type=OpenICLEvalTask)), +) + + +####################################################################### +# PART 5 Utils Configuaration # +####################################################################### +base_exp_dir = 'outputs/corebench/' +work_dir = osp.join(base_exp_dir, 'chat_objective') diff --git a/configs/eval_corebench_2409_subjective.py b/configs/eval_corebench_2409_subjective.py new file mode 100644 index 000000000..c0623c804 --- /dev/null +++ b/configs/eval_corebench_2409_subjective.py @@ -0,0 +1,134 @@ +import os.path as osp +from copy import deepcopy + +from mmengine.config import read_base +from opencompass.models import (HuggingFacewithChatTemplate, + TurboMindModelwithChatTemplate) +from opencompass.models.openai_api import OpenAI, OpenAISDK +from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner +from opencompass.partitioners.sub_naive 
import SubjectiveNaivePartitioner +from opencompass.runners import DLCRunner, LocalRunner +from opencompass.summarizers import SubjectiveSummarizer +from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask +from opencompass.tasks.subjective_eval import SubjectiveEvalTask + + +####################################################################### +# PART 0 Essential Configs # +####################################################################### +with read_base(): + # Datasets Part + from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import \ + arenahard_datasets + from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm import \ + alignbench_datasets + from opencompass.configs.datasets.subjective.multiround.mtbench_single_judge_diff_temp import \ + mtbench_datasets + + # Summarizer + + # Model List + # from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model + # from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model + + +####################################################################### +# PART 1 Datasets List # +####################################################################### +# datasets list for evaluation + +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + + +####################################################################### +# PART 2 Datset Summarizer # +####################################################################### +summarizer = dict(type=SubjectiveSummarizer, function='subjective') + +####################################################################### +# PART 3 Models List # +####################################################################### + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='internlm2_5-7b-chat-turbomind', + path='internlm/internlm2_5-7b-chat', + engine_config=dict(session_len=16384, max_batch_size=16, tp=1), + gen_config=dict(top_k=40, temperature=1.0, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] + +models = sum([v for k, v in locals().items() if k.endswith('_model')], models) + + + +####################################################################### +# PART 4 Inference/Evaluation Configuaration # +####################################################################### + +# Local Runner +infer = dict( + partitioner=dict( + type=NumWorkerPartitioner, + num_worker=8 + ), + runner=dict( + type=LocalRunner, + max_num_workers=16, + retry=0, # Modify if needed + task=dict(type=OpenICLInferTask) + ), +) + +# JudgeLLM +api_meta_template = dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), +]) + + +judge_models = [ + dict( + type=OpenAISDK, + abbr='gpt-4o-2024-08-06', + path='gpt-4o-2024-08-06', + # openai_api_base= + # 'http://10.140.1.86:10001/v1', # Change to your own url if needed. 
+ key='YOUR_API_KEY', + retry=10, + meta_template=api_meta_template, + rpm_verbose=True, + query_per_second=1, + max_out_len=4096, + max_seq_len=16384, + batch_size=16, + temperature=0.01, + tokenizer_path='gpt-4o-2024-08-06' + ) +] + +# Evaluation with local runner +eval = dict( + partitioner=dict( + type=SubjectiveNaivePartitioner, + models=models, + judge_models=judge_models, + ), + runner=dict( + type=LocalRunner, + max_num_workers=16, + task=dict(type=SubjectiveEvalTask)), +) + + + +####################################################################### +# PART 5 Utils Configuaration # +####################################################################### +base_exp_dir = 'outputs/corebench/' +work_dir = osp.join(base_exp_dir, 'chat_subjective') From c3fb9065db5f7ec9b7a9b5d7ea8f834ce75a0b1c Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Wed, 25 Sep 2024 11:53:48 +0800 Subject: [PATCH 04/20] [Feature] Add dlc sleep time (#1562) --- opencompass/runners/dlc.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/opencompass/runners/dlc.py b/opencompass/runners/dlc.py index 40453ed08..224ef4300 100644 --- a/opencompass/runners/dlc.py +++ b/opencompass/runners/dlc.py @@ -232,6 +232,8 @@ def _run_within_retry(): while True: # 1. Avoid to request dlc too frequently. # 2. DLC job may not be ready immediately after creation. + dlc_sleep_time = self.aliyun_cfg.get('dlc_sleep_time', 10) + time.sleep(dlc_sleep_time) num_retry = 60 for retry_index in range(num_retry): time.sleep(2) From 87df8a73a3b2290fd0bb07c10b9acc461206cefd Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Wed, 25 Sep 2024 13:40:47 +0800 Subject: [PATCH 05/20] [CI] add a common summarizer for qabench summarizer (#1545) * update * update * update --------- Co-authored-by: zhulin1 --- .../summarizers/subjective/__init__.py | 1 + .../subjective/common_summarizer.py | 146 ++++++++++++++++++ 2 files changed, 147 insertions(+) create mode 100644 opencompass/summarizers/subjective/common_summarizer.py diff --git a/opencompass/summarizers/subjective/__init__.py b/opencompass/summarizers/subjective/__init__.py index 6565d5c89..ea2367c0b 100644 --- a/opencompass/summarizers/subjective/__init__.py +++ b/opencompass/summarizers/subjective/__init__.py @@ -4,6 +4,7 @@ from .alpacaeval import AlpacaSummarizer from .arenahard import ArenaHardSummarizer from .charm import CharmMemSummarizer +from .common_summarizer import CommonSummarizer from .compass_arena import CompassArenaSummarizer from .compassbench import CompassBenchSummarizer from .corev2 import Corev2Summarizer diff --git a/opencompass/summarizers/subjective/common_summarizer.py b/opencompass/summarizers/subjective/common_summarizer.py new file mode 100644 index 000000000..4793a91f1 --- /dev/null +++ b/opencompass/summarizers/subjective/common_summarizer.py @@ -0,0 +1,146 @@ +# flake8: noqa +# yapf: disable +import csv +import os +import os.path as osp +import re +from collections import defaultdict +from datetime import datetime + +import numpy as np +from mmengine import ConfigDict +from tabulate import tabulate + +from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg + +from .compass_arena import CompassArenaSummarizer +from .utils import get_judgeanswer_and_reference, get_outdir + + +def model_abbr_from_cfg_used_in_summarizer(model): + if model.get('summarizer_abbr', None): + return model['summarizer_abbr'] + else: + return model_abbr_from_cfg(model) + +def post_process_single_rate(judgement: str): + 
"""Input a string like below: + + xxx[[5]]xxx, and extract the score + """ + pattern = r'Rating:\s*\[\[([\d.]+)\]\]' + matched_result = re.findall(pattern, judgement) + if matched_result: + score = float(matched_result[0]) + else: + return None + return {'score': score} + + +def get_capability_results( + judged_answers, + references, + fout, + fout_flag, + model_abbr, + judge_model_abbr, + dataset_abbr, +): + capability_ratings = defaultdict(int) + capability_counts = defaultdict(int) + for ans, ref in zip(judged_answers, references): + capability_ratings['total'] += ans['score'] + capability_counts['total'] += 1 + capability_ratings[ref['capability']] += ans['score'] + capability_counts[ref['capability']] += 1 + + capability_avg_ratings = defaultdict(float) + + for capability, total_score in capability_ratings.items(): + s = total_score / capability_counts[capability] + s = round(s, 2) + capability_avg_ratings[capability] = s + columns = list(capability_avg_ratings.keys()) + columns.insert(0, columns.pop(columns.index('total'))) + + if fout_flag == 0: + with open(fout, 'w', newline='') as csvfile: + writer = csv.writer(csvfile) + if fout_flag == 0: + writer.writerow(['model', 'judge_model', 'dataset'] + columns) + writer.writerow([model_abbr] + [judge_model_abbr] + [dataset_abbr] + [capability_avg_ratings[column] for column in columns]) + else: + with open(fout, 'a+', newline='') as csvfile: + writer = csv.writer(csvfile) + writer.writerow([model_abbr] + [judge_model_abbr] + [dataset_abbr] + [capability_avg_ratings[column] for column in columns]) + + +class CommonSummarizer(CompassArenaSummarizer): + """Do the subjectivity analyze based on evaluation results. + + Args: + config (ConfigDict): The configuration object of the evaluation task. + It's expected to be filled out at runtime. + """ + + def __init__(self, config: ConfigDict, judge_type='single_rate') -> None: + self.judge_type = judge_type + self.tasks = [] + self.cfg = config + self.judge_type = 'single_rate' + self.eval_model_cfgs = self.cfg['eval']['partitioner']['models'] + self.judge_model_cfgs = self.cfg['judge_models'] + self.judge_map = { + 'single_rate': post_process_single_rate + } + self.judge_function = self.judge_map[self.judge_type] + + def summarize(self, time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')): + """Summarize the subjectivity analysis based on evaluation results. + + Args: + time_str (str): Timestamp for file naming. + + Returns: + pd.DataFrame: The summary results. 
+ """ + if self.judge_type == 'pair': + return super().summarize() + + # self.judge_type == 'single' + dataset_cfgs = self.cfg['datasets'] + output_dir, results_folder = get_outdir(self.cfg, time_str) + fout_flag = 0 + output_tmp_file = osp.join(output_dir, 'result.csv') + output_file = osp.join(output_dir, 'total_result.csv') + for eval_model_cfg in self.eval_model_cfgs: + for judge_model_cfg in self.judge_model_cfgs: + eval_model_abbr = model_abbr_from_cfg(eval_model_cfg) + show_model_abbr = model_abbr_from_cfg_used_in_summarizer(eval_model_cfg) + show_judge_model_abbr = model_abbr_from_cfg_used_in_summarizer(judge_model_cfg) + judge_abbr = model_abbr_from_cfg(judge_model_cfg) + subdir_path = os.path.join(results_folder, eval_model_abbr + '_judged-by--' + judge_abbr) + if os.path.isdir(subdir_path): + for dataset in dataset_cfgs: + judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function) + show_dataset_abbr = dataset_abbr_from_cfg(dataset) + + get_capability_results(judged_answers, references, output_tmp_file, fout_flag, show_model_abbr, show_judge_model_abbr, show_dataset_abbr) + fout_flag += 1 + else: + print(subdir_path + ' is not exist! please check!') + with open(output_tmp_file, 'r') as f: + csv_reader = csv.reader(f) + header = next(csv_reader) + table = [line for line in csv_reader] + + new_header = [''] + [line[0] for line in table] + new_table = [[h] + line[1:] for h, line in zip(header[1:], table)] + new_table = [[h] + [line[i] for line in table] for i, h in enumerate(header[1:], start=1)] + t = tabulate(new_table, headers=new_header) + with open(output_file, 'a') as f: + f.write(','.join(new_header) + '\n') + for line in new_table: + f.write(','.join(map(str, line)) + '\n') + print(t) + print(output_file) From aa43eaf267199a1f91de3e10afdeada339d0e05d Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Wed, 25 Sep 2024 17:07:27 +0800 Subject: [PATCH 06/20] [CI] add more models into testcase and test env of cu12 (#1558) * update * update * Update pr-run-test.yml * update * update * update * update * Update daily-run-test.yml * update * updaste * update * update * update * Update daily-run-test.yml * update * update * Update daily-run-test.yml * Update daily-run-test.yml * update * update * update * update * update * Update daily-run-test.yml * update --------- Co-authored-by: zhulin1 --- .github/scripts/eval_regression_base.py | 26 ++++-- .github/scripts/eval_regression_chat.py | 34 +++++-- .github/scripts/oc_score_assert.py | 43 +++++---- .github/scripts/oc_score_baseline.yaml | 114 +++++++++++++++++++++++- .github/workflows/daily-run-test.yml | 75 ++++++++++------ .github/workflows/pr-run-test.yml | 2 +- 6 files changed, 235 insertions(+), 59 deletions(-) diff --git a/.github/scripts/eval_regression_base.py b/.github/scripts/eval_regression_base.py index 8b4c64468..12339ecfa 100644 --- a/.github/scripts/eval_regression_base.py +++ b/.github/scripts/eval_regression_base.py @@ -8,15 +8,17 @@ race_datasets # noqa: F401, E501 from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \ models as hf_deepseek_moe_16b_base_model # noqa: F401, E501 + from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \ + models as hf_deepseek_v2_lite_model # noqa: F401, E501 # read hf models - chat models from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \ models as lmdeploy_deepseek_7b_base_model # noqa: F401, E501 from 
opencompass.configs.models.deepseek.vllm_deepseek_moe_16b_base import \ models as vllm_deepseek_moe_16b_base_model # noqa: F401, E501 - from opencompass.configs.models.gemma.hf_gemma_2b import \ - models as hf_gemma_2b_model # noqa: F401, E501 - from opencompass.configs.models.gemma.hf_gemma_7b import \ - models as hf_gemma_7b_model # noqa: F401, E501 + from opencompass.configs.models.gemma.hf_gemma2_2b import \ + models as hf_gemma2_2b_model # noqa: F401, E501 + from opencompass.configs.models.gemma.hf_gemma2_9b import \ + models as hf_gemma2_9b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \ models as hf_internlm2_5_7b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \ @@ -31,16 +33,28 @@ models as lmdeploy_internlm2_7b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \ models as lmdeploy_internlm2_base_7b_model # noqa: F401, E501 + from opencompass.configs.models.hf_llama.hf_llama2_7b import \ + models as hf_llama2_7b_model # noqa: F401, E501 + from opencompass.configs.models.hf_llama.hf_llama3_8b import \ + models as hf_llama3_8b_model # noqa: F401, E501 + from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \ + models as lmdeploy_llama3_1_8b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \ models as lmdeploy_llama3_8b_model # noqa: F401, E501 - from opencompass.configs.models.mistral.hf_mistral_7b_v0_2 import \ - models as hf_mistral_7b_v0_2_model # noqa: F401, E501 + from opencompass.configs.models.mistral.hf_mistral_7b_v0_3 import \ + models as hf_mistral_7b_v0_3_model # noqa: F401, E501 from opencompass.configs.models.mistral.vllm_mistral_7b_v0_2 import \ models as vllm_mistral_7b_v0_2_model # noqa: F401, E501 + from opencompass.configs.models.mistral.vllm_mixtral_8x7b_v0_1 import \ + models as vllm_mixtral_8x7b_v0_1_model # noqa: F401, E501 from opencompass.configs.models.qwen.hf_qwen1_5_moe_a2_7b import \ models as hf_qwen1_5_moe_a2_7b_model # noqa: F401, E501 from opencompass.configs.models.qwen.hf_qwen2_0_5b import \ models as hf_qwen2_0_5b_model # noqa: F401, E501 + from opencompass.configs.models.qwen.hf_qwen2_1_5b import \ + models as hf_qwen2_1_5b_model # noqa: F401, E501 + from opencompass.configs.models.qwen.hf_qwen2_7b import \ + models as hf_qwen2_7b_model # noqa: F401, E501 from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b import \ models as lmdeploy_qwen2_1_5b_model # noqa: F401, E501 from opencompass.configs.models.qwen.lmdeploy_qwen2_7b import \ diff --git a/.github/scripts/eval_regression_chat.py b/.github/scripts/eval_regression_chat.py index 1ee28e630..fa28562f4 100644 --- a/.github/scripts/eval_regression_chat.py +++ b/.github/scripts/eval_regression_chat.py @@ -13,20 +13,32 @@ models as hf_baichuan2_7b_chat_model # noqa: F401, E501 from opencompass.configs.models.chatglm.hf_glm4_9b_chat import \ models as hf_glm4_9b_chat_model # noqa: F401, E501 + from opencompass.configs.models.chatglm.lmdeploy_glm4_9b_chat import \ + models as lmdeploy_glm4_9b_chat_model # noqa: F401, E501 + from opencompass.configs.models.chatglm.vllm_glm4_9b_chat import \ + models as vllm_glm4_9b_chat_model # noqa: F401, E501 from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import \ models as hf_deepseek_7b_chat_model # noqa: F401, E501 from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_chat import \ models as hf_deepseek_moe_16b_chat_model # noqa: F401, E501 
+ from opencompass.configs.models.deepseek.hf_deepseek_v2_lite_chat import \ + models as hf_deepseek_v2_lite_chat_model # noqa: F401, E501 from opencompass.configs.models.deepseek.vllm_deepseek_7b_chat import \ models as vllm_deepseek_7b_chat_model # noqa: F401, E501 - from opencompass.configs.models.gemma.hf_gemma_2b_it import \ - models as hf_gemma_2b_it_model # noqa: F401, E501 - from opencompass.configs.models.gemma.hf_gemma_7b_it import \ - models as hf_gemma_7b_it_model # noqa: F401, E501 + from opencompass.configs.models.gemma.hf_gemma2_2b_it import \ + models as hf_gemma2_2b_it_model # noqa: F401, E501 + from opencompass.configs.models.gemma.hf_gemma2_9b_it import \ + models as hf_gemma2_9b_it_model # noqa: F401, E501 + from opencompass.configs.models.gemma.vllm_gemma_7b_it import \ + models as vllm_gemma_7b_it_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \ models as hf_internlm2_5_7b_chat_model # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.hf_internlm2_5_20b_chat import \ + models as hf_internlm2_5_20b_chat_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \ models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import \ + models as lmdeploy_internlm2_5_20b_chat_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import \ models as lmdeploy_internlm2_chat_1_8b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b_sft import \ @@ -37,14 +49,20 @@ models as lmdeploy_internlm2_chat_7b_sft_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.vllm_internlm2_chat_7b import \ models as vllm_internlm2_chat_7b_model # noqa: F401, E501 + from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import \ + models as hf_llama3_1_8b_instruct_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \ models as hf_llama3_8b_instruct_model # noqa: F401, E501 + from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \ + models as lmdeploy_llama3_1_8b_instruct_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \ models as lmdeploy_llama3_8b_instruct_model # noqa: F401, E501 - from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \ - models as hf_mistral_7b_instruct_v0_2_model # noqa: F401, E501 + from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_3 import \ + models as hf_mistral_7b_instruct_v0_3_model # noqa: F401, E501 from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \ models as vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501 + from opencompass.configs.models.mistral.vllm_mixtral_8x7b_instruct_v0_1 import \ + models as vllm_mixtral_8x7b_instruct_v0_1_model # noqa: F401, E501 from opencompass.configs.models.openbmb.hf_minicpm_2b_dpo_fp32 import \ models as hf_minicpm_2b_dpo_fp32_model # noqa: F401, E501 from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import \ @@ -57,6 +75,10 @@ models as hf_phi_3_mini_8k_instruct_model # noqa: F401, E501 from opencompass.configs.models.qwen.hf_qwen1_5_0_5b_chat import \ models as hf_qwen1_5_0_5b_chat_model # noqa: F401, E501 + from opencompass.configs.models.qwen.hf_qwen2_1_5b_instruct import \ + models as hf_qwen2_1_5b_instruct_model # noqa: F401, 
E501 + from opencompass.configs.models.qwen.hf_qwen2_7b_instruct import \ + models as hf_qwen2_7b_instruct_model # noqa: F401, E501 from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import \ models as lmdeploy_qwen2_1_5b_instruct_model # noqa: F401, E501 from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \ diff --git a/.github/scripts/oc_score_assert.py b/.github/scripts/oc_score_assert.py index f869b157b..6f2c0a11a 100644 --- a/.github/scripts/oc_score_assert.py +++ b/.github/scripts/oc_score_assert.py @@ -7,30 +7,35 @@ output_path = 'regression_result_daily' chat_model_list = [ - 'baichuan2-7b-chat-hf', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf', - 'deepseek-7b-chat-vllm', 'gemma-2b-it-hf', 'gemma-7b-it-hf', - 'internlm2_5-7b-chat-hf', 'internlm2_5-7b-chat-turbomind', - 'internlm2-chat-1.8b-turbomind', 'internlm2-chat-1.8b-sft-turbomind', - 'internlm2-chat-7b-turbomind', 'internlm2-chat-7b-sft-turbomind', - 'internlm2-chat-7b-vllm', 'llama-3-8b-instruct-hf', - 'llama-3-8b-instruct-turbomind', 'mistral-7b-instruct-v0.2-hf', - 'mistral-7b-instruct-v0.2-vllm', 'minicpm-2b-dpo-fp32-hf', - 'minicpm-2b-sft-bf16-hf', 'minicpm-2b-sft-fp32-hf', - 'phi-3-mini-4k-instruct-hf', 'qwen1.5-0.5b-chat-hf', + 'baichuan2-7b-chat-hf', 'glm-4-9b-chat-turbomind', 'glm-4-9b-chat-vllm', + 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf', + 'deepseek-v2-lite-chat-hf', 'deepseek-7b-chat-vllm', 'gemma2-2b-it-hf', + 'gemma2-9b-it-hf', 'gemma-7b-it-vllm', 'internlm2_5-7b-chat-hf', + 'internlm2_5-20b-chat-hf', 'internlm2_5-7b-chat-turbomind', + 'internlm2_5-20b-chat-turbomind', 'internlm2-chat-1.8b-turbomind', + 'internlm2-chat-1.8b-sft-turbomind', 'internlm2-chat-7b-turbomind', + 'internlm2-chat-7b-sft-turbomind', 'internlm2-chat-7b-vllm', + 'llama-3_1-8b-instruct-hf', 'llama-3-8b-instruct-hf', + 'llama-3_1-8b-instruct-turbomind', 'llama-3-8b-instruct-turbomind', + 'mistral-7b-instruct-v0.3-hf', 'mistral-7b-instruct-v0.2-vllm', + 'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf', + 'minicpm-2b-sft-fp32-hf', 'phi-3-mini-4k-instruct-hf', + 'qwen1.5-0.5b-chat-hf', 'qwen2-1.5b-instruct-hf', 'qwen2-7b-instruct-hf', 'qwen2-1.5b-instruct-turbomind', 'qwen2-7b-instruct-turbomind', 'qwen1.5-0.5b-chat-vllm', 'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf', 'lmdeploy-api-test' ] base_model_list = [ - 'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind', - 'deepseek-moe-16b-base-vllm', 'gemma-2b-hf', 'gemma-7b-hf', - 'internlm2_5-7b-hf', 'internlm2-7b-hf', 'internlm2-base-7b-hf', - 'internlm2_5-7b-turbomind', 'internlm2-1.8b-turbomind', - 'internlm2-7b-turbomind', 'internlm2-base-7b-hf', - 'internlm2-base-7b-turbomind', 'llama-3-8b-turbomind', - 'mistral-7b-v0.2-hf', 'mistral-7b-v0.2-vllm', 'qwen1.5-moe-a2.7b-hf', - 'qwen2-0.5b-hf', 'qwen2-1.5b-turbomind', 'qwen2-7b-turbomind', - 'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf' + 'deepseek-moe-16b-base-hf', 'deepseek-v2-lite-hf', + 'deepseek-7b-base-turbomind', 'deepseek-moe-16b-base-vllm', 'gemma2-2b-hf', + 'gemma2-9b-hf', 'internlm2_5-7b-hf', 'internlm2-7b-hf', + 'internlm2-base-7b-hf', 'internlm2-1.8b-turbomind', + 'internlm2_5-7b-turbomind', 'internlm2-7b-turbomind', + 'internlm2-base-7b-turbomind', 'llama-2-7b-hf', 'llama-3-8b-hf', + 'llama-3.1-8b-turbomind', 'llama-3-8b-turbomind', 'mistral-7b-v0.3-hf', + 'mistral-7b-v0.2-vllm', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf', + 'qwen2-1.5b-hf', 'qwen2-7b-hf', 'qwen2-1.5b-turbomind', + 'qwen2-7b-turbomind', 'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf' ] dataset_list = ['gsm8k', 
'race-middle', 'race-high'] diff --git a/.github/scripts/oc_score_baseline.yaml b/.github/scripts/oc_score_baseline.yaml index d7e765be2..9690aa2c5 100644 --- a/.github/scripts/oc_score_baseline.yaml +++ b/.github/scripts/oc_score_baseline.yaml @@ -8,6 +8,16 @@ glm-4-9b-chat-hf: race-middle: 88 race-high: 88 +glm-4-9b-chat-turbomind: + gsm8k: 69 + race-middle: 82 + race-high: 77 + +glm-4-9b-chat-vllm: + gsm8k: 73 + race-middle: 87 + race-high: 87 + deepseek-7b-chat-hf: gsm8k: 60 race-middle: 74 @@ -18,6 +28,11 @@ deepseek-moe-16b-chat-hf: race-middle: 62 race-high: 70 +deepseek-v2-lite-chat-hf: + gsm8k: 59 + race-middle: 82 + race-high: 79 + deepseek-7b-chat-vllm: gsm8k: 63 race-middle: 74 @@ -33,23 +48,48 @@ gemma-7b-it-hf: race-middle: 74 race-high: 71 +gemma-7b-it-vllm: + gsm8k: 38 + race-middle: 75 + race-high: 70 + +gemma2-2b-it-hf: + gsm8k: 62 + race-middle: 75 + race-high: 67 + +gemma2-9b-it-hf: + gsm8k: 80 + race-middle: 89 + race-high: 85 + internlm2_5-7b-chat-hf: gsm8k: 86 race-middle: 92 race-high: 93 +internlm2_5-20b-chat-hf: + gsm8k: 91 + race-middle: 95 + race-high: 91 + internlm2_5-7b-chat-turbomind: gsm8k: 87 race-middle: 92 race-high: 93 +internlm2_5-20b-chat-turbomind: + gsm8k: 91 + race-middle: 95 + race-high: 91 + internlm2-chat-1.8b-turbomind: gsm8k: 40 race-middle: 82 race-high: 83 internlm2-chat-1.8b-sft-turbomind: - gsm8k: 32 + gsm8k: 34 race-middle: 81 race-high: 83 @@ -68,11 +108,21 @@ internlm2-chat-7b-vllm: race-middle: 90 race-high: 91 +llama-3_1-8b-instruct-hf: + gsm8k: 82 + race-middle: 82 + race-high: 88 + llama-3-8b-instruct-hf: gsm8k: 77 race-middle: 85 race-high: 87 +llama-3_1-8b-instruct-turbomind: + gsm8k: 79 + race-middle: 82 + race-high: 88 + llama-3-8b-instruct-turbomind: gsm8k: 77 race-middle: 85 @@ -83,6 +133,11 @@ mistral-7b-instruct-v0.2-hf: race-middle: 82 race-high: 78 +mistral-7b-instruct-v0.3-hf: + gsm8k: 53 + race-middle: 80 + race-high: 78 + mistral-7b-instruct-v0.2-vllm: gsm8k: 49 race-middle: 81 @@ -118,6 +173,11 @@ qwen1.5-0.5b-chat-hf: race-middle: 55 race-high: 50 +qwen2-1.5b-instruct-hf: + gsm8k: 63 + race-middle: 77 + race-high: 86 + qwen2-1.5b-instruct-turbomind: gsm8k: 60 race-middle: 77 @@ -128,6 +188,11 @@ qwen2-7b-instruct-turbomind: race-middle: 87 race-high: 89 +qwen2-7b-instruct-hf: + gsm8k: 85 + race-middle: 87 + race-high: 91 + qwen1.5-0.5b-chat-vllm: gsm8k: 5 race-middle: 57 @@ -153,6 +218,11 @@ deepseek-moe-16b-base-hf: race-middle: 35 race-high: 23 +deepseek-v2-lite-hf: + gsm8k: 37 + race-middle: 56 + race-high: 62 + deepseek-7b-base-turbomind: gsm8k: 21 race-middle: 42 @@ -173,8 +243,18 @@ gemma-7b-hf: race-middle: 59 race-high: 66 +gemma2-2b-hf: + gsm8k: 8 + race-middle: 31 + race-high: 30 + +gemma2-9b-hf: + gsm8k: 20 + race-middle: 42 + race-high: 35 + internlm2_5-7b-hf: - gsm8k: 46 + gsm8k: 47 race-middle: 92 race-high: 91 @@ -208,6 +288,21 @@ internlm2-base-7b-turbomind: race-middle: 75 race-high: 81 +llama-2-7b-hf: + gsm8k: 17 + race-middle: 32 + race-high: 38 + +llama-3-8b-hf: + gsm8k: 48 + race-middle: 64 + race-high: 70 + +llama-3.1-8b-turbomind: + gsm8k: 57 + race-middle: 67 + race-high: 75 + llama-3-8b-turbomind: gsm8k: 52 race-middle: 63 @@ -218,6 +313,11 @@ mistral-7b-v0.2-hf: race-middle: 42 race-high: 60 +mistral-7b-v0.3-hf: + gsm8k: 43 + race-middle: 42 + race-high: 60 + mistral-7b-v0.2-vllm: gsm8k: 45 race-middle: 42 @@ -228,11 +328,21 @@ qwen1.5-moe-a2.7b-hf: race-middle: 78 race-high: 90 +qwen2-1.5b-hf: + gsm8k: 58 + race-middle: 65 + race-high: 78 + qwen2-0.5b-hf: gsm8k: 35 race-middle: 52 race-high: 
48 +qwen2-7b-hf: + gsm8k: 82 + race-middle: 88 + race-high: 89 + qwen2-1.5b-turbomind: gsm8k: 57 race-middle: 64 diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml index 7d7affafc..894b149e0 100644 --- a/.github/workflows/daily-run-test.yml +++ b/.github/workflows/daily-run-test.yml @@ -14,9 +14,14 @@ env: PIP_CACHE_PATH: /cpfs01/user/qa-llm-cicd/.cache/pip USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub + HUGGINGFACE_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub + HF_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub DATEASET_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/llm-evaluation-datasets HF_DATASETS_OFFLINE: 1 + HF_EVALUATE_OFFLINE: 1 TRANSFORMERS_OFFLINE: 1 + VLLM_USE_MODELSCOPE: false + LMDEPLOY_USE_MODELSCOPE: false HF_HUB_OFFLINE: 1 TRITON_PTXAS_PATH: /usr/local/cuda/bin/ptxas @@ -43,7 +48,11 @@ jobs: daily_run_test: needs: build-pypi - runs-on: self-hosted + strategy: + fail-fast: false + matrix: + cuda_env: [dsw_cu11, dsw_cu12] + runs-on: ${{ matrix.cuda_env }} environment: 'prod' timeout-minutes: 420 #7hours steps: @@ -53,22 +62,38 @@ jobs: uses: actions/download-artifact@v4 with: name: my-artifact-${{ github.run_id }} - - name: Prepare - create conda env and install torch + - name: Prepare - create conda env and install torch - cu11 + if: ${{matrix.cuda_env == 'dsw_cu11'}} run: | . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda create -y --name ${{env.CONDA_ENV}} python=3.10 - conda activate ${{env.CONDA_ENV}} - pip install opencompass*.whl - pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.5.0+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} - pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.5.5+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} - - pip install human_eval transformers protobuf pytest gguf msgspec librosa vllm_flash_attn bitsandbytes --cache-dir ${{env.PIP_CACHE_PATH}} + conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10 + conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} + pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}} + pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.6.0+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} + pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.6.1.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} + pip install human_eval transformers protobuf pytest gguf msgspec librosa vllm_flash_attn bitsandbytes modelscope --cache-dir ${{env.PIP_CACHE_PATH}} pip uninstall torch torchvision torchaudio -y pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118 FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} conda info --envs pip list + - name: Prepare - create conda env and install torch - cu12 + if: ${{matrix.cuda_env == 'dsw_cu12'}} + run: | + . 
/cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate + conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10 + conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} + pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}} + pip install lmdeploy==0.6.0 --cache-dir ${{env.PIP_CACHE_PATH}} --no-cache-dir + pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}} + pip install human_eval transformers protobuf pytest gguf msgspec librosa vllm_flash_attn bitsandbytes modelscope --cache-dir ${{env.PIP_CACHE_PATH}} + pip uninstall torch torchvision torchaudio -y + pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} + FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl + pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} + conda info --envs + pip list - name: Prepare - prepare data and hf model run: | ln -s ${{env.DATEASET_CACHE_PATH}} data @@ -77,45 +102,45 @@ jobs: - name: Run chat model test run: | . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda activate ${{env.CONDA_ENV}} + conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} conda info --envs sed -i 's/judgemodel/'$(tail -n 1 /cpfs01/shared/public/llmeval/share_info/compassjuder_ip.txt)'/g' .github/scripts/eval_regression_chat.py - python3 run.py .github/scripts/eval_regression_chat.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat --reuse - rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat/*/summary regression_result_daily + opencompass .github/scripts/eval_regression_chat.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat_${{ matrix.cuda_env }} --reuse --max-num-workers 2 + rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat_${{ matrix.cuda_env }}/*/summary regression_result_daily python -m pytest -m chat -s -v --color=yes .github/scripts/oc_score_assert.py - name: Run base model test run: | . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda activate ${{env.CONDA_ENV}} + conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} conda info --envs - python3 run.py .github/scripts/eval_regression_base.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base --reuse - rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base/*/summary regression_result_daily + opencompass .github/scripts/eval_regression_base.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base_${{ matrix.cuda_env }} --reuse --max-num-workers 2 + rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base_${{ matrix.cuda_env }}/*/summary regression_result_daily python -m pytest -m base -s -v --color=yes .github/scripts/oc_score_assert.py - name: Run command testcase run: | . 
/cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda activate ${{env.CONDA_ENV}} + conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} conda info --envs export from_tf=TRUE python tools/list_configs.py internlm2_5 mmlu - python run.py --models hf_internlm2_5_7b --datasets race_ppl --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1 --reuse - rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1/*/summary regression_result_daily + opencompass --models hf_internlm2_5_7b --datasets race_ppl --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }} --reuse --max-num-workers 2 + rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }}/*/summary regression_result_daily python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py - python run.py --models hf_internlm2_5_7b_chat --datasets race_gen -a lmdeploy --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2 --reuse - rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2/*/summary regression_result_daily + opencompass --models hf_internlm2_5_7b_chat hf_internlm2_5_1_8b_chat --datasets race_gen -a lmdeploy --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 + rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }}/*/summary regression_result_daily python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py - python run.py --datasets race_ppl --hf-type base --hf-path internlm/internlm2_5-7b --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3 --reuse - rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3/*/summary regression_result_daily + opencompass --datasets race_ppl --hf-type base --hf-path internlm/internlm2_5-7b --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3 --reuse --max-num-workers 2 + rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }}/*/summary regression_result_daily python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py - python run.py --datasets race_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4 --reuse - rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4/*/summary regression_result_daily + opencompass --datasets race_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 + rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }}/*/summary regression_result_daily python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py - name: Remove Conda Env if: always() run: | rm -rf regression_result_daily . 
/cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda env remove -y --name ${{env.CONDA_ENV}} + conda env remove -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} conda info --envs notify_to_feishu: diff --git a/.github/workflows/pr-run-test.yml b/.github/workflows/pr-run-test.yml index 6cab13786..d9fcdc3ae 100644 --- a/.github/workflows/pr-run-test.yml +++ b/.github/workflows/pr-run-test.yml @@ -51,7 +51,7 @@ jobs: conda activate ${{env.CONDA_ENV}} conda info --envs rm -rf regression_result - python3 run.py --models hf_internlm2_chat_7b --datasets siqa_gen --work-dir regression_result --debug + opencompass --models hf_internlm2_chat_7b --datasets siqa_gen --work-dir regression_result --debug - name: Get result run: | score=$(sed -n '$p' regression_result/*/summary/*.csv | awk -F ',' '{print $NF}') From 80cda1980e8725b713845675711f9e269025478d Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Wed, 25 Sep 2024 20:58:34 +0800 Subject: [PATCH 07/20] [BUG] fix followbench dataset config (#1564) * [BUG] fix followbench dataset config * [BUG] fix followbench dataset config --- .../datasets/subjective/followbench/followbench_llmeval.py | 4 ++-- .../datasets/subjective/followbench/followbench_llmeval.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/datasets/subjective/followbench/followbench_llmeval.py b/configs/datasets/subjective/followbench/followbench_llmeval.py index 0733340ed..e601bda34 100644 --- a/configs/datasets/subjective/followbench/followbench_llmeval.py +++ b/configs/datasets/subjective/followbench/followbench_llmeval.py @@ -15,7 +15,7 @@ ] data_path ='data/subjective/followbench/converted_data' -followbench_llmeval_dataset = [] +followbench_llmeval_datasets = [] for _name in subjective_all_sets: subjective_infer_cfg = dict( @@ -48,7 +48,7 @@ pred_role='BOT', ) - followbench_llmeval_dataset.append( + followbench_llmeval_datasets.append( dict( abbr=f'{_name}', type=FollowBenchDataset, diff --git a/opencompass/configs/datasets/subjective/followbench/followbench_llmeval.py b/opencompass/configs/datasets/subjective/followbench/followbench_llmeval.py index 0733340ed..e601bda34 100644 --- a/opencompass/configs/datasets/subjective/followbench/followbench_llmeval.py +++ b/opencompass/configs/datasets/subjective/followbench/followbench_llmeval.py @@ -15,7 +15,7 @@ ] data_path ='data/subjective/followbench/converted_data' -followbench_llmeval_dataset = [] +followbench_llmeval_datasets = [] for _name in subjective_all_sets: subjective_infer_cfg = dict( @@ -48,7 +48,7 @@ pred_role='BOT', ) - followbench_llmeval_dataset.append( + followbench_llmeval_datasets.append( dict( abbr=f'{_name}', type=FollowBenchDataset, From 3f833186dc8c757125420660041f30f664e7dbfc Mon Sep 17 00:00:00 2001 From: Yi Ding Date: Thu, 26 Sep 2024 16:49:52 +0800 Subject: [PATCH 08/20] [Feature] Support the reasoning from BaiLing LLM (#1541) * [Feature] Support the reasoning from BaiLing LLM This commit includes the access to BaiLing LLM and gets the reasoning. * Add the api example The example of evalute bailing api * Revise the generation arguments Based on current experiment, we update some generation arguments for better reasoning * [fix] set the batch size * Retry under flowcontrol of serverside * add dependent package into requirement.txt add dependent package retrying to clean up the pre-comment check. * correct the file names and make the file copy correct the file names. 
copy the files under configs to opencompass * fix the lint issue --------- Co-authored-by: christopher.dy --- configs/api_examples/eval_api_bailing.py | 38 ++++ .../models/bailing_api/bailing-lite-0830.py | 31 +++ .../models/bailing_api/bailing-pro-0920.py | 31 +++ .../models/bailing_api/bailing-lite-0830.py | 31 +++ .../models/bailing_api/bailing-pro-0920.py | 31 +++ opencompass/models/__init__.py | 4 +- opencompass/models/bailing_api_oc.py | 215 ++++++++++++++++++ requirements/runtime.txt | 1 + 8 files changed, 380 insertions(+), 2 deletions(-) create mode 100644 configs/api_examples/eval_api_bailing.py create mode 100644 configs/models/bailing_api/bailing-lite-0830.py create mode 100644 configs/models/bailing_api/bailing-pro-0920.py create mode 100644 opencompass/configs/models/bailing_api/bailing-lite-0830.py create mode 100644 opencompass/configs/models/bailing_api/bailing-pro-0920.py create mode 100644 opencompass/models/bailing_api_oc.py diff --git a/configs/api_examples/eval_api_bailing.py b/configs/api_examples/eval_api_bailing.py new file mode 100644 index 000000000..15101b09f --- /dev/null +++ b/configs/api_examples/eval_api_bailing.py @@ -0,0 +1,38 @@ +from mmengine.config import read_base + +from opencompass.models import BailingAPI +from opencompass.partitioners import NaivePartitioner +from opencompass.runners.local_api import LocalAPIRunner +from opencompass.tasks import OpenICLInferTask + +with read_base(): + from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets + from opencompass.configs.summarizers.medium import summarizer + +datasets = [ + *ceval_datasets, +] + +models = [ + dict( + path="Bailing-Lite-0830", + token="xxxxxx", # set your key here or in environment variable BAILING_API_KEY + url="https://bailingchat.alipay.com/chat/completions", + type=BailingAPI, + generation_kwargs={}, + query_per_second=1, + max_seq_len=4096, + ), +] + +infer = dict( + partitioner=dict(type=NaivePartitioner), + runner=dict( + type=LocalAPIRunner, + max_num_workers=2, + concurrent_users=2, + task=dict(type=OpenICLInferTask), + ), +) + +work_dir = "outputs/api_bailing/" diff --git a/configs/models/bailing_api/bailing-lite-0830.py b/configs/models/bailing_api/bailing-lite-0830.py new file mode 100644 index 000000000..1a43b4be1 --- /dev/null +++ b/configs/models/bailing_api/bailing-lite-0830.py @@ -0,0 +1,31 @@ +from opencompass.models import BailingAPI + +api_meta_template = dict( + round=[ + dict(role="HUMAN", api_role="HUMAN"), + dict(role="BOT", api_role="BOT", generate=False), + ], + reserved_roles=[dict(role="SYSTEM", api_role="SYSTEM")], +) + +models = [ + dict( + path="Bailing-Lite-0830", + token="", # set your key here or in environment variable BAILING_API_KEY + url="https://bailingchat.alipay.com/chat/completions", + type=BailingAPI, + meta_template=api_meta_template, + query_per_second=1, + max_seq_len=4096, + batch_size=1, + generation_kwargs={ + "temperature": 0.4, + "top_p": 1.0, + "top_k": -1, + "n": 1, + "logprobs": 1, + "use_beam_search": False, + }, + ), +] + diff --git a/configs/models/bailing_api/bailing-pro-0920.py b/configs/models/bailing_api/bailing-pro-0920.py new file mode 100644 index 000000000..35814bf79 --- /dev/null +++ b/configs/models/bailing_api/bailing-pro-0920.py @@ -0,0 +1,31 @@ +from opencompass.models import BailingAPI + +api_meta_template = dict( + round=[ + dict(role="HUMAN", api_role="HUMAN"), + dict(role="BOT", api_role="BOT", generate=False), + ], + reserved_roles=[dict(role="SYSTEM", api_role="SYSTEM")], +) + +models = [ + dict( 
+ path="Bailing-Pro-0920", + token="", # set your key here or in environment variable BAILING_API_KEY + url="https://bailingchat.alipay.com/chat/completions", + type=BailingAPI, + meta_template=api_meta_template, + query_per_second=1, + max_seq_len=4096, + batch_size=1, + generation_kwargs={ + "temperature": 0.4, + "top_p": 1.0, + "top_k": -1, + "n": 1, + "logprobs": 1, + "use_beam_search": False, + }, + ), +] + diff --git a/opencompass/configs/models/bailing_api/bailing-lite-0830.py b/opencompass/configs/models/bailing_api/bailing-lite-0830.py new file mode 100644 index 000000000..1a43b4be1 --- /dev/null +++ b/opencompass/configs/models/bailing_api/bailing-lite-0830.py @@ -0,0 +1,31 @@ +from opencompass.models import BailingAPI + +api_meta_template = dict( + round=[ + dict(role="HUMAN", api_role="HUMAN"), + dict(role="BOT", api_role="BOT", generate=False), + ], + reserved_roles=[dict(role="SYSTEM", api_role="SYSTEM")], +) + +models = [ + dict( + path="Bailing-Lite-0830", + token="", # set your key here or in environment variable BAILING_API_KEY + url="https://bailingchat.alipay.com/chat/completions", + type=BailingAPI, + meta_template=api_meta_template, + query_per_second=1, + max_seq_len=4096, + batch_size=1, + generation_kwargs={ + "temperature": 0.4, + "top_p": 1.0, + "top_k": -1, + "n": 1, + "logprobs": 1, + "use_beam_search": False, + }, + ), +] + diff --git a/opencompass/configs/models/bailing_api/bailing-pro-0920.py b/opencompass/configs/models/bailing_api/bailing-pro-0920.py new file mode 100644 index 000000000..35814bf79 --- /dev/null +++ b/opencompass/configs/models/bailing_api/bailing-pro-0920.py @@ -0,0 +1,31 @@ +from opencompass.models import BailingAPI + +api_meta_template = dict( + round=[ + dict(role="HUMAN", api_role="HUMAN"), + dict(role="BOT", api_role="BOT", generate=False), + ], + reserved_roles=[dict(role="SYSTEM", api_role="SYSTEM")], +) + +models = [ + dict( + path="Bailing-Pro-0920", + token="", # set your key here or in environment variable BAILING_API_KEY + url="https://bailingchat.alipay.com/chat/completions", + type=BailingAPI, + meta_template=api_meta_template, + query_per_second=1, + max_seq_len=4096, + batch_size=1, + generation_kwargs={ + "temperature": 0.4, + "top_p": 1.0, + "top_k": -1, + "n": 1, + "logprobs": 1, + "use_beam_search": False, + }, + ), +] + diff --git a/opencompass/models/__init__.py b/opencompass/models/__init__.py index 403eb5d6a..0beb963a1 100644 --- a/opencompass/models/__init__.py +++ b/opencompass/models/__init__.py @@ -3,6 +3,7 @@ from .alaya import AlayaLM # noqa: F401 from .baichuan_api import BaiChuan # noqa: F401 from .baidu_api import ERNIEBot # noqa: F401 +from .bailing_api_oc import BailingAPI # noqa: F401 from .base import BaseModel, LMTemplateParser # noqa: F401 from .base_api import APITemplateParser, BaseAPIModel # noqa: F401 from .bytedance_api import ByteDance # noqa: F401 @@ -41,8 +42,7 @@ from .stepfun_api import StepFun # noqa: F401 from .turbomind import TurboMindModel # noqa: F401 from .turbomind_tis import TurboMindTisModel # noqa: F401 -from .turbomind_with_tf_above_v4_33 import \ - TurboMindModelwithChatTemplate # noqa: F401 +from .turbomind_with_tf_above_v4_33 import TurboMindModelwithChatTemplate # noqa: F401 from .unigpt_api import UniGPT # noqa: F401 from .vllm import VLLM # noqa: F401 from .vllm_with_tf_above_v4_33 import VLLMwithChatTemplate # noqa: F401 diff --git a/opencompass/models/bailing_api_oc.py b/opencompass/models/bailing_api_oc.py new file mode 100644 index 000000000..6ff75e0d5 --- /dev/null +++ 
b/opencompass/models/bailing_api_oc.py @@ -0,0 +1,215 @@ +import concurrent +import concurrent.futures +import os +import socket +import traceback +from typing import Dict, List, Optional, Union + +import requests +from requests.adapters import HTTPAdapter +from retrying import retry +from urllib3.connection import HTTPConnection + +from opencompass.utils.prompt import PromptList + +from .base_api import BaseAPIModel + +PromptType = Union[PromptList, str] + + +class HTTPAdapterWithSocketOptions(HTTPAdapter): + def __init__(self, *args, **kwargs): + self._socket_options = HTTPConnection.default_socket_options + [ + (socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1), + (socket.SOL_TCP, socket.TCP_KEEPIDLE, 75), + (socket.SOL_TCP, socket.TCP_KEEPINTVL, 30), + (socket.SOL_TCP, socket.TCP_KEEPCNT, 120), + ] + super(HTTPAdapterWithSocketOptions, self).__init__(*args, **kwargs) + + def init_poolmanager(self, *args, **kwargs): + if self._socket_options is not None: + kwargs["socket_options"] = self._socket_options + super(HTTPAdapterWithSocketOptions, self).init_poolmanager(*args, **kwargs) + + +class BailingAPI(BaseAPIModel): + """Model wrapper around Bailing Service. + + Args: + ouput_key (str): key for prediction + query_per_second (int): The maximum queries allowed per second + between two consecutive calls of the API. Defaults to 1. + generation_kwargs: other params + retry (int): Number of retires if the API call fails. Defaults to 2. + """ + + def __init__( + self, + path: str, + token: str, + url: str, + meta_template: Optional[Dict] = None, + query_per_second: int = 1, + retry: int = 3, + generation_kwargs: Dict = {}, + max_seq_len=4096, + ): + super().__init__( + path=path, + max_seq_len=max_seq_len, + query_per_second=query_per_second, + meta_template=meta_template, + retry=retry, + generation_kwargs=generation_kwargs, + ) + + self.logger.info(f"Bailing API Model Init path: {path} url={url}") + if not token: + token = os.environ.get("BAILING_API_KEY") + if token: + self._headers = {"Authorization": f"Bearer {token}"} + else: + raise RuntimeError(f"There is not valid token.") + self._headers["Content-Type"] = "application/json" + self._url = url if url else "https://bailingchat.alipay.com/chat/completions" + self._model = path + self._sessions = [] + self._num = ( + int(os.environ.get("BAILING_API_PARALLEL_NUM")) + if os.environ.get("BAILING_API_PARALLEL_NUM") + else 1 + ) + try: + for _ in range(self._num): + adapter = HTTPAdapterWithSocketOptions() + sess = requests.Session() + sess.mount("http://", adapter) + sess.mount("https://", adapter) + self._sessions.append(sess) + except Exception as e: + self.logger.error(f"Fail to setup the session. {e}") + raise e + + def generate( + self, + inputs: Union[List[str], PromptList], + max_out_len: int = 4096, + ) -> List[str]: + """Generate results given a list of inputs. + + Args: + inputs (Union[List[str], PromptList]): A list of strings or PromptDicts. + The PromptDict should be organized in OpenCompass' API format. + max_out_len (int): The maximum length of the output. + + Returns: + List[str]: A list of generated strings. 
+ """ + with concurrent.futures.ThreadPoolExecutor( + max_workers=self._num, + ) as executor: + future_to_m = { + executor.submit( + self._generate, + self._sessions[i % self._num], + input, + max_out_len, + ): i + for i, input in enumerate(inputs) + } + results = [] + for future in concurrent.futures.as_completed(future_to_m): + m = future_to_m[future] + resp = future.result() + if resp and resp.status_code == 200: + try: + result = resp.json() + except: + results.append("") + else: + if ( + result.get("choices") + and result["choices"][0].get("message") + and result["choices"][0]["message"].get("content") + ): + results.append(result["choices"][0]["message"]["content"]) + else: + results.append("") + self.flush() + return results + + def _generate( + self, + sess, + input: Union[str, PromptList], + max_out_len: int, + ) -> str: + """Generate results given an input. + + Args: + inputs (str or PromptList): A string or PromptDict. + The PromptDict should be organized in OpenCompass' API format. + max_out_len (int): The maximum length of the output. + + Returns: + str: The generated string. + """ + if isinstance(input, str): + messages = [{"role": "user", "content": input}] + else: + messages = [] + for item in input: + content = item["prompt"] + if not content: + continue + message = {"content": content} + if item["role"] == "HUMAN": + message["role"] = "user" + elif item["role"] == "BOT": + message["role"] = "assistant" + elif item["role"] == "SYSTEM": + message["role"] = "system" + else: + message["role"] = item["role"] + messages.append(message) + request = { + "model": self._model, + "messages": messages, + "max_seq_len": max( + max_out_len if max_out_len else 4096, + self.max_seq_len if self.max_seq_len else 4096, + ), + } + request.update(self.generation_kwargs) + try: + retry_num = 0 + while retry_num < self.retry: + response = self._infer_result(request, sess) + if response.status_code == 200: + break # success + elif response.status_code == 426: + retry_num += 1 # retry + else: + raise ValueError(f"Status code = {response.status_code}") + else: + raise ValueError( + f"Exceed the maximal retry times. Last status code = {response.status_code}" + ) + except Exception as e: + self.logger.error( + f"Fail to inference request={request}; model_name={self.path}; error={e}, stack:{traceback.format_exc()}" + ) + raise e + return response + + @retry(stop_max_attempt_number=3, wait_fixed=16000) # ms + def _infer_result(self, request, sess): + response = sess.request( + "POST", + self._url, + json=request, + headers=self._headers, + timeout=500, + ) + return response diff --git a/requirements/runtime.txt b/requirements/runtime.txt index dc6389114..e7229e889 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -23,6 +23,7 @@ python-Levenshtein rank_bm25==0.2.2 rapidfuzz requests>=2.31.0 +retrying rich rouge -e git+https://github.com/Isaac-JL-Chen/rouge_chinese.git@master#egg=rouge_chinese From a7bacfdf7edeb5bea58345a91c9ba486a3195b68 Mon Sep 17 00:00:00 2001 From: Songyang Zhang Date: Thu, 26 Sep 2024 18:44:00 +0800 Subject: [PATCH 09/20] [Feature] Update CoreBench 2.0 (#1566) * [Feature] 1. Update CoreBench Base\n 2. Fix lint issue in BalingAPI * Update * Update --- configs/api_examples/eval_api_bailing.py | 8 +- configs/eval_corebench_2409_base_objective.py | 188 ++++++++++++++++++ ... 
=> eval_corebench_2409_chat_objective.py} | 26 ++- .../models/bailing_api/bailing-lite-0830.py | 25 ++- .../models/bailing_api/bailing-pro-0920.py | 25 ++- .../models/qwen2_5/lmdeploy_qwen2_5_1_5b.py | 15 ++ configs/models/qwen2_5/lmdeploy_qwen2_5_7b.py | 15 ++ .../models/bailing_api/bailing-lite-0830.py | 25 ++- .../models/bailing_api/bailing-pro-0920.py | 25 ++- .../models/qwen2_5/lmdeploy_qwen2_5_1_5b.py | 15 ++ .../models/qwen2_5/lmdeploy_qwen2_5_7b.py | 15 ++ opencompass/models/__init__.py | 3 +- opencompass/models/bailing_api_oc.py | 108 +++++----- 13 files changed, 379 insertions(+), 114 deletions(-) create mode 100644 configs/eval_corebench_2409_base_objective.py rename configs/{eval_corebench_2409_objective.py => eval_corebench_2409_chat_objective.py} (88%) create mode 100644 configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b.py create mode 100644 configs/models/qwen2_5/lmdeploy_qwen2_5_7b.py create mode 100644 opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b.py create mode 100644 opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_7b.py diff --git a/configs/api_examples/eval_api_bailing.py b/configs/api_examples/eval_api_bailing.py index 15101b09f..00640fb4f 100644 --- a/configs/api_examples/eval_api_bailing.py +++ b/configs/api_examples/eval_api_bailing.py @@ -15,9 +15,9 @@ models = [ dict( - path="Bailing-Lite-0830", - token="xxxxxx", # set your key here or in environment variable BAILING_API_KEY - url="https://bailingchat.alipay.com/chat/completions", + path='Bailing-Lite-0830', + token='xxxxxx', # set your key here or in environment variable BAILING_API_KEY + url='https://bailingchat.alipay.com/chat/completions', type=BailingAPI, generation_kwargs={}, query_per_second=1, @@ -35,4 +35,4 @@ ), ) -work_dir = "outputs/api_bailing/" +work_dir = 'outputs/api_bailing/' diff --git a/configs/eval_corebench_2409_base_objective.py b/configs/eval_corebench_2409_base_objective.py new file mode 100644 index 000000000..9c9043657 --- /dev/null +++ b/configs/eval_corebench_2409_base_objective.py @@ -0,0 +1,188 @@ +from mmengine.config import read_base +import os.path as osp +from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner +from opencompass.runners import LocalRunner +from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask + + +####################################################################### +# PART 0 Essential Configs # +####################################################################### +with read_base(): + # Datasets Part + ## Core Set + # ## Examination + from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import mmlu_datasets + from opencompass.configs.datasets.mmlu_pro.mmlu_pro_few_shot_gen_bfaf90 import \ + mmlu_pro_datasets + from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \ + cmmlu_datasets + # ## Reasoning + from opencompass.configs.datasets.bbh.bbh_gen_98fba6 import bbh_datasets + from opencompass.configs.datasets.hellaswag.hellaswag_10shot_ppl_59c85e import hellaswag_datasets + from opencompass.configs.datasets.drop.drop_gen_a2697c import drop_datasets + + # ## Math + from opencompass.configs.datasets.math.math_4shot_base_gen_43d5b6 import math_datasets + from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import gsm8k_datasets + from opencompass.configs.datasets.MathBench.mathbench_2024_few_shot_mixed_4a3fd4 import \ + mathbench_datasets + + # ## Scientific + from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_2c9cd6 import \ + gpqa_datasets + + # ## Coding + from 
opencompass.configs.datasets.humaneval.deprecated_humaneval_gen_d2537e import humaneval_datasets + from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_742f0c import sanitized_mbpp_datasets + # TODO: Add LiveCodeBench + + # ## Instruction Following + # from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets + + # Summarizer + from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups + from opencompass.configs.summarizers.groups.mmlu_pro import mmlu_pro_summary_groups + from opencompass.configs.summarizers.groups.cmmlu import cmmlu_summary_groups + from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups + from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \ + mathbench_2024_summary_groups + + # Model List + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b import models as lmdeploy_qwen2_5_1_5b_model + # from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model + # from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model + # from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import models as hf_minicpm_2b_sft_bf16_model + # from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as hf_yi_1_5_6b_chat_model + # from opencompass.configs.models.gemma.hf_gemma_2b_it import models as hf_gemma_2b_it_model + # from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as hf_yi_1_5_34b_chat_model + +####################################################################### +# PART 1 Datasets List # +####################################################################### +# datasets list for evaluation +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + + +####################################################################### +# PART 2 Datset Summarizer # +####################################################################### +# with read_base(): + +core_summary_groups = [ + { + 'name': 'core_average', + 'subsets': [ + ['mmlu', 'accuracy'], + ['mmlu_pro', 'accuracy'], + ['cmmlu', 'accuracy'], + ['bbh', 'naive_average'], + ['hellaswag', 'accuracy'], + ['drop', 'accuracy'], + ['math', 'accuracy'], + ['gsm8k', 'accuracy'], + ['mathbench-t (average)', 'naive_average'] + ['GPQA_diamond', 'accuracy'], + ['openai_humaneval', 'humaneval_pass@1'], + ['IFEval', 'Prompt-level-strict-accuracy'], + ['sanitized_mbpp', 'score'], + ['mathbench-t (average)', 'naive_average'] + ], + }, +] + +summarizer = dict( + dataset_abbrs=[ + ['mmlu', 'accuracy'], + ['mmlu_pro', 'accuracy'], + ['cmmlu', 'accuracy'], + ['bbh', 'naive_average'], + ['hellaswag', 'accuracy'], + ['drop', 'accuracy'], + ['math', 'accuracy'], + ['gsm8k', 'accuracy'], + ['mathbench-t (average)', 'naive_average'] + ['GPQA_diamond', 'accuracy'], + ['openai_humaneval', 'humaneval_pass@1'], + ['IFEval', 'Prompt-level-strict-accuracy'], + ['sanitized_mbpp', 'score'], + 'mathbench-a (average)', + 'mathbench-t (average)' + '', + ['mmlu', 'accuracy'], + ['mmlu-stem', 'accuracy'], + ['mmlu-social-science', 'accuracy'], + ['mmlu-humanities', 'accuracy'], + ['mmlu-other', 'accuracy'], + + '', + ['mmlu_pro', 'accuracy'], + ['mmlu_pro_math','accuracy'], + ['mmlu_pro_physics', 'accuracy'], + ['mmlu_pro_chemistry', 'accuracy'], + ['mmlu_pro_law', 'accuracy'], + ['mmlu_pro_engineering', 'accuracy'], + ['mmlu_pro_other', 'accuracy'], + ['mmlu_pro_economics', 'accuracy'], + ['mmlu_pro_health', 
'accuracy'], + ['mmlu_pro_psychology', 'accuracy'], + ['mmlu_pro_business', 'accuracy'], + ['mmlu_pro_biology', 'accuracy'], + ['mmlu_pro_philosophy', 'accuracy'], + ['mmlu_pro_computer_science','accuracy'], + ['mmlu_pro_history', 'accuracy'], + '', + ['cmmlu', 'accuracy'], + ['cmmlu-stem', 'accuracy'], + ['cmmlu-social-science', 'accuracy'], + ['cmmlu-humanities', 'accuracy'], + ['cmmlu-other', 'accuracy'], + ['cmmlu-china-specific', 'accuracy'], + + ], + summary_groups=sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], []), +) + + +####################################################################### +# PART 3 Models List # +####################################################################### + +models = sum([v for k, v in locals().items() if k.endswith('_model')], []) + + + +####################################################################### +# PART 4 Inference/Evaluation Configuaration # +####################################################################### + +# Local Runner +infer = dict( + partitioner=dict( + type=NumWorkerPartitioner, + num_worker=8 + ), + runner=dict( + type=LocalRunner, + max_num_workers=16, + retry=0, # Modify if needed + task=dict(type=OpenICLInferTask) + ), +) + +# eval with local runner +eval = dict( + partitioner=dict(type=NaivePartitioner, n=10), + runner=dict( + type=LocalRunner, + max_num_workers=16, + task=dict(type=OpenICLEvalTask)), +) + + +####################################################################### +# PART 5 Utils Configuaration # +####################################################################### +base_exp_dir = 'outputs/corebench_2409_objective/' +work_dir = osp.join(base_exp_dir, 'chat_objective') diff --git a/configs/eval_corebench_2409_objective.py b/configs/eval_corebench_2409_chat_objective.py similarity index 88% rename from configs/eval_corebench_2409_objective.py rename to configs/eval_corebench_2409_chat_objective.py index e14c52472..0b6735062 100644 --- a/configs/eval_corebench_2409_objective.py +++ b/configs/eval_corebench_2409_chat_objective.py @@ -18,20 +18,22 @@ # ## Reasoning from opencompass.configs.datasets.bbh.bbh_gen_4a31fa import bbh_datasets - # TODO: Add HellaSwag - # TODO: Add DROP + from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \ + hellaswag_datasets + from opencompass.configs.datasets.drop.drop_openai_simple_evals_gen_3857b0 import drop_datasets # ## Math from opencompass.configs.datasets.math.math_0shot_gen_393424 import math_datasets - # TODO: Add GSM8K - # TODO: Add MathBench + from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \ + gsm8k_datasets + from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import mathbench_datasets # ## Scientific from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets # ## Coding from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets - # TODO: Add MBPP + from opencompass.configs.datasets.mbpp.sanitized_mbpp_mdblock_gen_a447ff import sanitized_mbpp_datasets # TODO: Add LiveCodeBench # ## Instruction Following @@ -70,13 +72,17 @@ 'subsets': [ ['mmlu', 'accuracy'], ['mmlu_pro', 'accuracy'], - # ['cmmlu', 'naive_average'], ['cmmlu', 'accuracy'], ['bbh', 'score'], ['math', 'accuracy'], ['openai_humaneval', 'humaneval_pass@1'], ['GPQA_diamond', 'accuracy'], ['IFEval', 'Prompt-level-strict-accuracy'], + ['drop', 'accuracy'], + ['sanitized_mbpp', 'score'], + ['gsm8k', 'accuracy'], + ['hellaswag', 
'accuracy'], + ['mathbench-t (average)', 'naive_average'] ], }, ] @@ -92,6 +98,12 @@ ['openai_humaneval', 'humaneval_pass@1'], ['GPQA_diamond', 'accuracy'], ['IFEval', 'Prompt-level-strict-accuracy'], + ['drop', 'accuracy'], + ['sanitized_mbpp', 'score'], + ['gsm8k', 'accuracy'], + ['hellaswag', 'accuracy'], + 'mathbench-a (average)', + 'mathbench-t (average)' '', ['mmlu', 'accuracy'], @@ -204,5 +216,5 @@ ####################################################################### # PART 5 Utils Configuaration # ####################################################################### -base_exp_dir = 'outputs/corebench/' +base_exp_dir = 'outputs/corebench_2409_objective/' work_dir = osp.join(base_exp_dir, 'chat_objective') diff --git a/configs/models/bailing_api/bailing-lite-0830.py b/configs/models/bailing_api/bailing-lite-0830.py index 1a43b4be1..88053ce98 100644 --- a/configs/models/bailing_api/bailing-lite-0830.py +++ b/configs/models/bailing_api/bailing-lite-0830.py @@ -2,30 +2,29 @@ api_meta_template = dict( round=[ - dict(role="HUMAN", api_role="HUMAN"), - dict(role="BOT", api_role="BOT", generate=False), + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=False), ], - reserved_roles=[dict(role="SYSTEM", api_role="SYSTEM")], + reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')], ) models = [ dict( - path="Bailing-Lite-0830", - token="", # set your key here or in environment variable BAILING_API_KEY - url="https://bailingchat.alipay.com/chat/completions", + path='Bailing-Lite-0830', + token='', # set your key here or in environment variable BAILING_API_KEY + url='https://bailingchat.alipay.com/chat/completions', type=BailingAPI, meta_template=api_meta_template, query_per_second=1, max_seq_len=4096, batch_size=1, generation_kwargs={ - "temperature": 0.4, - "top_p": 1.0, - "top_k": -1, - "n": 1, - "logprobs": 1, - "use_beam_search": False, + 'temperature': 0.4, + 'top_p': 1.0, + 'top_k': -1, + 'n': 1, + 'logprobs': 1, + 'use_beam_search': False, }, ), ] - diff --git a/configs/models/bailing_api/bailing-pro-0920.py b/configs/models/bailing_api/bailing-pro-0920.py index 35814bf79..db69b263e 100644 --- a/configs/models/bailing_api/bailing-pro-0920.py +++ b/configs/models/bailing_api/bailing-pro-0920.py @@ -2,30 +2,29 @@ api_meta_template = dict( round=[ - dict(role="HUMAN", api_role="HUMAN"), - dict(role="BOT", api_role="BOT", generate=False), + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=False), ], - reserved_roles=[dict(role="SYSTEM", api_role="SYSTEM")], + reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')], ) models = [ dict( - path="Bailing-Pro-0920", - token="", # set your key here or in environment variable BAILING_API_KEY - url="https://bailingchat.alipay.com/chat/completions", + path='Bailing-Pro-0920', + token='', # set your key here or in environment variable BAILING_API_KEY + url='https://bailingchat.alipay.com/chat/completions', type=BailingAPI, meta_template=api_meta_template, query_per_second=1, max_seq_len=4096, batch_size=1, generation_kwargs={ - "temperature": 0.4, - "top_p": 1.0, - "top_k": -1, - "n": 1, - "logprobs": 1, - "use_beam_search": False, + 'temperature': 0.4, + 'top_p': 1.0, + 'top_k': -1, + 'n': 1, + 'logprobs': 1, + 'use_beam_search': False, }, ), ] - diff --git a/configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b.py b/configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b.py new file mode 100644 index 000000000..a2661c9fd --- /dev/null +++ b/configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b.py @@ -0,0 +1,15 
@@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='qwen2.5-1.5b-turbomind', + path='Qwen/Qwen2.5-1.5B', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen2_5/lmdeploy_qwen2_5_7b.py b/configs/models/qwen2_5/lmdeploy_qwen2_5_7b.py new file mode 100644 index 000000000..b2d7aa0c5 --- /dev/null +++ b/configs/models/qwen2_5/lmdeploy_qwen2_5_7b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='qwen2.5-7b-turbomind', + path='Qwen/Qwen2.5-7B', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/opencompass/configs/models/bailing_api/bailing-lite-0830.py b/opencompass/configs/models/bailing_api/bailing-lite-0830.py index 1a43b4be1..88053ce98 100644 --- a/opencompass/configs/models/bailing_api/bailing-lite-0830.py +++ b/opencompass/configs/models/bailing_api/bailing-lite-0830.py @@ -2,30 +2,29 @@ api_meta_template = dict( round=[ - dict(role="HUMAN", api_role="HUMAN"), - dict(role="BOT", api_role="BOT", generate=False), + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=False), ], - reserved_roles=[dict(role="SYSTEM", api_role="SYSTEM")], + reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')], ) models = [ dict( - path="Bailing-Lite-0830", - token="", # set your key here or in environment variable BAILING_API_KEY - url="https://bailingchat.alipay.com/chat/completions", + path='Bailing-Lite-0830', + token='', # set your key here or in environment variable BAILING_API_KEY + url='https://bailingchat.alipay.com/chat/completions', type=BailingAPI, meta_template=api_meta_template, query_per_second=1, max_seq_len=4096, batch_size=1, generation_kwargs={ - "temperature": 0.4, - "top_p": 1.0, - "top_k": -1, - "n": 1, - "logprobs": 1, - "use_beam_search": False, + 'temperature': 0.4, + 'top_p': 1.0, + 'top_k': -1, + 'n': 1, + 'logprobs': 1, + 'use_beam_search': False, }, ), ] - diff --git a/opencompass/configs/models/bailing_api/bailing-pro-0920.py b/opencompass/configs/models/bailing_api/bailing-pro-0920.py index 35814bf79..db69b263e 100644 --- a/opencompass/configs/models/bailing_api/bailing-pro-0920.py +++ b/opencompass/configs/models/bailing_api/bailing-pro-0920.py @@ -2,30 +2,29 @@ api_meta_template = dict( round=[ - dict(role="HUMAN", api_role="HUMAN"), - dict(role="BOT", api_role="BOT", generate=False), + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=False), ], - reserved_roles=[dict(role="SYSTEM", api_role="SYSTEM")], + reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')], ) models = [ dict( - path="Bailing-Pro-0920", - token="", # set your key here or in environment variable BAILING_API_KEY - url="https://bailingchat.alipay.com/chat/completions", + path='Bailing-Pro-0920', + token='', # set your key here or in environment variable BAILING_API_KEY + url='https://bailingchat.alipay.com/chat/completions', type=BailingAPI, meta_template=api_meta_template, query_per_second=1, max_seq_len=4096, batch_size=1, generation_kwargs={ - "temperature": 0.4, - "top_p": 1.0, - "top_k": -1, - "n": 1, - "logprobs": 1, - 
"use_beam_search": False, + 'temperature': 0.4, + 'top_p': 1.0, + 'top_k': -1, + 'n': 1, + 'logprobs': 1, + 'use_beam_search': False, }, ), ] - diff --git a/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b.py b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b.py new file mode 100644 index 000000000..a2661c9fd --- /dev/null +++ b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='qwen2.5-1.5b-turbomind', + path='Qwen/Qwen2.5-1.5B', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_7b.py b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_7b.py new file mode 100644 index 000000000..b2d7aa0c5 --- /dev/null +++ b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_7b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='qwen2.5-7b-turbomind', + path='Qwen/Qwen2.5-7B', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/opencompass/models/__init__.py b/opencompass/models/__init__.py index 0beb963a1..0f55b869c 100644 --- a/opencompass/models/__init__.py +++ b/opencompass/models/__init__.py @@ -42,7 +42,8 @@ from .stepfun_api import StepFun # noqa: F401 from .turbomind import TurboMindModel # noqa: F401 from .turbomind_tis import TurboMindTisModel # noqa: F401 -from .turbomind_with_tf_above_v4_33 import TurboMindModelwithChatTemplate # noqa: F401 +from .turbomind_with_tf_above_v4_33 import \ + TurboMindModelwithChatTemplate # noqa: F401 from .unigpt_api import UniGPT # noqa: F401 from .vllm import VLLM # noqa: F401 from .vllm_with_tf_above_v4_33 import VLLMwithChatTemplate # noqa: F401 diff --git a/opencompass/models/bailing_api_oc.py b/opencompass/models/bailing_api_oc.py index 6ff75e0d5..34e8a333a 100644 --- a/opencompass/models/bailing_api_oc.py +++ b/opencompass/models/bailing_api_oc.py @@ -7,9 +7,14 @@ import requests from requests.adapters import HTTPAdapter -from retrying import retry from urllib3.connection import HTTPConnection +try: + from retrying import retry +except ImportError: + retry = None + print('please install retrying by `pip install retrying`') + from opencompass.utils.prompt import PromptList from .base_api import BaseAPIModel @@ -18,6 +23,7 @@ class HTTPAdapterWithSocketOptions(HTTPAdapter): + def __init__(self, *args, **kwargs): self._socket_options = HTTPConnection.default_socket_options + [ (socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1), @@ -29,8 +35,9 @@ def __init__(self, *args, **kwargs): def init_poolmanager(self, *args, **kwargs): if self._socket_options is not None: - kwargs["socket_options"] = self._socket_options - super(HTTPAdapterWithSocketOptions, self).init_poolmanager(*args, **kwargs) + kwargs['socket_options'] = self._socket_options + super(HTTPAdapterWithSocketOptions, + self).init_poolmanager(*args, **kwargs) class BailingAPI(BaseAPIModel): @@ -64,31 +71,29 @@ def __init__( generation_kwargs=generation_kwargs, ) - self.logger.info(f"Bailing API Model Init path: {path} url={url}") + self.logger.info(f'Bailing API 
Model Init path: {path} url={url}') if not token: - token = os.environ.get("BAILING_API_KEY") + token = os.environ.get('BAILING_API_KEY') if token: - self._headers = {"Authorization": f"Bearer {token}"} + self._headers = {'Authorization': f'Bearer {token}'} else: - raise RuntimeError(f"There is not valid token.") - self._headers["Content-Type"] = "application/json" - self._url = url if url else "https://bailingchat.alipay.com/chat/completions" + raise RuntimeError('There is not valid token.') + self._headers['Content-Type'] = 'application/json' + self._url = url if url else \ + 'https://bailingchat.alipay.com/chat/completions' self._model = path self._sessions = [] - self._num = ( - int(os.environ.get("BAILING_API_PARALLEL_NUM")) - if os.environ.get("BAILING_API_PARALLEL_NUM") - else 1 - ) + self._num = (int(os.environ.get('BAILING_API_PARALLEL_NUM')) + if os.environ.get('BAILING_API_PARALLEL_NUM') else 1) try: for _ in range(self._num): adapter = HTTPAdapterWithSocketOptions() sess = requests.Session() - sess.mount("http://", adapter) - sess.mount("https://", adapter) + sess.mount('http://', adapter) + sess.mount('https://', adapter) self._sessions.append(sess) except Exception as e: - self.logger.error(f"Fail to setup the session. {e}") + self.logger.error(f'Fail to setup the session. {e}') raise e def generate( @@ -99,7 +104,8 @@ def generate( """Generate results given a list of inputs. Args: - inputs (Union[List[str], PromptList]): A list of strings or PromptDicts. + inputs (Union[List[str], PromptList]): + A list of strings or PromptDicts. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. @@ -107,8 +113,7 @@ def generate( List[str]: A list of generated strings. """ with concurrent.futures.ThreadPoolExecutor( - max_workers=self._num, - ) as executor: + max_workers=self._num, ) as executor: future_to_m = { executor.submit( self._generate, @@ -120,22 +125,22 @@ def generate( } results = [] for future in concurrent.futures.as_completed(future_to_m): - m = future_to_m[future] + m = future_to_m[future] # noqa F841 resp = future.result() if resp and resp.status_code == 200: try: result = resp.json() - except: - results.append("") + except Exception as e: # noqa F841 + results.append('') else: - if ( - result.get("choices") - and result["choices"][0].get("message") - and result["choices"][0]["message"].get("content") - ): - results.append(result["choices"][0]["message"]["content"]) + if (result.get('choices') + and result['choices'][0].get('message') + and result['choices'][0]['message'].get( + 'content')): + results.append( + result['choices'][0]['message']['content']) else: - results.append("") + results.append('') self.flush() return results @@ -156,27 +161,30 @@ def _generate( str: The generated string. 
""" if isinstance(input, str): - messages = [{"role": "user", "content": input}] + messages = [{'role': 'user', 'content': input}] else: messages = [] for item in input: - content = item["prompt"] + content = item['prompt'] if not content: continue - message = {"content": content} - if item["role"] == "HUMAN": - message["role"] = "user" - elif item["role"] == "BOT": - message["role"] = "assistant" - elif item["role"] == "SYSTEM": - message["role"] = "system" + message = {'content': content} + if item['role'] == 'HUMAN': + message['role'] = 'user' + elif item['role'] == 'BOT': + message['role'] = 'assistant' + elif item['role'] == 'SYSTEM': + message['role'] = 'system' else: - message["role"] = item["role"] + message['role'] = item['role'] messages.append(message) request = { - "model": self._model, - "messages": messages, - "max_seq_len": max( + 'model': + self._model, + 'messages': + messages, + 'max_seq_len': + max( max_out_len if max_out_len else 4096, self.max_seq_len if self.max_seq_len else 4096, ), @@ -191,22 +199,22 @@ def _generate( elif response.status_code == 426: retry_num += 1 # retry else: - raise ValueError(f"Status code = {response.status_code}") + raise ValueError(f'Status code = {response.status_code}') else: raise ValueError( - f"Exceed the maximal retry times. Last status code = {response.status_code}" - ) + f'Exceed the maximal retry times. Last status code ' + f'= {response.status_code}') except Exception as e: - self.logger.error( - f"Fail to inference request={request}; model_name={self.path}; error={e}, stack:{traceback.format_exc()}" - ) + self.logger.error(f'Fail to inference request={request}; ' + f'model_name={self.path}; error={e}, ' + f'stack:{traceback.format_exc()}') raise e return response @retry(stop_max_attempt_number=3, wait_fixed=16000) # ms def _infer_result(self, request, sess): response = sess.request( - "POST", + 'POST', self._url, json=request, headers=self._headers, From 7d50294117e319dfe9fc8ffbf6c5c0268329ee09 Mon Sep 17 00:00:00 2001 From: Songyang Zhang Date: Thu, 26 Sep 2024 18:56:17 +0800 Subject: [PATCH 10/20] [Feature] Update Bailing (#1567) * [Feature] 1. Update CoreBench Base\n 2. Fix lint issue in BalingAPI * Update * Update * Update --- opencompass/models/bailing_api_oc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opencompass/models/bailing_api_oc.py b/opencompass/models/bailing_api_oc.py index 34e8a333a..54e0d502f 100644 --- a/opencompass/models/bailing_api_oc.py +++ b/opencompass/models/bailing_api_oc.py @@ -211,7 +211,7 @@ def _generate( raise e return response - @retry(stop_max_attempt_number=3, wait_fixed=16000) # ms + # @retry(stop_max_attempt_number=3, wait_fixed=16000) # ms def _infer_result(self, request, sess): response = sess.request( 'POST', From e8437db98fc6a817ed101d1945077cdd421089f1 Mon Sep 17 00:00:00 2001 From: Songyang Zhang Date: Fri, 27 Sep 2024 11:15:25 +0800 Subject: [PATCH 11/20] [Feature] Update BailingLM/OpenAI verbose (#1568) * [Feature] 1. Update CoreBench Base\n 2. 
Fix lint issue in BalingAPI * Update * [Feature] Update API * Update --- configs/eval_corebench_2409_base_objective.py | 6 +++--- opencompass/models/bailing_api_oc.py | 1 - opencompass/models/openai_api.py | 4 ++++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/configs/eval_corebench_2409_base_objective.py b/configs/eval_corebench_2409_base_objective.py index 9c9043657..d5d7a3879 100644 --- a/configs/eval_corebench_2409_base_objective.py +++ b/configs/eval_corebench_2409_base_objective.py @@ -81,7 +81,7 @@ ['drop', 'accuracy'], ['math', 'accuracy'], ['gsm8k', 'accuracy'], - ['mathbench-t (average)', 'naive_average'] + ['mathbench-t (average)', 'naive_average'], ['GPQA_diamond', 'accuracy'], ['openai_humaneval', 'humaneval_pass@1'], ['IFEval', 'Prompt-level-strict-accuracy'], @@ -101,7 +101,7 @@ ['drop', 'accuracy'], ['math', 'accuracy'], ['gsm8k', 'accuracy'], - ['mathbench-t (average)', 'naive_average'] + ['mathbench-t (average)', 'naive_average'], ['GPQA_diamond', 'accuracy'], ['openai_humaneval', 'humaneval_pass@1'], ['IFEval', 'Prompt-level-strict-accuracy'], @@ -185,4 +185,4 @@ # PART 5 Utils Configuaration # ####################################################################### base_exp_dir = 'outputs/corebench_2409_objective/' -work_dir = osp.join(base_exp_dir, 'chat_objective') +work_dir = osp.join(base_exp_dir, 'base_objective') diff --git a/opencompass/models/bailing_api_oc.py b/opencompass/models/bailing_api_oc.py index 54e0d502f..d4368a363 100644 --- a/opencompass/models/bailing_api_oc.py +++ b/opencompass/models/bailing_api_oc.py @@ -13,7 +13,6 @@ from retrying import retry except ImportError: retry = None - print('please install retrying by `pip install retrying`') from opencompass.utils.prompt import PromptList diff --git a/opencompass/models/openai_api.py b/opencompass/models/openai_api.py index 4a07dee3f..aff2579a6 100644 --- a/opencompass/models/openai_api.py +++ b/opencompass/models/openai_api.py @@ -601,6 +601,10 @@ def _generate(self, input: PromptList | str, max_out_len: int, if self.verbose: self.logger.info( 'Successfully get response from OpenAI API') + try: + self.logger.info(responses) + except Exception as e: # noqa F841 + pass return responses.choices[0].message.content except Exception as e: self.logger.error(e) From 85a28874aacf14dd215eb6b3212c7307adacbb43 Mon Sep 17 00:00:00 2001 From: Yi Ding Date: Fri, 27 Sep 2024 11:56:57 +0800 Subject: [PATCH 12/20] [BUG]: Fix Bailing API configs (#1570) --- opencompass/models/bailing_api_oc.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/opencompass/models/bailing_api_oc.py b/opencompass/models/bailing_api_oc.py index d4368a363..8e107556c 100644 --- a/opencompass/models/bailing_api_oc.py +++ b/opencompass/models/bailing_api_oc.py @@ -77,6 +77,9 @@ def __init__( self._headers = {'Authorization': f'Bearer {token}'} else: raise RuntimeError('There is not valid token.') + else: + self._headers = {'Authorization': f'Bearer {token}'} + self._headers['Content-Type'] = 'application/json' self._url = url if url else \ 'https://bailingchat.alipay.com/chat/completions' From 7528b8ab8a9b80210e2c51b7257895bdd2ac49ae Mon Sep 17 00:00:00 2001 From: shijinpjlab Date: Sun, 29 Sep 2024 19:24:58 +0800 Subject: [PATCH 13/20] [Feature] Add dingo test (#1529) * add qa dingo * update * change name qa to dingo * eval model: llm_base * update path * change name and move path * add eval_dingo * update import * add for pip * add dingo package * change import place * update import place * fix lint fail * isort * double 
quoted --------- Co-authored-by: sj --- configs/datasets/dingo/dingo_gen.py | 34 ++++++++ configs/eval_dingo.py | 7 ++ .../configs/datasets/dingo/dingo_gen.py | 34 ++++++++ opencompass/datasets/__init__.py | 1 + opencompass/datasets/dingo.py | 84 +++++++++++++++++++ requirements/extra.txt | 1 + 6 files changed, 161 insertions(+) create mode 100644 configs/datasets/dingo/dingo_gen.py create mode 100644 configs/eval_dingo.py create mode 100644 opencompass/configs/datasets/dingo/dingo_gen.py create mode 100644 opencompass/datasets/dingo.py diff --git a/configs/datasets/dingo/dingo_gen.py b/configs/datasets/dingo/dingo_gen.py new file mode 100644 index 000000000..c36f6cdcc --- /dev/null +++ b/configs/datasets/dingo/dingo_gen.py @@ -0,0 +1,34 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import DingoDataset, DingoEvaluator + + +dingo_paths = [ + './data/dingo/en_192.csv', + './data/dingo/zh_170.csv', +] + +dingo_datasets = [] +for path in dingo_paths: + dingo_reader_cfg = dict(input_columns='input', output_column=None) + dingo_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[dict(role='HUMAN', prompt='{input}')])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + dingo_eval_cfg = dict(evaluator=dict(type=DingoEvaluator), pred_role='BOT') + + dingo_datasets.append( + dict( + abbr='dingo_' + path.split('/')[-1].split('.csv')[0], + type=DingoDataset, + path=path, + reader_cfg=dingo_reader_cfg, + infer_cfg=dingo_infer_cfg, + eval_cfg=dingo_eval_cfg, + )) + +datasets = dingo_datasets diff --git a/configs/eval_dingo.py b/configs/eval_dingo.py new file mode 100644 index 000000000..3e0ecb86b --- /dev/null +++ b/configs/eval_dingo.py @@ -0,0 +1,7 @@ +from mmengine.config import read_base + +with read_base(): + from .models.hf_internlm.hf_internlm_7b import models + from .datasets.dingo.dingo_gen import datasets + +work_dir = './outputs/eval_dingo' diff --git a/opencompass/configs/datasets/dingo/dingo_gen.py b/opencompass/configs/datasets/dingo/dingo_gen.py new file mode 100644 index 000000000..c36f6cdcc --- /dev/null +++ b/opencompass/configs/datasets/dingo/dingo_gen.py @@ -0,0 +1,34 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import DingoDataset, DingoEvaluator + + +dingo_paths = [ + './data/dingo/en_192.csv', + './data/dingo/zh_170.csv', +] + +dingo_datasets = [] +for path in dingo_paths: + dingo_reader_cfg = dict(input_columns='input', output_column=None) + dingo_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[dict(role='HUMAN', prompt='{input}')])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + dingo_eval_cfg = dict(evaluator=dict(type=DingoEvaluator), pred_role='BOT') + + dingo_datasets.append( + dict( + abbr='dingo_' + path.split('/')[-1].split('.csv')[0], + type=DingoDataset, + path=path, + reader_cfg=dingo_reader_cfg, + infer_cfg=dingo_infer_cfg, + eval_cfg=dingo_eval_cfg, + )) + +datasets = dingo_datasets diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index a1f201efd..8f178242c 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ 
-33,6 +33,7 @@ from .csl import * # noqa: F401, F403 from .custom import * # noqa: F401, F403 from .cvalues import * # noqa: F401, F403 +from .dingo import * # noqa: F401, F403 from .drcd import * # noqa: F401, F403 from .drop import * # noqa: F401, F403 from .drop_simple_eval import * # noqa: F401, F403 diff --git a/opencompass/datasets/dingo.py b/opencompass/datasets/dingo.py new file mode 100644 index 000000000..753d78ddb --- /dev/null +++ b/opencompass/datasets/dingo.py @@ -0,0 +1,84 @@ +# flake8: nodingo +# yapf: disable +import csv +import json +import os +import time +from typing import List + +from datasets import Dataset + +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class DingoDataset(BaseDataset): + + @staticmethod + def load(path: str): + raw_data = [] + with open(path, encoding='utf-8') as f: + reader = csv.reader(f, delimiter=';') + for row in reader: + if len(row) < 1: + row = [''] + raw_data.append({'input': row[0]}) + return Dataset.from_list(raw_data) + + +@LOAD_DATASET.register_module() +class DingoLongDataset(BaseDataset): + + @staticmethod + def load(path: str): + raw_data = [] + with open(path, 'r', encoding='utf-8') as f: + for line in f: + raw_data.append({'input': json.loads(line).get('input')}) + return Dataset.from_list(raw_data) + + +@ICL_EVALUATORS.register_module() +class DingoEvaluator(BaseEvaluator): + + def score(self, origin_prompt: List, predictions: List) -> dict: + try: + # from dingo.model.model import Model + from dingo.exec import Executor + from dingo.io import InputArgs + except Exception: + raise ModuleNotFoundError( + '=========== ' + 'dingo register fail. please try: pip install dingo-python.' 
+ ' ===========') + + current_time = time.strftime('%Y%m%d_%H%M%S', time.localtime()) + file_data = [{'prompt': pmt, 'prediction': prd} + for pmt, prd in zip(origin_prompt, predictions)] + file_name = 'dingo_file_' + current_time + '.jsonl' + with open(file_name, 'a', encoding='utf-8') as f: + for d in file_data: + json.dump(d, f, ensure_ascii=False) + f.write('\n') + + input_data = { + 'eval_models': ['llm_base'], + 'input_path': file_name, + 'output_path': './outputs/dingo/', + 'dataset': 'local', + 'datasource': 'local', + 'data_format': 'jsonl', + 'column_prompt': ['prompt'], + 'column_content': ['prediction'], + } + # Model.apply_config(input_data["custom_config_path"]) + input_args = InputArgs(**input_data) + executor = Executor.exec_map['local'](input_args) + result = executor.execute() + summary = result[0].to_dict() + + os.remove(file_name) + return summary diff --git a/requirements/extra.txt b/requirements/extra.txt index 218348344..efeef772e 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -1,6 +1,7 @@ # Alpaca-eval alpaca-eval==0.6 cn2an +dingo-python # Icl topk retriever faiss_gpu==1.7.2 # Humaneval, Humaneval X From 763d7755b6a22bcf4ac1d579966829125d4dbc61 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Mon, 30 Sep 2024 15:13:26 +0800 Subject: [PATCH 14/20] [BUG]GaokaoBench dataset fix (#1583) --- opencompass/datasets/GaokaoBench.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opencompass/datasets/GaokaoBench.py b/opencompass/datasets/GaokaoBench.py index 383845356..d3cd31a00 100644 --- a/opencompass/datasets/GaokaoBench.py +++ b/opencompass/datasets/GaokaoBench.py @@ -16,7 +16,7 @@ class GaokaoBenchDataset(BaseDataset): @staticmethod def load(path: str, name: str): - data = get_data_path(path, local_mode=True) + path = get_data_path(path, local_mode=True) if environ.get('DATASET_SOURCE') == 'ModelScope': from modelscope import MsDataset return MsDataset.load(path, subset_name=name, split='test') From bbdca5eb4cb08c24a386c22bf677d1856485f5f4 Mon Sep 17 00:00:00 2001 From: x54-729 <45304952+x54-729@users.noreply.github.com> Date: Mon, 30 Sep 2024 15:46:06 +0800 Subject: [PATCH 15/20] [BUG] Fix eos token handling and add comments for InternTrain (#1569) Co-authored-by: x54-729 --- opencompass/models/interntrain.py | 71 ++++++++++++++++++++++++++++--- 1 file changed, 65 insertions(+), 6 deletions(-) diff --git a/opencompass/models/interntrain.py b/opencompass/models/interntrain.py index d6c233cdb..6d904acf7 100644 --- a/opencompass/models/interntrain.py +++ b/opencompass/models/interntrain.py @@ -79,6 +79,50 @@ def initialize_model(self): @MODELS.register_module() class InternTrain(BaseModel): + """Model wrapper for InternTrain. + + Args: + path (str): The name or path to HuggingFace's model. + module_path (str): Path of InternTrain repository. + max_seq_len (int): The maximum length of the input sequence. Defaults + to 2048. + tokenizer_only (bool): If True, only the tokenizer will be initialized. + Defaults to False. + tokenizer_path (str): The path to the tokenizer. Defaults to None. + tokenizer_type: InternTrain's tokenizer type. Defaults to 'InternLM'. + model_config (str, dict, optional): Config of model. There are several + options for this parameter: + + - filename (str): The config items are defined in a python file + so the model will load configs from this file. + - config (dict): The configuration items are defined in a dict + and the model will be initialized from ```model_config```. 
+ - None: The config is loaded from ```path```. In this case, + please make sure that ```path``` contains a config file named + ``model_config.pt``. + + Defaults to None. + model_type: Type of model. Defaults to 'InternTrain' + ckpt_type: The type of load function in InternTrain when checkpoints + are loaded. Defaults to None, which means load the checkpoint + directlywith pipeline merged. + meta_template (Dict, optional): The model's meta prompt + template if needed, in case the requirement of injecting or + wrapping of any meta instructions. + model_dtype: The model's dtype. If None, will use dtype defined in + ```model_config```. Defaults to None. + generation_kwargs (Dict, optional): The generation kwargs for the + model. Defaults to dict(). + sync_rank (bool): Whether to sync inputs between ranks. Do not use this + if you are not familiar with this behavior. Check `sync_inputs` + function for more details. Defaults to False. + mode (str, optional): The method of input truncation when input length + exceeds max_seq_len. 'mid' represents the part of input to + truncate. Defaults to 'none'. + end_str (str, optional): Whether to trim generated strings with end_str + if the model has special ending strings that are not handled well. + Defaults to None. + """ def __init__(self, path: str, @@ -87,14 +131,15 @@ def __init__(self, tokenizer_only: bool = False, tokenizer_path: Optional[str] = None, tokenizer_type: str = 'INTERNLM', - model_config: Optional[str] = None, + model_config: Optional[Union[str, Dict]] = None, model_type: str = 'INTERNLM2', ckpt_type: Optional[str] = None, meta_template: Optional[Dict] = None, model_dtype: Optional[str] = None, generation_kwargs={}, sync_rank: bool = False, - mode='none'): + mode='none', + end_str: Optional[str] = None): super().__init__(path=path, max_seq_len=max_seq_len, tokenizer_only=tokenizer_only, @@ -146,6 +191,7 @@ def __init__(self, bos_token_id=self.tokenizer.bos_id, pad_token_id=self.tokenizer.bos_id, eos_token_id=eos_token_ids) + self.end_str = end_str def _load_model(self, path: str, @@ -287,8 +333,10 @@ def generate(self, max_length=tokens.shape[1] + max_out_len, **self.generation_kwargs) # bsz, num_return_sequences, max_length outputs = outputs[:, 0, tokens.shape[1]:] - output_text = self.batch_decode(outputs, - stopping_criteria=stopping_criteria) + output_text = self.batch_decode( + outputs, + eos_token_ids=self.generator.eos_token_id, + stopping_criteria=stopping_criteria) return output_text @@ -407,11 +455,22 @@ def batch_encode(self, return torch.LongTensor(tokens).cuda() - def batch_decode(self, outputs, stopping_criteria: List[str] = []): + def batch_decode(self, + outputs, + eos_token_ids: List[int], + stopping_criteria: List[str] = []): # outputs: bsz, seq_len output_text = [] + outputs = outputs.tolist() for output in outputs: - text = self.tokenizer.decode(output.tolist()) + # cut off by eos_token_ids + eos_idx = len(output) + for eos_id in eos_token_ids: + if eos_id in output: + eos_idx = min(output.index(eos_id), eos_idx) + text = self.tokenizer.decode(output[:eos_idx]) + if self.end_str is not None: + text = text.split(self.end_str)[0] for stop_word in stopping_criteria: text = text.split(stop_word)[0] output_text.append(text) From 22a4e7651180f0940ea7173e58e8121abe46ca11 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Mon, 30 Sep 2024 16:57:41 +0800 Subject: [PATCH 16/20] [BUMP] Bump version to 0.3.3 (#1581) --- opencompass/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/opencompass/__init__.py b/opencompass/__init__.py index d1daced0e..80eb7f98f 100644 --- a/opencompass/__init__.py +++ b/opencompass/__init__.py @@ -1 +1 @@ -__version__ = '0.3.2.post1' +__version__ = '0.3.3' From 89abcba486b8c1e6c6c8b93b6ed856a0d0bb3554 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Wed, 2 Oct 2024 12:30:38 +0800 Subject: [PATCH 17/20] [CI] Fix testcase failure (#1582) * update * Update oc_score_baseline.yaml * Update daily-run-test.yml * Update daily-run-test.yml * Update daily-run-test.yml * Update daily-run-test.yml --------- Co-authored-by: zhulin1 --- .github/scripts/oc_score_assert.py | 3 +-- .github/scripts/oc_score_baseline.yaml | 12 ++++++------ .github/workflows/daily-run-test.yml | 8 ++++---- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/.github/scripts/oc_score_assert.py b/.github/scripts/oc_score_assert.py index 6f2c0a11a..c01ef6864 100644 --- a/.github/scripts/oc_score_assert.py +++ b/.github/scripts/oc_score_assert.py @@ -7,8 +7,7 @@ output_path = 'regression_result_daily' chat_model_list = [ - 'baichuan2-7b-chat-hf', 'glm-4-9b-chat-turbomind', 'glm-4-9b-chat-vllm', - 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf', + 'baichuan2-7b-chat-hf', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf', 'deepseek-v2-lite-chat-hf', 'deepseek-7b-chat-vllm', 'gemma2-2b-it-hf', 'gemma2-9b-it-hf', 'gemma-7b-it-vllm', 'internlm2_5-7b-chat-hf', 'internlm2_5-20b-chat-hf', 'internlm2_5-7b-chat-turbomind', diff --git a/.github/scripts/oc_score_baseline.yaml b/.github/scripts/oc_score_baseline.yaml index 9690aa2c5..809dfea45 100644 --- a/.github/scripts/oc_score_baseline.yaml +++ b/.github/scripts/oc_score_baseline.yaml @@ -244,14 +244,14 @@ gemma-7b-hf: race-high: 66 gemma2-2b-hf: - gsm8k: 8 - race-middle: 31 - race-high: 30 + gsm8k: 33 + race-middle: 56 + race-high: 58 gemma2-9b-hf: - gsm8k: 20 - race-middle: 42 - race-high: 35 + gsm8k: 70 + race-middle: 82 + race-high: 84 internlm2_5-7b-hf: gsm8k: 47 diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml index 894b149e0..42ada2f08 100644 --- a/.github/workflows/daily-run-test.yml +++ b/.github/workflows/daily-run-test.yml @@ -123,16 +123,16 @@ jobs: conda info --envs export from_tf=TRUE python tools/list_configs.py internlm2_5 mmlu - opencompass --models hf_internlm2_5_7b --datasets race_ppl --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }} --reuse --max-num-workers 2 + opencompass --models hf_internlm2_5_7b --datasets race_ppl --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }} --reuse rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }}/*/summary regression_result_daily python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py - opencompass --models hf_internlm2_5_7b_chat hf_internlm2_5_1_8b_chat --datasets race_gen -a lmdeploy --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 + opencompass --models hf_internlm2_5_7b_chat --datasets race_gen -a lmdeploy --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }} --reuse rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }}/*/summary regression_result_daily python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py - opencompass --datasets 
race_ppl --hf-type base --hf-path internlm/internlm2_5-7b --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3 --reuse --max-num-workers 2 + opencompass --datasets race_ppl --hf-type base --hf-path internlm/internlm2_5-7b --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }} --reuse rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }}/*/summary regression_result_daily python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py - opencompass --datasets race_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 + opencompass --datasets race_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }} --reuse rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }}/*/summary regression_result_daily python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py - name: Remove Conda Env From 4d6349dfe14f81dc5eea68704c9597f1866a0d51 Mon Sep 17 00:00:00 2001 From: x54-729 <45304952+x54-729@users.noreply.github.com> Date: Tue, 8 Oct 2024 11:34:04 +0800 Subject: [PATCH 18/20] [FIX] fix interntrain get_loglikelihood (#1584) --- opencompass/models/interntrain.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/opencompass/models/interntrain.py b/opencompass/models/interntrain.py index 6d904acf7..e846aae2f 100644 --- a/opencompass/models/interntrain.py +++ b/opencompass/models/interntrain.py @@ -288,7 +288,7 @@ def _convert_dtype(self, default_dtype, model_dtype=None): else: raise NotImplementedError(f'Unknown model dtype {model_dtype}') - def get_token_len(self, prompt: str) -> int: + def get_token_len(self, prompt: str, use_bos=None, use_eos=None) -> int: """Get lengths of the tokenized strings. 
Args: @@ -297,7 +297,7 @@ def get_token_len(self, prompt: str) -> int: Returns: int: Length of the input tokens """ - tokens = self.tokenizer(prompt, use_bos=True, use_eos=True) + tokens = self.tokenizer(prompt, use_bos=use_bos, use_eos=use_eos) return len(tokens) def generate(self, @@ -391,7 +391,7 @@ def get_loglikelihood(self, input_texts: List[str], for input_text, cont in zip(input_texts, conts) ] replaced_lens = [ - len(self.encode(input_text)[0]) for input_text in replaced_texts + self.get_token_len(input_text) for input_text in replaced_texts ] loglikelihoods = [] for nloss, nlen, rlen in zip(loss, lens, replaced_lens): From d2ab51abbd628b3b2c260c403ffc069c4d0a43ee Mon Sep 17 00:00:00 2001 From: Songyang Zhang Date: Wed, 9 Oct 2024 17:09:48 +0800 Subject: [PATCH 19/20] [Bug] Fix pre-commit hook (#1592) --- .github/workflows/lint.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index ae9a9bd2f..bc6d36a7e 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -17,7 +17,7 @@ jobs: python-version: '3.10' - name: Install pre-commit hook run: | - pip install pre-commit mmengine + pip install pre-commit==3.8.0 mmengine pre-commit install - name: Linting run: pre-commit run --all-files From b52ba65c267c4d8bf05cd57ed3386a2d466887db Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Wed, 9 Oct 2024 22:58:06 +0800 Subject: [PATCH 20/20] [Feature] Integrate lmdeploy pipeline api (#1198) * integrate lmdeploy's pipeline api * fix linting * update user guide * rename * update * update * update * rollback class name * update * remove unused code * update * update * fix ci check * compatibility * remove concurrency * Update configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py * Update docs/zh_cn/advanced_guides/evaluation_lmdeploy.md * [Bug] fix lint --------- Co-authored-by: Songyang Zhang Co-authored-by: tonysy --- .../eval_internlm_chat_lmdeploy_pytorch.py | 69 ------ configs/eval_internlm_chat_lmdeploy_tis.py | 41 ---- configs/eval_internlm_chat_turbomind_tis.py | 40 ---- configs/eval_internlm_turbomind_tis.py | 28 --- .../hf_internlm/lmdeploy_internlm2_chat_7b.py | 17 +- .../en/advanced_guides/evaluation_lmdeploy.md | 88 ++++++++ .../advanced_guides/evaluation_turbomind.md | 78 ------- .../advanced_guides/evaluation_lmdeploy.md | 86 ++++++++ .../advanced_guides/evaluation_turbomind.md | 75 ------- .../hf_internlm/lmdeploy_internlm2_chat_7b.py | 17 +- opencompass/models/__init__.py | 3 - opencompass/models/lmdeploy_pytorch.py | 188 ---------------- opencompass/models/lmdeploy_tis.py | 200 ------------------ opencompass/models/turbomind_tis.py | 135 ------------ .../models/turbomind_with_tf_above_v4_33.py | 128 ++++------- opencompass/utils/run.py | 11 +- 16 files changed, 249 insertions(+), 955 deletions(-) delete mode 100644 configs/eval_internlm_chat_lmdeploy_pytorch.py delete mode 100644 configs/eval_internlm_chat_lmdeploy_tis.py delete mode 100644 configs/eval_internlm_chat_turbomind_tis.py delete mode 100644 configs/eval_internlm_turbomind_tis.py create mode 100644 docs/en/advanced_guides/evaluation_lmdeploy.md delete mode 100644 docs/en/advanced_guides/evaluation_turbomind.md create mode 100644 docs/zh_cn/advanced_guides/evaluation_lmdeploy.md delete mode 100644 docs/zh_cn/advanced_guides/evaluation_turbomind.md delete mode 100644 opencompass/models/lmdeploy_pytorch.py delete mode 100644 opencompass/models/lmdeploy_tis.py delete mode 100644 opencompass/models/turbomind_tis.py diff --git 
a/configs/eval_internlm_chat_lmdeploy_pytorch.py b/configs/eval_internlm_chat_lmdeploy_pytorch.py deleted file mode 100644 index 4ea1f84c2..000000000 --- a/configs/eval_internlm_chat_lmdeploy_pytorch.py +++ /dev/null @@ -1,69 +0,0 @@ -from mmengine.config import read_base -from opencompass.models import LmdeployPytorchModel - - -with read_base(): - # choose a list of datasets - from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets - from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets - from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets - from opencompass.configs.datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets - from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets - from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets - from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets - from opencompass.configs.datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets - # and output the results in a choosen format - from opencompass.configs.summarizers.medium import summarizer - - -datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) - - -meta_template = dict( - round=[ - dict(role='HUMAN', begin='<|User|>:', end='\n'), - dict(role='BOT', begin='<|Bot|>:', end='\n', generate=True), - ], - eos_token_id=103028) - -# config for internlm-chat-7b -internlm_chat_7b = dict( - type=LmdeployPytorchModel, - abbr='internlm-chat-7b-pytorch', - path='internlm/internlm-chat-7b', - engine_config=dict(session_len=2048, - max_batch_size=16), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=100), - max_out_len=100, - max_seq_len=2048, - batch_size=16, - concurrency=16, - meta_template=meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), - end_str='', -) - -# config for internlm-chat-20b -internlm_chat_20b = dict( - type=LmdeployPytorchModel, - abbr='internlm-chat-20b-pytorch', - path='internlm/internlm-chat-20b', - engine_config=dict(session_len=2048, - max_batch_size=8), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=100), - max_out_len=100, - max_seq_len=2048, - batch_size=8, - concurrency=8, - meta_template=meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), - end_str='', - ) - -models = [internlm_chat_20b] diff --git a/configs/eval_internlm_chat_lmdeploy_tis.py b/configs/eval_internlm_chat_lmdeploy_tis.py deleted file mode 100644 index 8f5470d52..000000000 --- a/configs/eval_internlm_chat_lmdeploy_tis.py +++ /dev/null @@ -1,41 +0,0 @@ -from mmengine.config import read_base -from opencompass.models.lmdeploy_tis import LmdeployTisModel - -with read_base(): - # choose a list of datasets - from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets - from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets - from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets - from opencompass.configs.datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets - from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets - from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets - from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets - from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets - from 
opencompass.configs.datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets - # and output the results in a choosen format - from opencompass.configs.summarizers.medium import summarizer - -datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) - -meta_template = dict( - round=[ - dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'), - dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True), - ], - eos_token_id=92542 -) - -models = [ - dict( - type=LmdeployTisModel, - abbr='internlm-chat-20b-lmdeploy-tis', - path='internlm/internlm-chat-20b', - tis_addr='0.0.0.0:33337', - max_out_len=100, - max_seq_len=2048, - batch_size=8, - meta_template=meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), - end_str='<|im_end|>', - ) -] diff --git a/configs/eval_internlm_chat_turbomind_tis.py b/configs/eval_internlm_chat_turbomind_tis.py deleted file mode 100644 index 01f42000f..000000000 --- a/configs/eval_internlm_chat_turbomind_tis.py +++ /dev/null @@ -1,40 +0,0 @@ -from mmengine.config import read_base -from opencompass.models.turbomind_tis import TurboMindTisModel - -with read_base(): - # choose a list of datasets - from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets - from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets - from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets - from opencompass.configs.datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets - from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets - from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets - from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets - from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets - from opencompass.configs.datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets - # and output the results in a choosen format - from opencompass.configs.summarizers.medium import summarizer - -datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) - - -meta_template = dict( - round=[ - dict(role='HUMAN', begin='<|User|>:', end='\n'), - dict(role='BOT', begin='<|Bot|>:', end='\n', generate=True), - ], - eos_token_id=103028) - -models = [ - dict( - type=TurboMindTisModel, - abbr='internlm-chat-20b-turbomind', - path='internlm', - tis_addr='0.0.0.0:33337', - max_out_len=100, - max_seq_len=2048, - batch_size=8, - meta_template=meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), - ) -] diff --git a/configs/eval_internlm_turbomind_tis.py b/configs/eval_internlm_turbomind_tis.py deleted file mode 100644 index 98914fa47..000000000 --- a/configs/eval_internlm_turbomind_tis.py +++ /dev/null @@ -1,28 +0,0 @@ -from mmengine.config import read_base -from opencompass.models.turbomind_tis import TurboMindTisModel - -with read_base(): - # choose a list of datasets - from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets - from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets - from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets - from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets - from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets - from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets - # and output the results 
in a choosen format - from opencompass.configs.summarizers.medium import summarizer - -datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) - -models = [ - dict( - type=TurboMindTisModel, - abbr='internlm-chat-20b-turbomind', - path='internlm', - tis_addr='0.0.0.0:33337', - max_out_len=100, - max_seq_len=2048, - batch_size=8, - run_cfg=dict(num_gpus=1, num_procs=1), - ) -] diff --git a/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py b/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py index 60097e373..38ea39d7d 100644 --- a/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py +++ b/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py @@ -1,15 +1,24 @@ from opencompass.models import TurboMindModelwithChatTemplate + models = [ dict( type=TurboMindModelwithChatTemplate, - abbr='internlm2-chat-7b-turbomind', + abbr=f'internlm2-chat-7b-lmdeploy', path='internlm/internlm2-chat-7b', - engine_config=dict(session_len=8192, max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + # inference backend of LMDeploy. It can be either 'turbomind' or 'pytorch'. + # If the model is not supported by 'turbomind', it will fallback to + # 'pytorch' + backend='turbomind', + # For the detailed engine config and generation config, please refer to + # https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/messages.py + engine_config=dict(tp=1), + gen_config=dict(do_sample=False), max_seq_len=8192, max_out_len=4096, - batch_size=16, + # the max number of prompts that LMDeploy receives + # in `generate` function + batch_size=5000, run_cfg=dict(num_gpus=1), ) ] diff --git a/docs/en/advanced_guides/evaluation_lmdeploy.md b/docs/en/advanced_guides/evaluation_lmdeploy.md new file mode 100644 index 000000000..bfacd4881 --- /dev/null +++ b/docs/en/advanced_guides/evaluation_lmdeploy.md @@ -0,0 +1,88 @@ +# Evaluation with LMDeploy + +We now support evaluation of models accelerated by the [LMDeploy](https://github.com/InternLM/lmdeploy). LMDeploy is a toolkit designed for compressing, deploying, and serving LLM. It has a remarkable inference performance. We now illustrate how to evaluate a model with the support of LMDeploy in OpenCompass. + +## Setup + +### Install OpenCompass + +Please follow the [instructions](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) to install the OpenCompass and prepare the evaluation datasets. + +### Install LMDeploy + +Install lmdeploy via pip (python 3.8+) + +```shell +pip install lmdeploy +``` + +The default prebuilt package is compiled on CUDA 12. However, if CUDA 11+ is required, you can install lmdeploy by: + +```shell +export LMDEPLOY_VERSION=0.6.0 +export PYTHON_VERSION=310 +pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 +``` + +## Evaluation + +When evaluating a model, it is necessary to prepare an evaluation configuration that specifies information such as the evaluation dataset, the model, and inference parameters. 
+ +Taking [internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b) as an example, the evaluation config is as follows: + +```python +# configure the dataset +from mmengine.config import read_base + + +with read_base(): + # choose a list of datasets + from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets + from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets + from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets + from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \ + gsm8k_datasets + # and output the results in a chosen format + from .summarizers.medium import summarizer + +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + +# configure lmdeploy +from opencompass.models import TurboMindModelwithChatTemplate + + + +# configure the model +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr=f'internlm2-chat-7b-lmdeploy', + # model path, which can be the address of a model repository on the Hugging Face Hub or a local path + path='internlm/internlm2-chat-7b', + # inference backend of LMDeploy. It can be either 'turbomind' or 'pytorch'. + # If the model is not supported by 'turbomind', it will fallback to + # 'pytorch' + backend='turbomind', + # For the detailed engine config and generation config, please refer to + # https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/messages.py + engine_config=dict(tp=1), + gen_config=dict(do_sample=False), + # the max size of the context window + max_seq_len=7168, + # the max number of new tokens + max_out_len=1024, + # the max number of prompts that LMDeploy receives + # in `generate` function + batch_size=5000, + run_cfg=dict(num_gpus=1), + ) +] +``` + +Place the aforementioned configuration in a file, such as "configs/eval_internlm2_lmdeploy.py". Then, in the home folder of OpenCompass, start evaluation by the following command: + +```shell +python run.py configs/eval_internlm2_lmdeploy.py -w outputs +``` + +You are expected to get the evaluation results after the inference and evaluation. diff --git a/docs/en/advanced_guides/evaluation_turbomind.md b/docs/en/advanced_guides/evaluation_turbomind.md deleted file mode 100644 index c1299f0b3..000000000 --- a/docs/en/advanced_guides/evaluation_turbomind.md +++ /dev/null @@ -1,78 +0,0 @@ -# Evaluation with LMDeploy - -We now support evaluation of models accelerated by the [LMDeploy](https://github.com/InternLM/lmdeploy). LMDeploy is a toolkit designed for compressing, deploying, and serving LLM. **TurboMind** is an efficient inference engine proposed by LMDeploy. OpenCompass is compatible with TurboMind. We now illustrate how to evaluate a model with the support of TurboMind in OpenCompass. - -## Setup - -### Install OpenCompass - -Please follow the [instructions](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) to install the OpenCompass and prepare the evaluation datasets. - -### Install LMDeploy - -Install lmdeploy via pip (python 3.8+) - -```shell -pip install lmdeploy -``` - -## Evaluation - -OpenCompass integrates turbomind's python API for evaluation. - -We take the InternLM-20B as example. 
Firstly, we prepare the evaluation config `configs/eval_internlm_turbomind.py`: - -```python -from mmengine.config import read_base -from opencompass.models.turbomind import TurboMindModel - - -with read_base(): - # choose a list of datasets - from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets - from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets - from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets - from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets - from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets - from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets - # and output the results in a chosen format - from .summarizers.medium import summarizer - -datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) - -# config for internlm-20b model -internlm_20b = dict( - type=TurboMindModel, - abbr='internlm-20b-turbomind', - path="internlm/internlm-20b", # this path should be same as in huggingface - engine_config=dict(session_len=2048, - max_batch_size=8, - rope_scaling_factor=1.0), - gen_config=dict(top_k=1, top_p=0.8, - temperature=1.0, - max_new_tokens=100), - max_out_len=100, - max_seq_len=2048, - batch_size=8, - concurrency=8, - run_cfg=dict(num_gpus=1, num_procs=1), - end_str='' - ) - -models = [internlm_20b] -``` - -Then, in the home folder of OpenCompass, start evaluation by the following command: - -```shell -python run.py configs/eval_internlm_turbomind.py -w outputs/turbomind/internlm-20b -``` - -You are expected to get the evaluation results after the inference and evaluation. - -**Note**: - -- If you want to pass more arguments for `engine_config`和`gen_config` in the evaluation config file, please refer to [TurbomindEngineConfig](https://lmdeploy.readthedocs.io/en/latest/inference/pipeline.html#turbomindengineconfig) - and [GenerationConfig](https://lmdeploy.readthedocs.io/en/latest/inference/pipeline.html#generationconfig) -- If you evaluate the InternLM Chat model, please use configuration file `eval_internlm_chat_turbomind.py` -- If you evaluate the InternLM 7B model, please modify `eval_internlm_turbomind.py` or `eval_internlm_chat_turbomind.py` by changing to the setting `models = [internlm_7b]` in the last line. 
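Both the removed TurboMind guide above and the new LMDeploy guides route evaluation through LMDeploy's `pipeline` API, which `TurboMindModelwithChatTemplate` now wraps. A minimal, illustrative sketch of driving that pipeline directly — assuming lmdeploy >= 0.6 is installed and the `internlm/internlm2-chat-7b` checkpoint from the docs is reachable; this snippet is not part of the patch itself — can serve as a smoke test before launching a full OpenCompass run:

```python
# Illustrative sketch only: exercise the LMDeploy pipeline that the new
# TurboMindModelwithChatTemplate wrapper builds internally.
# Assumes lmdeploy>=0.6; the prompt below is a made-up example.
from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline

pipe = pipeline(
    'internlm/internlm2-chat-7b',                    # HF Hub id or a local path
    backend_config=TurbomindEngineConfig(tp=1, session_len=7168),
)
# Greedy decoding, mirroring gen_config=dict(do_sample=False) in the configs above.
outputs = pipe(
    ['Briefly explain what perplexity measures.'],
    gen_config=GenerationConfig(do_sample=False, max_new_tokens=64),
)
print(outputs[0].text)
```

If the checkpoint is not supported by the turbomind backend, substituting `PytorchEngineConfig` for `TurbomindEngineConfig` mirrors the `backend='pytorch'` fallback described in the new guides.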
diff --git a/docs/zh_cn/advanced_guides/evaluation_lmdeploy.md b/docs/zh_cn/advanced_guides/evaluation_lmdeploy.md new file mode 100644 index 000000000..158399641 --- /dev/null +++ b/docs/zh_cn/advanced_guides/evaluation_lmdeploy.md @@ -0,0 +1,86 @@ +# 使用 LMDeploy 加速评测 + +我们支持在评测大语言模型时,使用 [LMDeploy](https://github.com/InternLM/lmdeploy) 作为推理加速引擎。LMDeploy 是涵盖了 LLM 和 VLM 任务的全套轻量化、部署和服务解决方案,拥有卓越的推理性能。本教程将介绍如何使用 LMDeploy 加速对模型的评测。 + +## 环境配置 + +### 安装 OpenCompass + +请根据 OpenCompass [安装指南](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) 来安装算法库和准备数据集。 + +### 安装 LMDeploy + +使用 pip 安装 LMDeploy (python 3.8+): + +```shell +pip install lmdeploy +``` + +LMDeploy 预编译包默认基于 CUDA 12 编译。如果需要在 CUDA 11+ 下安装 LMDeploy,请执行以下命令: + +```shell +export LMDEPLOY_VERSION=0.6.0 +export PYTHON_VERSION=310 +pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 +``` + +## 评测 + +在评测一个模型时,需要准备一份评测配置,指明评测集、模型和推理参数等信息。 + +以 [internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b) 模型为例,相关的配置信息如下: + +```python +# configure the dataset +from mmengine.config import read_base + + +with read_base(): + # choose a list of datasets + from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets + from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets + from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets + from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \ + gsm8k_datasets + # and output the results in a chosen format + from .summarizers.medium import summarizer + +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + +# configure lmdeploy +from opencompass.models import TurboMindModelwithChatTemplate + + + +# configure the model +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr=f'internlm2-chat-7b-lmdeploy', + # model path, which can be the address of a model repository on the Hugging Face Hub or a local path + path='internlm/internlm2-chat-7b', + # inference backend of LMDeploy. It can be either 'turbomind' or 'pytorch'. 
+ # If the model is not supported by 'turbomind', it will fallback to + # 'pytorch' + backend='turbomind', + # For the detailed engine config and generation config, please refer to + # https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/messages.py + engine_config=dict(tp=1), + gen_config=dict(do_sample=False), + # the max size of the context window + max_seq_len=7168, + # the max number of new tokens + max_out_len=1024, + # the max number of prompts that LMDeploy receives + # in `generate` function + batch_size=32, + run_cfg=dict(num_gpus=1), + ) +] +``` + +把上述配置放在文件中,比如 "configs/eval_internlm2_lmdeploy.py"。然后,在 OpenCompass 的项目目录下,执行如下命令可得到评测结果: + +```shell +python run.py configs/eval_internlm2_lmdeploy.py -w outputs +``` diff --git a/docs/zh_cn/advanced_guides/evaluation_turbomind.md b/docs/zh_cn/advanced_guides/evaluation_turbomind.md deleted file mode 100644 index a7c37b758..000000000 --- a/docs/zh_cn/advanced_guides/evaluation_turbomind.md +++ /dev/null @@ -1,75 +0,0 @@ -# 评测 LMDeploy 模型 - -我们支持评测使用 [LMDeploy](https://github.com/InternLM/lmdeploy) 加速过的大语言模型。LMDeploy 由 MMDeploy 和 MMRazor 团队联合开发,是涵盖了 LLM 任务的全套轻量化、部署和服务解决方案。 **TurboMind** 是 LMDeploy 推出的高效推理引擎。OpenCompass 对 TurboMind 进行了适配,本教程将介绍如何使用 OpenCompass 来对 TurboMind 加速后的模型进行评测。 - -## 环境配置 - -### 安装 OpenCompass - -请根据 OpenCompass [安装指南](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) 来安装算法库和准备数据集。 - -### 安装 LMDeploy - -使用 pip 安装 LMDeploy (python 3.8+): - -```shell -pip install lmdeploy -``` - -## 评测 - -OpenCompass 支持分别通过 turbomind python API 评测数据集。 - -下文以 InternLM-20B 模型为例,介绍如何评测。首先我们准备好测试配置文件`configs/eval_internlm_turbomind.py`: - -```python -from mmengine.config import read_base -from opencompass.models.turbomind import TurboMindModel - - -with read_base(): - # choose a list of datasets - from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets - from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets - from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets - from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets - from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets - from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets - # and output the results in a chosen format - from .summarizers.medium import summarizer - -datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) - -# config for internlm-20b model -internlm_20b = dict( - type=TurboMindModel, - abbr='internlm-20b-turbomind', - path="internlm/internlm-20b", # 注意路径与huggingface保持一致 - engine_config=dict(session_len=2048, - max_batch_size=8, - rope_scaling_factor=1.0), - gen_config=dict(top_k=1, top_p=0.8, - temperature=1.0, - max_new_tokens=100), - max_out_len=100, - max_seq_len=2048, - batch_size=8, - concurrency=8, - run_cfg=dict(num_gpus=1, num_procs=1), - end_str='' - ) - -models = [internlm_20b] -``` - -然后,在 OpenCompass 的项目目录下,执行如下命令可得到评测结果: - -```shell -python run.py configs/eval_internlm_turbomind.py -w outputs/turbomind/internlm-20b -``` - -**注:** - -- 如果想在测评配置文件中`engine_config`和`gen_config`字段传递更多参数,请参考[TurbomindEngineConfig](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/pipeline.html#turbomindengineconfig) 和 [GenerationConfig](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/pipeline.html#generationconfig) -- 如果评测 InternLM Chat 模型,请使用配置文件 `eval_internlm_chat_turbomind.py` -- 如果评测 InternLM 7B 模型,请修改 `eval_internlm_turbomind.py` 或者 `eval_internlm_chat_turbomind.py`。将`models`字段配置为`models = [internlm_7b]` 。 diff --git 
a/opencompass/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py b/opencompass/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py index 60097e373..38ea39d7d 100644 --- a/opencompass/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py +++ b/opencompass/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py @@ -1,15 +1,24 @@ from opencompass.models import TurboMindModelwithChatTemplate + models = [ dict( type=TurboMindModelwithChatTemplate, - abbr='internlm2-chat-7b-turbomind', + abbr=f'internlm2-chat-7b-lmdeploy', path='internlm/internlm2-chat-7b', - engine_config=dict(session_len=8192, max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + # inference backend of LMDeploy. It can be either 'turbomind' or 'pytorch'. + # If the model is not supported by 'turbomind', it will fallback to + # 'pytorch' + backend='turbomind', + # For the detailed engine config and generation config, please refer to + # https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/messages.py + engine_config=dict(tp=1), + gen_config=dict(do_sample=False), max_seq_len=8192, max_out_len=4096, - batch_size=16, + # the max number of prompts that LMDeploy receives + # in `generate` function + batch_size=5000, run_cfg=dict(num_gpus=1), ) ] diff --git a/opencompass/models/__init__.py b/opencompass/models/__init__.py index 0f55b869c..580402d46 100644 --- a/opencompass/models/__init__.py +++ b/opencompass/models/__init__.py @@ -25,8 +25,6 @@ from .krgpt_api import KrGPT # noqa: F401 from .lightllm_api import LightllmAPI, LightllmChatAPI # noqa: F401 from .llama2 import Llama2, Llama2Chat # noqa: F401 -from .lmdeploy_pytorch import LmdeployPytorchModel # noqa: F401 -from .lmdeploy_tis import LmdeployTisModel # noqa: F401 from .minimax_api import MiniMax, MiniMaxChatCompletionV2 # noqa: F401 from .mistral_api import Mistral # noqa: F401 from .mixtral import Mixtral # noqa: F401 @@ -41,7 +39,6 @@ from .sensetime_api import SenseTime # noqa: F401 from .stepfun_api import StepFun # noqa: F401 from .turbomind import TurboMindModel # noqa: F401 -from .turbomind_tis import TurboMindTisModel # noqa: F401 from .turbomind_with_tf_above_v4_33 import \ TurboMindModelwithChatTemplate # noqa: F401 from .unigpt_api import UniGPT # noqa: F401 diff --git a/opencompass/models/lmdeploy_pytorch.py b/opencompass/models/lmdeploy_pytorch.py deleted file mode 100644 index 80924c276..000000000 --- a/opencompass/models/lmdeploy_pytorch.py +++ /dev/null @@ -1,188 +0,0 @@ -from concurrent.futures import ThreadPoolExecutor -from typing import Dict, List, Optional, Union - -from opencompass.models.base import BaseModel -from opencompass.utils.logging import get_logger -from opencompass.utils.prompt import PromptList - -PromptType = Union[PromptList, str] - - -def valid_str(string, coding='utf-8'): - """decode text according to its encoding type.""" - invalid_chars = [b'\xef\xbf\xbd'] - bstr = bytes(string, coding) - for invalid_char in invalid_chars: - bstr = bstr.replace(invalid_char, b'') - ret = bstr.decode(encoding=coding, errors='ignore') - return ret - - -class LmdeployPytorchModel(BaseModel): - """Model wrapper for lmdeploy pytorch engine through python API. - - Args: - path (str): path of the supported pytorch model. - max_seq_len (int): The maximum allowed sequence length of a model. - Note that the length of prompt + generated tokens shall not exceed - this value. Defaults to 2048. 
- meta_template (Dict, optional): The model's meta prompt - template if needed, in case the requirement of injecting or - wrapping of any meta instructions. - engine_config (Dict, optional): The engine config to set - arguments like session_len, max_batch_size for TurboMind. - gen_config (Dict, optional): Generation config to set - arguments like top_k, top_p, temperature. - end_str (str, optional): Whether to trim generated strings with end_str - if the model has special ending strings that are not handled well. - Defaults to None. - """ - - def __init__(self, - path: str, - concurrency: int = 8, - max_seq_len: int = 2048, - meta_template: Optional[Dict] = None, - engine_config: Optional[Dict] = None, - gen_config: Optional[Dict] = None, - end_str: Optional[str] = None): - super().__init__(path=path, - max_seq_len=max_seq_len, - meta_template=meta_template) - from lmdeploy.pytorch import engine as tm - from lmdeploy.version import version_info - - if engine_config is not None: - from lmdeploy.messages import PytorchEngineConfig - engine_config = PytorchEngineConfig(**engine_config) - # set thread_safe - if hasattr(engine_config, 'thread_safe'): - engine_config.thread_safe = True - - if gen_config is not None: - from lmdeploy.messages import GenerationConfig - gen_config = GenerationConfig(**gen_config) - - self.logger = get_logger() - tm_model = tm.Engine(path, engine_config) - self.tokenizer = tm_model.tokenizer - self.generators = [ - tm_model.create_instance() for i in range(concurrency) - ] - self.generator_ids = [i + 1 for i in range(concurrency)] - - from transformers import GenerationConfig - try: - generation_config = GenerationConfig.from_pretrained(path) - except Exception: - generation_config = None - if generation_config and hasattr(generation_config, 'eos_token_id'): - if gen_config.stop_words is None: - stop_words = [] - if isinstance(generation_config.eos_token_id, int): - stop_words.append(generation_config.eos_token_id) - else: - assert isinstance(generation_config.eos_token_id, list) - for token_id in generation_config.eos_token_id: - stop_words.append(token_id) - gen_config.stop_words = stop_words - if version_info >= (0, 6, 0): - gen_config.stop_token_ids = stop_words - self.gen_config = gen_config - self.end_str = end_str - self.major_version, self.minor_version = version_info[:2] - - def generate( - self, - inputs: List[str], - max_out_len: int = 512, - ) -> List[str]: - """Generate results given a list of inputs. - - Args: - inputs (List[str]): A list of prompts - max_out_len (int): The maximum length of the output. - - Returns: - List[str]: A list of generated strings. - """ - assert isinstance( - inputs, List), f'List(str) is expected, but got {type(inputs)}' - - # split inputs into batches - batch_size = len(self.generators) - batch_inputs = [ - inputs[i:i + batch_size] for i in range(0, len(inputs), batch_size) - ] - - results = [] - for batch_input in batch_inputs: - with ThreadPoolExecutor() as executor: - _results = list( - executor.map( - self._generate, - self.generators[:len(batch_input)], - self.generator_ids[:len(batch_input)], - batch_input, - [self.gen_config] * len(batch_input), - [self.end_str] * len(batch_input), - )) - results += _results - return results - - def get_token_len(self, prompt: str) -> int: - input_ids = self.tokenizer.encode(prompt) - return len(input_ids) - - def wait(self): - """Wait till the next query can be sent. - - Applicable in both single-thread and multi-thread environments. 
- """ - return self.token_bucket.get_token() - - def _generate(self, - generator, - session_id, - prompt: PromptType, - gen_config=None, - end_str: Optional[str] = None) -> str: - """Generate results given a list of inputs. - - Args: - prompt (PromptType): A string or PromptDict. - The PromptDict should be organized in OpenCompass' - API format. - gen_config (GenerationConfig, optional): Generation - config to set arguments like top_k, top_p, temperature. - end_str (str, optional): Whether to trim generated strings - with end_str if the model has special ending strings - that are not handled well. - Defaults to None. - Returns: - str: The generated string. - """ - assert type( - prompt) is str, 'We only support string for TurboMind Python API' - input_ids = self.tokenizer.encode(prompt) - if self.major_version >= 0 and self.minor_version >= 4: - outputs = generator.infer(session_id, - input_ids, - gen_config=gen_config) - output_ids = outputs.token_ids - else: - _, output_ids, _ = generator.infer(session_id, - input_ids, - gen_config=gen_config) - - # stop engine - if hasattr(generator, 'end'): - generator.end(session_id) - # decode output - response_all = self.tokenizer.decode(output_ids) - # trim output - if end_str: - response_all = response_all.split(end_str)[0] - # remove invalid characters - response_all = valid_str(response_all) - return response_all diff --git a/opencompass/models/lmdeploy_tis.py b/opencompass/models/lmdeploy_tis.py deleted file mode 100644 index 9c92ef18a..000000000 --- a/opencompass/models/lmdeploy_tis.py +++ /dev/null @@ -1,200 +0,0 @@ -import threading -from concurrent.futures import ThreadPoolExecutor -from functools import partial -from queue import Queue -from typing import Dict, List, Optional, Union - -import numpy as np - -from opencompass.models.base import BaseModel, LMTemplateParser -from opencompass.utils.logging import get_logger -from opencompass.utils.prompt import PromptList - -PromptType = Union[PromptList, str] - - -def valid_str(string, coding='utf-8'): - """decode text according to its encoding type.""" - invalid_chars = [b'\xef\xbf\xbd'] - bstr = bytes(string, coding) - for invalid_char in invalid_chars: - bstr = bstr.replace(invalid_char, b'') - ret = bstr.decode(encoding=coding, errors='ignore') - return ret - - -def prepare_tensor(name, input_tensor): - """Create grpcclient's InferInput instance according to a given tensor.""" - import tritonclient.grpc as grpcclient - from tritonclient.utils import np_to_triton_dtype - t = grpcclient.InferInput(name, list(input_tensor.shape), - np_to_triton_dtype(input_tensor.dtype)) - t.set_data_from_numpy(input_tensor) - return t - - -def stream_callback(que, result, error): - """callback function invoked by triton client.""" - que.put((result, error)) - - -class LmdeployTisModel(BaseModel): - """Model wrapper for LMDeploy Python Backend Triton Inference Server gRPC - API. - - Args: - path (str): The name of OpenAI's model. - tis_addr (str): The address (ip:port format) of turbomind's - triton inference server - max_seq_len (int): The maximum allowed sequence length of a model. - Note that the length of prompt + generated tokens shall not exceed - this value. Defaults to 2048. - meta_template (Dict, optional): The model's meta prompt - template if needed, in case the requirement of injecting or - wrapping of any meta instructions. 
- """ - - is_api: bool = True - - def __init__(self, - path: str, - tis_addr: str = '0.0.0.0:33337', - max_seq_len: int = 2048, - meta_template: Optional[Dict] = None, - end_str: Optional[str] = None): - super().__init__(path=path, - max_seq_len=max_seq_len, - meta_template=meta_template) - from lmdeploy.tokenizer import Tokenizer - - self.logger = get_logger() - self.template_parser = LMTemplateParser(meta_template) - self.eos_token_id = None - if meta_template and 'eos_token_id' in meta_template: - self.eos_token_id = meta_template['eos_token_id'] - self.tis_addr = tis_addr - self.tokenizer = Tokenizer(path) - self.end_str = end_str - - def generate( - self, - inputs: List[str or PromptList], - max_out_len: int = 512, - temperature: float = 1.0, - ) -> List[str]: - """Generate results given a list of inputs. - - Args: - inputs (List[str or PromptList]): A list of strings or PromptDicts. - The PromptDict should be organized in OpenCompass' - API format. - max_out_len (int): The maximum length of the output. - temperature (float): What sampling temperature to use, - between 0 and 2. Higher values like 0.8 will make the output - more random, while lower values like 0.2 will make it more - focused and deterministic. Defaults to 0.7. - - Returns: - List[str]: A list of generated strings. - """ - - with ThreadPoolExecutor() as executor: - results = list( - executor.map(self._generate, inputs, - [max_out_len] * len(inputs), - [temperature] * len(inputs), - [self.end_str] * len(inputs))) - return results - - def wait(self): - """Wait till the next query can be sent. - - Applicable in both single-thread and multi-thread environments. - """ - return self.token_bucket.get_token() - - def get_token_len(self, prompt: str) -> int: - input_ids = self.tokenizer.encode(prompt) - return len(input_ids) - - def _call_triton_server(self, prompt, tis_addr, session_id, - request_output_len, temperature, res_que): - import tritonclient.grpc as grpcclient - - with grpcclient.InferenceServerClient(tis_addr) as client: - inputs = [ - prepare_tensor('prompt', - np.array([prompt.encode()], dtype=np.object_)), - prepare_tensor('max_tokens', - np.array([request_output_len], dtype=np.int32)), - prepare_tensor('temperature', - np.array([temperature], dtype=np.float_)), - prepare_tensor('top_p', np.array([1.0], dtype=np.float_)), - prepare_tensor('top_k', np.array([1], dtype=np.int32)), - prepare_tensor('ignore_eos', np.array([False], - dtype=np.bool_)), - prepare_tensor('stream', np.array([True], dtype=np.bool_)), - ] - - # async_stream - client.start_stream(partial(stream_callback, res_que)) - client.async_stream_infer('lmdeploy_model', - inputs, - sequence_id=session_id, - sequence_start=True, - sequence_end=True) - - res_que.put(None) - return - - def _process_result(self, que): - text = '' - while True: - res = que.get() - if res is not None: - result, err = res - if err is not None: - print(err) - else: - res = result.as_numpy('response').item().decode() - text += res - else: - return text - - def _generate(self, - prompt: str or PromptList, - max_out_len: int, - temperature: float, - end_str: Optional[str] = None) -> str: - """Generate results given a list of inputs. - - Args: - prompt (str or PromptList): A string or PromptDict. - The PromptDict should be organized in OpenCompass' - API format. - max_out_len (int): The maximum length of the output. - temperature (float): What sampling temperature to use, - between 0 and 2. 
Higher values like 0.8 will make the output - more random, while lower values like 0.2 will make it more - focused and deterministic. - - Returns: - str: The generated string. - """ - assert type( - prompt - ) is str, 'We only support string for LMDeploy Python Backend TIS API' - - res_que = Queue() - - self._call_triton_server(prompt=prompt, - tis_addr=self.tis_addr, - session_id=threading.currentThread().ident, - request_output_len=max_out_len, - temperature=temperature, - res_que=res_que) - text = self._process_result(res_que) - response = valid_str(text) - if end_str: - response = response.split(end_str)[0] - return response diff --git a/opencompass/models/turbomind_tis.py b/opencompass/models/turbomind_tis.py deleted file mode 100644 index 8541b9de5..000000000 --- a/opencompass/models/turbomind_tis.py +++ /dev/null @@ -1,135 +0,0 @@ -import logging -import threading -from concurrent.futures import ThreadPoolExecutor -from typing import Dict, List, Optional, Union - -from opencompass.models.base import BaseModel, LMTemplateParser -from opencompass.utils.logging import get_logger -from opencompass.utils.prompt import PromptList - -PromptType = Union[PromptList, str] - - -def valid_str(string, coding='utf-8'): - """decode text according to its encoding type.""" - invalid_chars = [b'\xef\xbf\xbd'] - bstr = bytes(string, coding) - for invalid_char in invalid_chars: - bstr = bstr.replace(invalid_char, b'') - ret = bstr.decode(encoding=coding, errors='ignore') - return ret - - -class TurboMindTisModel(BaseModel): - """Model wrapper for TurboMind Triton Inference Server gRPC API. - - Args: - path (str): The name of OpenAI's model. - tis_addr (str): The address (ip:port format) of turbomind's - triton inference server - max_seq_len (int): The maximum allowed sequence length of a model. - Note that the length of prompt + generated tokens shall not exceed - this value. Defaults to 2048. - meta_template (Dict, optional): The model's meta prompt - template if needed, in case the requirement of injecting or - wrapping of any meta instructions. - """ - - is_api: bool = True - - def __init__( - self, - path: str, - tis_addr: str = '0.0.0.0:33337', - max_seq_len: int = 2048, - meta_template: Optional[Dict] = None, - ): - super().__init__(path=path, - max_seq_len=max_seq_len, - meta_template=meta_template) - from lmdeploy.serve.turbomind.utils import Preprocessor - self.preprocess = Preprocessor(tis_addr) - self.logger = get_logger() - self.template_parser = LMTemplateParser(meta_template) - self.eos_token_id = None - if meta_template and 'eos_token_id' in meta_template: - self.eos_token_id = meta_template['eos_token_id'] - self.tis_addr = tis_addr - - def generate( - self, - inputs: List[PromptType], - max_out_len: int = 512, - temperature: float = 1.0, - ) -> List[str]: - """Generate results given a list of inputs. - - Args: - inputs (List[PromptType]): A list of strings or PromptDicts. - The PromptDict should be organized in OpenCompass' - API format. - max_out_len (int): The maximum length of the output. - temperature (float): What sampling temperature to use, - between 0 and 2. Higher values like 0.8 will make the output - more random, while lower values like 0.2 will make it more - focused and deterministic. Defaults to 0.7. - - Returns: - List[str]: A list of generated strings. 
- """ - - with ThreadPoolExecutor() as executor: - results = list( - executor.map(self._generate, inputs, - [max_out_len] * len(inputs), - [temperature] * len(inputs))) - return results - - def get_token_len(self, prompt: str) -> int: - input_ids, _ = self.preprocess(prompt) - return input_ids.shape[-1] - - def wait(self): - """Wait till the next query can be sent. - - Applicable in both single-thread and multi-thread environments. - """ - return self.token_bucket.get_token() - - def _generate(self, prompt: PromptType, max_out_len: int, - temperature: float) -> str: - """Generate results given a list of inputs. - - Args: - prompt (PromptType): A string or PromptDict. - The PromptDict should be organized in OpenCompass' - API format. - max_out_len (int): The maximum length of the output. - temperature (float): What sampling temperature to use, - between 0 and 2. Higher values like 0.8 will make the output - more random, while lower values like 0.2 will make it more - focused and deterministic. - - Returns: - str: The generated string. - """ - assert type( - prompt) is str, 'We only support string for TurboMind RPC API' - - from lmdeploy.serve.turbomind.chatbot import Chatbot - chatbot = Chatbot(self.tis_addr, - temperature=temperature, - capability='completion', - top_k=1, - log_level=logging.ERROR) - - for status, text, n_token in chatbot.stream_infer( - session_id=threading.currentThread().ident, - prompt=prompt, - request_output_len=max_out_len, - sequence_start=True, - sequence_end=True): - continue - response = valid_str(text) - response = response.replace('', '') - return response diff --git a/opencompass/models/turbomind_with_tf_above_v4_33.py b/opencompass/models/turbomind_with_tf_above_v4_33.py index 48706671f..ab6801c9c 100644 --- a/opencompass/models/turbomind_with_tf_above_v4_33.py +++ b/opencompass/models/turbomind_with_tf_above_v4_33.py @@ -1,7 +1,6 @@ # flake8: noqa # yapf: disable import copy -from concurrent.futures import ThreadPoolExecutor from typing import Dict, List, Optional, Union from opencompass.models.base import BaseModel @@ -31,38 +30,32 @@ def __init__( self, path: str, tokenizer_only: bool = False, + backend: str = 'turbomind', engine_config: Dict = {}, gen_config: Dict = {}, - concurrency: int = 8, max_seq_len: int = None, meta_template: Optional[Dict] = None, fastchat_template: Optional[str] = None, stop_words: List[str] = [], ): - from lmdeploy.messages import TurbomindEngineConfig - from lmdeploy.turbomind import TurboMind - from lmdeploy.version import version_info - from transformers import AutoTokenizer - self.logger = get_logger() self.path = path self.tokenizer_only = tokenizer_only self.template_parser = _get_meta_template(meta_template) self.max_seq_len = _get_possible_max_seq_len(max_seq_len, path) - self.origin_tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True) + from lmdeploy import version_info + from transformers import AutoTokenizer + self.version_info = version_info + self.tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True) if not tokenizer_only: DEFAULT_ENGING_CONFIG = {'session_len': self.max_seq_len} _engine_config = DEFAULT_ENGING_CONFIG.copy() _engine_config.update(engine_config) - engine_config = TurbomindEngineConfig(**_engine_config) - tm_model = TurboMind.from_pretrained(path, engine_config=engine_config) - self.tokenizer = tm_model.tokenizer - self.generators = [tm_model.create_instance() for i in range(concurrency)] - self.generator_ids = [i + 1 for i in range(concurrency)] - 
self.concurrency = concurrency + self.pipe = self._build_pipe(path, backend, _engine_config) + else: + self.pipe = None self.gen_config = gen_config - self.version_info = version_info self.fastchat_template = fastchat_template self.stop_words = list(set(stop_words + self._get_potential_stop_words(path))) self.logger.info(f'using stop words: {self.stop_words}') @@ -76,23 +69,23 @@ def _get_potential_stop_words(self, path: Optional[str]): generation_config = None if generation_config and hasattr(generation_config, 'eos_token_id'): if isinstance(generation_config.eos_token_id, int): - potential_stop_words.append(self.origin_tokenizer.decode(generation_config.eos_token_id)) + potential_stop_words.append(self.tokenizer.decode(generation_config.eos_token_id)) else: assert isinstance(generation_config.eos_token_id, list) for token_id in generation_config.eos_token_id: - potential_stop_words.append(self.origin_tokenizer.decode(token_id)) - if self.origin_tokenizer.eos_token is not None: - potential_stop_words.append(self.origin_tokenizer.eos_token) + potential_stop_words.append(self.tokenizer.decode(token_id)) + if self.tokenizer.eos_token is not None: + potential_stop_words.append(self.tokenizer.eos_token) potential_stop_words = list(set(potential_stop_words)) potential_stop_words = [s for s in potential_stop_words if s] return potential_stop_words def generate(self, inputs: List[str], - max_out_len: int = 512, + max_out_len: int, stopping_criteria: List[str] = [], do_sample: Optional[bool] = None, - temperature: int = 1, + temperature: float = 1.0, **kwargs) -> List[str]: """Generate results given a list of inputs. @@ -104,93 +97,45 @@ def generate(self, List[str]: A list of generated strings. """ assert isinstance(inputs, List), f'List(str) is expected, but got {type(inputs)}' - messages = _convert_chat_messages(inputs) if self.fastchat_template: messages = _format_with_fast_chat_template(messages, self.fastchat_template) else: - messages = [self.origin_tokenizer.apply_chat_template(m, add_generation_prompt=True, tokenize=False) for m in messages] - - # split messages into batches - batch_messages = [messages[i:i + self.concurrency] for i in range(0, len(messages), self.concurrency)] + messages = [self.tokenizer.apply_chat_template(m, add_generation_prompt=True, tokenize=False) for m in messages] stop_words = list(set(self.stop_words + stopping_criteria)) - encode_stop_words = [] - if stop_words is not None and len(stop_words) > 0: - for words in stop_words: - encode_stop_words += self.tokenizer.encode(words, add_bos=False) DEFAULT_GEN_CONFIG = { 'max_new_tokens': max_out_len, 'min_new_tokens': 1, - 'top_k': 1, - 'stop_words': encode_stop_words, + 'stop_words': stop_words, } gen_config = copy.deepcopy(DEFAULT_GEN_CONFIG) gen_config.update(self.gen_config) if do_sample: - gen_config['top_k'] = 1000 + gen_config['top_k'] = 40 gen_config['temperature'] = temperature + else: + if self.version_info >= (0, 6, 0): + gen_config['do_sample'] = False + else: + gen_config['top_k'] = 1 - from lmdeploy.messages import GenerationConfig + from lmdeploy import GenerationConfig + gen_config = {k: v for k, v in gen_config.items() if hasattr(GenerationConfig, k)} gen_config = GenerationConfig(**gen_config) - if self.version_info >= (0, 6, 0): - gen_config.stop_words = stop_words - gen_config.convert_stop_bad_words_to_ids(self.tokenizer) results = [] - for batch_message in batch_messages: - n = len(batch_message) - with ThreadPoolExecutor() as executor: - _results = list( - executor.map( - self._generate, - 
self.generators[:n], - self.generator_ids[:n], - batch_message, - [gen_config] * n, - )) - results += _results + outputs = self.pipe(messages, gen_config=gen_config, do_preprocess=False) + for output in outputs: + text = self.tokenizer.decode(output.token_ids) + results.append(text) for s in stop_words: results = [r.split(s)[0] for r in results] return results - def _generate(self, - generator, - session_id, - prompt: PromptType, - gen_config=None) -> str: - """Generate results given a list of inputs. - - Args: - prompt (PromptType): A string or PromptDict. - The PromptDict should be organized in OpenCompass' - API format. - gen_config (GenerationConfig, optional): Generation - config to set arguments like top_k, top_p, temperature. - Returns: - str: The generated string. - """ - assert type(prompt) is str, 'We only support string for TurboMind Python API' - - input_ids = self.tokenizer.encode(prompt, add_bos=False) - for outputs in generator.stream_infer(session_id=session_id, - input_ids=[input_ids], - gen_config=gen_config, - sequence_start=True, - sequence_end=True, - step=0, - stream_output=False): - if self.version_info >= (0, 4, 0): - output_ids = outputs.token_ids - else: - _, output_ids, _ = outputs - response = self.tokenizer.decode(output_ids) - response = valid_str(response) - return response - def get_token_len(self, prompt: str) -> int: """Get lengths of the tokenized strings. @@ -201,5 +146,20 @@ def get_token_len(self, prompt: str) -> int: int: Length of the input tokens """ m = _convert_chat_messages([prompt])[0] - t = self.origin_tokenizer.apply_chat_template(m, add_generation_prompt=True, return_dict=True) + t = self.tokenizer.apply_chat_template(m, add_generation_prompt=True, return_dict=True) return len(t['input_ids']) + + def _build_pipe(self, model_path, backend, engine_config): + from lmdeploy import (PytorchEngineConfig, TurbomindEngineConfig, + pipeline) + + assert backend in ['pytorch', 'turbomind'], \ + f'unsupported backend type: {backend}' + + if backend == 'turbomind': + filtered = {k: v for k, v in engine_config.items() if hasattr(TurbomindEngineConfig, k)} + backend_config = TurbomindEngineConfig(**filtered) + else: + filtered = {k: v for k, v in engine_config.items() if hasattr(PytorchEngineConfig, k)} + backend_config = PytorchEngineConfig(**filtered) + return pipeline(model_path, backend_config=backend_config, log_level='INFO', max_log_len=10) diff --git a/opencompass/utils/run.py b/opencompass/utils/run.py index 67c465941..025efc4b3 100644 --- a/opencompass/utils/run.py +++ b/opencompass/utils/run.py @@ -9,7 +9,7 @@ from opencompass.datasets.custom import make_custom_dataset_config from opencompass.models import (VLLM, HuggingFace, HuggingFaceBaseModel, HuggingFaceCausalLM, HuggingFaceChatGLM3, - HuggingFacewithChatTemplate, TurboMindModel, + HuggingFacewithChatTemplate, TurboMindModelwithChatTemplate, VLLMwithChatTemplate) from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner @@ -233,7 +233,7 @@ def change_accelerator(models, accelerator): model_accels = [] for model in models: logger.info(f'Transforming {model["abbr"]} to {accelerator}') - # change HuggingFace model to VLLM or TurboMindModel + # change HuggingFace model to VLLM or LMDeploy if model['type'] in [HuggingFace, HuggingFaceCausalLM, HuggingFaceChatGLM3, f'{HuggingFaceBaseModel.__module__}.{HuggingFaceBaseModel.__name__}']: gen_args = dict() if model.get('generation_kwargs') is not None: @@ -254,10 +254,10 @@ def change_accelerator(models, accelerator): if accelerator 
== 'lmdeploy': logger.info(f'Transforming {model["abbr"]} to {accelerator}') - mod = TurboMindModel + mod = TurboMindModelwithChatTemplate acc_model = dict( type=f'{mod.__module__}.{mod.__name__}', - abbr=model['abbr'].replace('hf', 'turbomind') if '-hf' in model['abbr'] else model['abbr'] + '-turbomind', + abbr=model['abbr'].replace('hf', 'lmdeploy') if '-hf' in model['abbr'] else model['abbr'] + '-lmdeploy', path=model['path'], engine_config=dict(session_len=model['max_seq_len'], max_batch_size=model['batch_size'], @@ -270,7 +270,6 @@ def change_accelerator(models, accelerator): max_out_len=model['max_out_len'], max_seq_len=model['max_seq_len'], batch_size=model['batch_size'], - concurrency=model['batch_size'], run_cfg=model['run_cfg'], ) for item in ['meta_template']: @@ -312,7 +311,7 @@ def change_accelerator(models, accelerator): mod = TurboMindModelwithChatTemplate acc_model = dict( type=f'{mod.__module__}.{mod.__name__}', - abbr=model['abbr'].replace('hf', 'turbomind') if '-hf' in model['abbr'] else model['abbr'] + '-turbomind', + abbr=model['abbr'].replace('hf', 'lmdeploy') if '-hf' in model['abbr'] else model['abbr'] + '-lmdeploy', path=model['path'], engine_config=dict(max_batch_size=model.get('batch_size', 16), tp=model['run_cfg']['num_gpus']), gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9),
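The `change_accelerator` branch above backs the `-a lmdeploy` command-line flag (the CI workflow earlier in this series invokes it as `opencompass --models hf_internlm2_5_7b_chat --datasets race_gen -a lmdeploy ...`). As a rough sketch — the abbr and path below are hypothetical placeholders, and only the fields visible in this diff are listed — the converted entry for a `HuggingFacewithChatTemplate` model takes this shape:

```python
# Sketch of the config dict produced for accelerator == 'lmdeploy'.
# Values are illustrative placeholders, not actual output of change_accelerator.
from opencompass.models import TurboMindModelwithChatTemplate

mod = TurboMindModelwithChatTemplate
lmdeploy_model = dict(
    type=f'{mod.__module__}.{mod.__name__}',
    # 'hf' in the original abbr is replaced with 'lmdeploy'
    abbr='internlm2_5-7b-chat-lmdeploy',
    path='internlm/internlm2_5-7b-chat',
    engine_config=dict(max_batch_size=16, tp=1),
    gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9),
)
```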