From 83eeb52b0946a2151db71adeaec629a0f13f0a7a Mon Sep 17 00:00:00 2001 From: liushz Date: Wed, 25 Sep 2024 11:26:36 +0800 Subject: [PATCH 01/20] [Feature] Update WikiBench base model config (#1553) * Update MathBench & WikiBench for FullBench * Update MathBench & WikiBench for FullBench * Update GPQA & MMLU_Pro * Update MathBench & WikiBench for FullBench * Update MathBench & WikiBench for FullBench * Update MathBench & WikiBench for FullBench * Update MathBench & Math base config * Update WikiBench base model config --------- Co-authored-by: liushz --- .../wikibench_few_shot_ppl_c23d79.py | 73 +++++++++++++++++++ .../wikibench_few_shot_ppl_c23d79.py | 73 +++++++++++++++++++ 2 files changed, 146 insertions(+) create mode 100644 configs/datasets/wikibench/wikibench_few_shot_ppl_c23d79.py create mode 100644 opencompass/configs/datasets/wikibench/wikibench_few_shot_ppl_c23d79.py diff --git a/configs/datasets/wikibench/wikibench_few_shot_ppl_c23d79.py b/configs/datasets/wikibench/wikibench_few_shot_ppl_c23d79.py new file mode 100644 index 000000000..0669bd7b9 --- /dev/null +++ b/configs/datasets/wikibench/wikibench_few_shot_ppl_c23d79.py @@ -0,0 +1,73 @@ +import copy + +from opencompass.datasets import WikiBenchDataset +from opencompass.openicl.icl_evaluator import AccEvaluator, CircularEvaluator +from opencompass.openicl.icl_inferencer import PPLInferencer +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever + +single_choice_prompts = { + 'single_choice_cn': [ + dict(role='HUMAN', + prompt='问题: 白色念珠菌常被用作哪种生物的研究模式?\nA. 病毒\nB. 细菌\nC. 真菌\nD. 寄生虫'), + dict(role='BOT', prompt='回答: C'), + dict( + role='HUMAN', + prompt='问题: 星期五广场(荷兰语:Vrijdagmarkt;荷兰语发音: )是比利时根特老城的一个城市广场。 星期五广场下方有一个什么设施?\nA. 游乐场\nB. 地下停车场\nC. 公园\nD. 地下商场' # noqa: E501 + ), + dict(role='BOT', prompt='回答: B'), + dict( + role='HUMAN', + prompt='问题: 尔迪雷·巴斯杜克代表土耳其国家队出场的次数?\nA. 60次\nB. 35次\nC. 49次\nD. 20次' + ), + dict(role='BOT', prompt='回答: C'), + dict( + role='HUMAN', + prompt='问题: 陈酆被任命为漳州刺史是因为什么原因?\nA. 朝廷认为他有能力担任该职务\nB. 漳州人怀念陈元光、陈伯珙的政绩\nC. 他是陈伯珙的儿子\nD. 他是陈元光的孙子' # noqa: E501 + ), + dict(role='BOT', prompt='回答: B'), + dict(role='HUMAN', + prompt='问题: 丹徒县在1928年改名为什么?\nA. 苏州市\nB. 润州县\nC. 镇江县\nD. 
丹阳县'), + dict(role='BOT', prompt='回答: C'), + dict(role='HUMAN', prompt='问题: {question}'), + dict(role='BOT', prompt='回答: {answer}'), + ] +} + +wikibench_sets = { + 'wiki': ['single_choice_cn'], +} + +do_circular = True + +wikibench_datasets = [] + +for _split in list(wikibench_sets.keys()): + for _name in wikibench_sets[_split]: + template = {} + for answer in ['A', 'B', 'C', 'D']: + one_template_round = copy.deepcopy(single_choice_prompts[_name]) + one_template_round[-1]['prompt'] = one_template_round[-1][ + 'prompt'].format(answer=answer) + template[answer] = dict(round=one_template_round) + wikibench_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=template), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer), + ) + wikibench_eval_cfg = dict(evaluator=dict( + type=CircularEvaluator if do_circular else AccEvaluator), ) + wikibench_datasets.append( + dict( + type=WikiBenchDataset, + path=f'./data/WikiBench/{_name}.jsonl', + name='circular_' + _name if do_circular else _name, + abbr='wikibench-' + _split + '-' + _name + + 'circular' if do_circular else '', + reader_cfg=dict( + input_columns=['question'], + output_column='answer', + ), + infer_cfg=wikibench_infer_cfg, + eval_cfg=wikibench_eval_cfg, + )) diff --git a/opencompass/configs/datasets/wikibench/wikibench_few_shot_ppl_c23d79.py b/opencompass/configs/datasets/wikibench/wikibench_few_shot_ppl_c23d79.py new file mode 100644 index 000000000..0669bd7b9 --- /dev/null +++ b/opencompass/configs/datasets/wikibench/wikibench_few_shot_ppl_c23d79.py @@ -0,0 +1,73 @@ +import copy + +from opencompass.datasets import WikiBenchDataset +from opencompass.openicl.icl_evaluator import AccEvaluator, CircularEvaluator +from opencompass.openicl.icl_inferencer import PPLInferencer +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever + +single_choice_prompts = { + 'single_choice_cn': [ + dict(role='HUMAN', + prompt='问题: 白色念珠菌常被用作哪种生物的研究模式?\nA. 病毒\nB. 细菌\nC. 真菌\nD. 寄生虫'), + dict(role='BOT', prompt='回答: C'), + dict( + role='HUMAN', + prompt='问题: 星期五广场(荷兰语:Vrijdagmarkt;荷兰语发音: )是比利时根特老城的一个城市广场。 星期五广场下方有一个什么设施?\nA. 游乐场\nB. 地下停车场\nC. 公园\nD. 地下商场' # noqa: E501 + ), + dict(role='BOT', prompt='回答: B'), + dict( + role='HUMAN', + prompt='问题: 尔迪雷·巴斯杜克代表土耳其国家队出场的次数?\nA. 60次\nB. 35次\nC. 49次\nD. 20次' + ), + dict(role='BOT', prompt='回答: C'), + dict( + role='HUMAN', + prompt='问题: 陈酆被任命为漳州刺史是因为什么原因?\nA. 朝廷认为他有能力担任该职务\nB. 漳州人怀念陈元光、陈伯珙的政绩\nC. 他是陈伯珙的儿子\nD. 他是陈元光的孙子' # noqa: E501 + ), + dict(role='BOT', prompt='回答: B'), + dict(role='HUMAN', + prompt='问题: 丹徒县在1928年改名为什么?\nA. 苏州市\nB. 润州县\nC. 镇江县\nD. 
丹阳县'), + dict(role='BOT', prompt='回答: C'), + dict(role='HUMAN', prompt='问题: {question}'), + dict(role='BOT', prompt='回答: {answer}'), + ] +} + +wikibench_sets = { + 'wiki': ['single_choice_cn'], +} + +do_circular = True + +wikibench_datasets = [] + +for _split in list(wikibench_sets.keys()): + for _name in wikibench_sets[_split]: + template = {} + for answer in ['A', 'B', 'C', 'D']: + one_template_round = copy.deepcopy(single_choice_prompts[_name]) + one_template_round[-1]['prompt'] = one_template_round[-1][ + 'prompt'].format(answer=answer) + template[answer] = dict(round=one_template_round) + wikibench_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=template), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer), + ) + wikibench_eval_cfg = dict(evaluator=dict( + type=CircularEvaluator if do_circular else AccEvaluator), ) + wikibench_datasets.append( + dict( + type=WikiBenchDataset, + path=f'./data/WikiBench/{_name}.jsonl', + name='circular_' + _name if do_circular else _name, + abbr='wikibench-' + _split + '-' + _name + + 'circular' if do_circular else '', + reader_cfg=dict( + input_columns=['question'], + output_column='answer', + ), + infer_cfg=wikibench_infer_cfg, + eval_cfg=wikibench_eval_cfg, + )) From 17eefc0e1e90c4cd669d1cb840456d1aa7ffb48d Mon Sep 17 00:00:00 2001 From: Chuanyang Jin <68135125+chuanyangjin@users.noreply.github.com> Date: Tue, 24 Sep 2024 23:27:17 -0400 Subject: [PATCH 02/20] [Fix] Correct typos (#1561) --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 98f640694..2da411958 100644 --- a/README.md +++ b/README.md @@ -594,7 +594,7 @@ OpenCompass is a one-stop platform for large model evaluation, aiming to provide ## 🔜 Roadmap - [x] Subjective Evaluation - - [x] Release CompassAreana + - [x] Release CompassAreana. - [x] Subjective evaluation. - [x] Long-context - [x] Long-context evaluation with extensive datasets. @@ -603,10 +603,10 @@ OpenCompass is a one-stop platform for large model evaluation, aiming to provide - [ ] Coding evaluation leaderboard. - [x] Non-python language evaluation service. - [x] Agent - - [ ] Support various agenet framework. + - [ ] Support various agent frameworks. - [x] Evaluation of tool use of the LLMs. - [x] Robustness - - [x] Support various attack method + - [x] Support various attack methods. 
## 👷‍♂️ Contributing From fe84bbd9a048f26bd8dd19dc3236566758b7135b Mon Sep 17 00:00:00 2001 From: Songyang Zhang Date: Wed, 25 Sep 2024 11:36:43 +0800 Subject: [PATCH 03/20] [Feature] Add Config for CoreBench (#1547) * [Feature] Add Config for CoreBench * Update --- configs/eval_corebench_2409_longcontext.py | 138 ++++++++++++++ configs/eval_corebench_2409_objective.py | 208 +++++++++++++++++++++ configs/eval_corebench_2409_subjective.py | 134 +++++++++++++ 3 files changed, 480 insertions(+) create mode 100644 configs/eval_corebench_2409_longcontext.py create mode 100644 configs/eval_corebench_2409_objective.py create mode 100644 configs/eval_corebench_2409_subjective.py diff --git a/configs/eval_corebench_2409_longcontext.py b/configs/eval_corebench_2409_longcontext.py new file mode 100644 index 000000000..718044d2a --- /dev/null +++ b/configs/eval_corebench_2409_longcontext.py @@ -0,0 +1,138 @@ +import os.path as osp +from copy import deepcopy + +from mmengine.config import read_base +from opencompass.models import (HuggingFacewithChatTemplate, + TurboMindModelwithChatTemplate) +from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner +from opencompass.runners import DLCRunner, LocalRunner +from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask + + +####################################################################### +# PART 0 Essential Configs # +####################################################################### +with read_base(): + from opencompass.configs.datasets.longbench.longbench import \ + longbench_datasets + from opencompass.configs.datasets.needlebench.needlebench_8k.needlebench_8k import \ + needlebench_datasets as needlebench_8k_datasets + from opencompass.configs.datasets.needlebench.needlebench_32k.needlebench_32k import \ + needlebench_datasets as needlebench_32k_datasets + from opencompass.configs.datasets.needlebench.needlebench_128k.needlebench_128k import \ + needlebench_datasets as needlebench_128k_datasets + from opencompass.configs.datasets.ruler.ruler_8k_gen import \ + ruler_datasets as ruler_8k_datasets + from opencompass.configs.datasets.ruler.ruler_32k_gen import \ + ruler_datasets as ruler_32k_datasets + from opencompass.configs.datasets.ruler.ruler_128k_gen import \ + ruler_datasets as ruler_128k_datasets + # Summary Groups + from opencompass.configs.summarizers.groups.longbench import \ + longbench_summary_groups + from opencompass.configs.summarizers.groups.ruler import \ + ruler_summary_groups + from opencompass.configs.summarizers.needlebench import ( + needlebench_8k_summarizer, needlebench_32k_summarizer, + needlebench_128k_summarizer) + + # Instruct models + from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \ + models as lmdeploy_qwen2_7b_instruct_model + + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat_1m import \ + models as lmdeploy_internlm2_5_7b_1m_chat_model + from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \ + models as llama3_1_8b_instruct_model + + +####################################################################### +# PART 1 Datasets List # +####################################################################### +# datasets list for evaluation +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + + +####################################################################### +# PART 2 Datset Summarizer # +####################################################################### 
+needlebench_8k_summary_groups = needlebench_8k_summarizer['summary_groups'] +needlebench_32k_summary_groups = needlebench_32k_summarizer['summary_groups'] +needlebench_128k_summary_groups = needlebench_128k_summarizer['summary_groups'] + +# Instruct models summarizer +summarizer = dict( + dataset_abbrs=[ + ['ruler_8k', 'naive_average'], + ['ruler_32k', 'naive_average'], + ['ruler_128k', 'naive_average'], + ['NeedleBench-Overall-Score-8K', 'weighted_average'], + ['NeedleBench-Overall-Score-32K', 'weighted_average'], + ['NeedleBench-Overall-Score-128K', 'weighted_average'], + ['longbench', 'naive_average'], + ['longbench_zh', 'naive_average'], + ['longbench_en', 'naive_average'], + '', + 'longbench_single-document-qa', + 'longbench_multi-document-qa', + 'longbench_summarization', + 'longbench_few-shot-learning', + 'longbench_synthetic-tasks', + 'longbench_code-completion', + ], + summary_groups=sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], []), +) + + +####################################################################### +# PART 3 Models List # +####################################################################### + +lmdeploy_qwen2_7b_instruct_model[0]['max_seq_len'] = 1048576 +lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['session_len'] = 1048576 +lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['tp'] = 4 +lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['rope_scaling_factor'] = 4 +lmdeploy_qwen2_7b_instruct_model[0]['run_cfg']['num_gpus'] = 4 + +llama3_1_8b_instruct_model[0]['max_seq_len'] = 1048576 +llama3_1_8b_instruct_model[0]['engine_config']['session_len'] = 1048576 +llama3_1_8b_instruct_model[0]['engine_config']['tp'] = 4 +llama3_1_8b_instruct_model[0]['engine_config']['rope_scaling_factor'] = 4 +llama3_1_8b_instruct_model[0]['run_cfg']['num_gpus'] = 4 + +models = sum([v for k, v in locals().items() if k.endswith('_model')], []) + +####################################################################### +# PART 4 Inference/Evaluation Configuaration # +####################################################################### + +# Local Runner +infer = dict( + partitioner=dict( + type=NumWorkerPartitioner, + num_worker=8 + ), + runner=dict( + type=LocalRunner, + max_num_workers=16, + retry=0, # Modify if needed + task=dict(type=OpenICLInferTask) + ), +) + +# eval with local runner +eval = dict( + partitioner=dict(type=NaivePartitioner, n=10), + runner=dict( + type=LocalRunner, + max_num_workers=16, + task=dict(type=OpenICLEvalTask)), +) + + +####################################################################### +# PART 5 Utils Configuaration # +####################################################################### +base_exp_dir = 'outputs/corebench/' +work_dir = osp.join(base_exp_dir, 'long_context') diff --git a/configs/eval_corebench_2409_objective.py b/configs/eval_corebench_2409_objective.py new file mode 100644 index 000000000..e14c52472 --- /dev/null +++ b/configs/eval_corebench_2409_objective.py @@ -0,0 +1,208 @@ +from mmengine.config import read_base +import os.path as osp +from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner +from opencompass.runners import LocalRunner +from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask + + +####################################################################### +# PART 0 Essential Configs # +####################################################################### +with read_base(): + # Datasets Part + ## Core Set + # ## Examination + from 
opencompass.configs.datasets.mmlu.mmlu_openai_simple_evals_gen_b618ea import mmlu_datasets + from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import mmlu_pro_datasets + from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import cmmlu_datasets + + # ## Reasoning + from opencompass.configs.datasets.bbh.bbh_gen_4a31fa import bbh_datasets + # TODO: Add HellaSwag + # TODO: Add DROP + + # ## Math + from opencompass.configs.datasets.math.math_0shot_gen_393424 import math_datasets + # TODO: Add GSM8K + # TODO: Add MathBench + + # ## Scientific + from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets + + # ## Coding + from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets + # TODO: Add MBPP + # TODO: Add LiveCodeBench + + # ## Instruction Following + from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets + + # Summarizer + from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups + from opencompass.configs.summarizers.groups.mmlu_pro import mmlu_pro_summary_groups + from opencompass.configs.summarizers.groups.cmmlu import cmmlu_summary_groups + from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups + + + # Model List + # from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model + # from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model + # from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import models as hf_minicpm_2b_sft_bf16_model + # from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as hf_yi_1_5_6b_chat_model + # from opencompass.configs.models.gemma.hf_gemma_2b_it import models as hf_gemma_2b_it_model + # from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as hf_yi_1_5_34b_chat_model + +####################################################################### +# PART 1 Datasets List # +####################################################################### +# datasets list for evaluation +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + + +####################################################################### +# PART 2 Datset Summarizer # +####################################################################### +# with read_base(): + +core_summary_groups = [ + { + 'name': 'core_average', + 'subsets': [ + ['mmlu', 'accuracy'], + ['mmlu_pro', 'accuracy'], + # ['cmmlu', 'naive_average'], + ['cmmlu', 'accuracy'], + ['bbh', 'score'], + ['math', 'accuracy'], + ['openai_humaneval', 'humaneval_pass@1'], + ['GPQA_diamond', 'accuracy'], + ['IFEval', 'Prompt-level-strict-accuracy'], + ], + }, +] + +summarizer = dict( + dataset_abbrs=[ + ['core_average', 'naive_average'], + ['mmlu', 'accuracy'], + ['mmlu_pro', 'accuracy'], + ['cmmlu', 'accuracy'], + ['bbh', 'score'], + ['math', 'accuracy'], + ['openai_humaneval', 'humaneval_pass@1'], + ['GPQA_diamond', 'accuracy'], + ['IFEval', 'Prompt-level-strict-accuracy'], + '', + + ['mmlu', 'accuracy'], + ['mmlu-stem', 'accuracy'], + ['mmlu-social-science', 'accuracy'], + ['mmlu-humanities', 'accuracy'], + ['mmlu-other', 'accuracy'], + + '', + ['mmlu_pro', 'accuracy'], + ['mmlu_pro_math','accuracy'], + ['mmlu_pro_physics', 'accuracy'], + ['mmlu_pro_chemistry', 'accuracy'], + ['mmlu_pro_law', 'accuracy'], + ['mmlu_pro_engineering', 'accuracy'], + ['mmlu_pro_other', 'accuracy'], + 
['mmlu_pro_economics', 'accuracy'], + ['mmlu_pro_health', 'accuracy'], + ['mmlu_pro_psychology', 'accuracy'], + ['mmlu_pro_business', 'accuracy'], + ['mmlu_pro_biology', 'accuracy'], + ['mmlu_pro_philosophy', 'accuracy'], + ['mmlu_pro_computer_science','accuracy'], + ['mmlu_pro_history', 'accuracy'], + '', + ['cmmlu', 'accuracy'], + ['cmmlu-stem', 'accuracy'], + ['cmmlu-social-science', 'accuracy'], + ['cmmlu-humanities', 'accuracy'], + ['cmmlu-other', 'accuracy'], + ['cmmlu-china-specific', 'accuracy'], + '', + ['bbh', 'extract_rate'], + ['math', 'extract_rate'], + # ['openai_humaneval', 'extract_rate'], + ['GPQA_diamond', 'extract_rate'], + # ['IFEval', 'extract_rate'], + '', + ['mmlu', 'extract_rate'], + ['mmlu-stem', 'extract_rate'], + ['mmlu-social-science', 'extract_rate'], + ['mmlu-humanities', 'extract_rate'], + ['mmlu-other', 'extract_rate'], + '', + ['mmlu_pro', 'extract_rate'], + ['mmlu_pro_math', 'extract_rate'], + ['mmlu_pro_physics', 'extract_rate'], + ['mmlu_pro_chemistry', 'extract_rate'], + ['mmlu_pro_law', 'extract_rate'], + ['mmlu_pro_engineering', 'extract_rate'], + ['mmlu_pro_other', 'extract_rate'], + ['mmlu_pro_economics', 'extract_rate'], + ['mmlu_pro_health', 'extract_rate'], + ['mmlu_pro_psychology', 'extract_rate'], + ['mmlu_pro_business', 'extract_rate'], + ['mmlu_pro_biology', 'extract_rate'], + ['mmlu_pro_philosophy', 'extract_rate'], + ['mmlu_pro_computer_science', 'extract_rate'], + ['mmlu_pro_history', 'extract_rate'], + '', + ['cmmlu', 'extract_rate'], + ['cmmlu-stem', 'extract_rate'], + ['cmmlu-social-science', 'extract_rate'], + ['cmmlu-humanities', 'extract_rate'], + ['cmmlu-other', 'extract_rate'], + ['cmmlu-china-specific', 'extract_rate'], + + ], + summary_groups=sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], []), +) + + +####################################################################### +# PART 3 Models List # +####################################################################### + +models = sum([v for k, v in locals().items() if k.endswith('_model')], []) + + + +####################################################################### +# PART 4 Inference/Evaluation Configuaration # +####################################################################### + +# Local Runner +infer = dict( + partitioner=dict( + type=NumWorkerPartitioner, + num_worker=8 + ), + runner=dict( + type=LocalRunner, + max_num_workers=16, + retry=0, # Modify if needed + task=dict(type=OpenICLInferTask) + ), +) + +# eval with local runner +eval = dict( + partitioner=dict(type=NaivePartitioner, n=10), + runner=dict( + type=LocalRunner, + max_num_workers=16, + task=dict(type=OpenICLEvalTask)), +) + + +####################################################################### +# PART 5 Utils Configuaration # +####################################################################### +base_exp_dir = 'outputs/corebench/' +work_dir = osp.join(base_exp_dir, 'chat_objective') diff --git a/configs/eval_corebench_2409_subjective.py b/configs/eval_corebench_2409_subjective.py new file mode 100644 index 000000000..c0623c804 --- /dev/null +++ b/configs/eval_corebench_2409_subjective.py @@ -0,0 +1,134 @@ +import os.path as osp +from copy import deepcopy + +from mmengine.config import read_base +from opencompass.models import (HuggingFacewithChatTemplate, + TurboMindModelwithChatTemplate) +from opencompass.models.openai_api import OpenAI, OpenAISDK +from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner +from opencompass.partitioners.sub_naive 
import SubjectiveNaivePartitioner +from opencompass.runners import DLCRunner, LocalRunner +from opencompass.summarizers import SubjectiveSummarizer +from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask +from opencompass.tasks.subjective_eval import SubjectiveEvalTask + + +####################################################################### +# PART 0 Essential Configs # +####################################################################### +with read_base(): + # Datasets Part + from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import \ + arenahard_datasets + from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm import \ + alignbench_datasets + from opencompass.configs.datasets.subjective.multiround.mtbench_single_judge_diff_temp import \ + mtbench_datasets + + # Summarizer + + # Model List + # from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model + # from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model + + +####################################################################### +# PART 1 Datasets List # +####################################################################### +# datasets list for evaluation + +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + + +####################################################################### +# PART 2 Datset Summarizer # +####################################################################### +summarizer = dict(type=SubjectiveSummarizer, function='subjective') + +####################################################################### +# PART 3 Models List # +####################################################################### + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='internlm2_5-7b-chat-turbomind', + path='internlm/internlm2_5-7b-chat', + engine_config=dict(session_len=16384, max_batch_size=16, tp=1), + gen_config=dict(top_k=40, temperature=1.0, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] + +models = sum([v for k, v in locals().items() if k.endswith('_model')], models) + + + +####################################################################### +# PART 4 Inference/Evaluation Configuaration # +####################################################################### + +# Local Runner +infer = dict( + partitioner=dict( + type=NumWorkerPartitioner, + num_worker=8 + ), + runner=dict( + type=LocalRunner, + max_num_workers=16, + retry=0, # Modify if needed + task=dict(type=OpenICLInferTask) + ), +) + +# JudgeLLM +api_meta_template = dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), +]) + + +judge_models = [ + dict( + type=OpenAISDK, + abbr='gpt-4o-2024-08-06', + path='gpt-4o-2024-08-06', + # openai_api_base= + # 'http://10.140.1.86:10001/v1', # Change to your own url if needed. 
+ key='YOUR_API_KEY', + retry=10, + meta_template=api_meta_template, + rpm_verbose=True, + query_per_second=1, + max_out_len=4096, + max_seq_len=16384, + batch_size=16, + temperature=0.01, + tokenizer_path='gpt-4o-2024-08-06' + ) +] + +# Evaluation with local runner +eval = dict( + partitioner=dict( + type=SubjectiveNaivePartitioner, + models=models, + judge_models=judge_models, + ), + runner=dict( + type=LocalRunner, + max_num_workers=16, + task=dict(type=SubjectiveEvalTask)), +) + + + +####################################################################### +# PART 5 Utils Configuaration # +####################################################################### +base_exp_dir = 'outputs/corebench/' +work_dir = osp.join(base_exp_dir, 'chat_subjective') From c3fb9065db5f7ec9b7a9b5d7ea8f834ce75a0b1c Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Wed, 25 Sep 2024 11:53:48 +0800 Subject: [PATCH 04/20] [Feature] Add dlc sleep time (#1562) --- opencompass/runners/dlc.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/opencompass/runners/dlc.py b/opencompass/runners/dlc.py index 40453ed08..224ef4300 100644 --- a/opencompass/runners/dlc.py +++ b/opencompass/runners/dlc.py @@ -232,6 +232,8 @@ def _run_within_retry(): while True: # 1. Avoid to request dlc too frequently. # 2. DLC job may not be ready immediately after creation. + dlc_sleep_time = self.aliyun_cfg.get('dlc_sleep_time', 10) + time.sleep(dlc_sleep_time) num_retry = 60 for retry_index in range(num_retry): time.sleep(2) From 87df8a73a3b2290fd0bb07c10b9acc461206cefd Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Wed, 25 Sep 2024 13:40:47 +0800 Subject: [PATCH 05/20] [CI] add a common summarizer for qabench summarizer (#1545) * update * update * update --------- Co-authored-by: zhulin1 --- .../summarizers/subjective/__init__.py | 1 + .../subjective/common_summarizer.py | 146 ++++++++++++++++++ 2 files changed, 147 insertions(+) create mode 100644 opencompass/summarizers/subjective/common_summarizer.py diff --git a/opencompass/summarizers/subjective/__init__.py b/opencompass/summarizers/subjective/__init__.py index 6565d5c89..ea2367c0b 100644 --- a/opencompass/summarizers/subjective/__init__.py +++ b/opencompass/summarizers/subjective/__init__.py @@ -4,6 +4,7 @@ from .alpacaeval import AlpacaSummarizer from .arenahard import ArenaHardSummarizer from .charm import CharmMemSummarizer +from .common_summarizer import CommonSummarizer from .compass_arena import CompassArenaSummarizer from .compassbench import CompassBenchSummarizer from .corev2 import Corev2Summarizer diff --git a/opencompass/summarizers/subjective/common_summarizer.py b/opencompass/summarizers/subjective/common_summarizer.py new file mode 100644 index 000000000..4793a91f1 --- /dev/null +++ b/opencompass/summarizers/subjective/common_summarizer.py @@ -0,0 +1,146 @@ +# flake8: noqa +# yapf: disable +import csv +import os +import os.path as osp +import re +from collections import defaultdict +from datetime import datetime + +import numpy as np +from mmengine import ConfigDict +from tabulate import tabulate + +from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg + +from .compass_arena import CompassArenaSummarizer +from .utils import get_judgeanswer_and_reference, get_outdir + + +def model_abbr_from_cfg_used_in_summarizer(model): + if model.get('summarizer_abbr', None): + return model['summarizer_abbr'] + else: + return model_abbr_from_cfg(model) + +def post_process_single_rate(judgement: str): + 
"""Input a string like below: + + xxx[[5]]xxx, and extract the score + """ + pattern = r'Rating:\s*\[\[([\d.]+)\]\]' + matched_result = re.findall(pattern, judgement) + if matched_result: + score = float(matched_result[0]) + else: + return None + return {'score': score} + + +def get_capability_results( + judged_answers, + references, + fout, + fout_flag, + model_abbr, + judge_model_abbr, + dataset_abbr, +): + capability_ratings = defaultdict(int) + capability_counts = defaultdict(int) + for ans, ref in zip(judged_answers, references): + capability_ratings['total'] += ans['score'] + capability_counts['total'] += 1 + capability_ratings[ref['capability']] += ans['score'] + capability_counts[ref['capability']] += 1 + + capability_avg_ratings = defaultdict(float) + + for capability, total_score in capability_ratings.items(): + s = total_score / capability_counts[capability] + s = round(s, 2) + capability_avg_ratings[capability] = s + columns = list(capability_avg_ratings.keys()) + columns.insert(0, columns.pop(columns.index('total'))) + + if fout_flag == 0: + with open(fout, 'w', newline='') as csvfile: + writer = csv.writer(csvfile) + if fout_flag == 0: + writer.writerow(['model', 'judge_model', 'dataset'] + columns) + writer.writerow([model_abbr] + [judge_model_abbr] + [dataset_abbr] + [capability_avg_ratings[column] for column in columns]) + else: + with open(fout, 'a+', newline='') as csvfile: + writer = csv.writer(csvfile) + writer.writerow([model_abbr] + [judge_model_abbr] + [dataset_abbr] + [capability_avg_ratings[column] for column in columns]) + + +class CommonSummarizer(CompassArenaSummarizer): + """Do the subjectivity analyze based on evaluation results. + + Args: + config (ConfigDict): The configuration object of the evaluation task. + It's expected to be filled out at runtime. + """ + + def __init__(self, config: ConfigDict, judge_type='single_rate') -> None: + self.judge_type = judge_type + self.tasks = [] + self.cfg = config + self.judge_type = 'single_rate' + self.eval_model_cfgs = self.cfg['eval']['partitioner']['models'] + self.judge_model_cfgs = self.cfg['judge_models'] + self.judge_map = { + 'single_rate': post_process_single_rate + } + self.judge_function = self.judge_map[self.judge_type] + + def summarize(self, time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')): + """Summarize the subjectivity analysis based on evaluation results. + + Args: + time_str (str): Timestamp for file naming. + + Returns: + pd.DataFrame: The summary results. 
+ """ + if self.judge_type == 'pair': + return super().summarize() + + # self.judge_type == 'single' + dataset_cfgs = self.cfg['datasets'] + output_dir, results_folder = get_outdir(self.cfg, time_str) + fout_flag = 0 + output_tmp_file = osp.join(output_dir, 'result.csv') + output_file = osp.join(output_dir, 'total_result.csv') + for eval_model_cfg in self.eval_model_cfgs: + for judge_model_cfg in self.judge_model_cfgs: + eval_model_abbr = model_abbr_from_cfg(eval_model_cfg) + show_model_abbr = model_abbr_from_cfg_used_in_summarizer(eval_model_cfg) + show_judge_model_abbr = model_abbr_from_cfg_used_in_summarizer(judge_model_cfg) + judge_abbr = model_abbr_from_cfg(judge_model_cfg) + subdir_path = os.path.join(results_folder, eval_model_abbr + '_judged-by--' + judge_abbr) + if os.path.isdir(subdir_path): + for dataset in dataset_cfgs: + judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function) + show_dataset_abbr = dataset_abbr_from_cfg(dataset) + + get_capability_results(judged_answers, references, output_tmp_file, fout_flag, show_model_abbr, show_judge_model_abbr, show_dataset_abbr) + fout_flag += 1 + else: + print(subdir_path + ' is not exist! please check!') + with open(output_tmp_file, 'r') as f: + csv_reader = csv.reader(f) + header = next(csv_reader) + table = [line for line in csv_reader] + + new_header = [''] + [line[0] for line in table] + new_table = [[h] + line[1:] for h, line in zip(header[1:], table)] + new_table = [[h] + [line[i] for line in table] for i, h in enumerate(header[1:], start=1)] + t = tabulate(new_table, headers=new_header) + with open(output_file, 'a') as f: + f.write(','.join(new_header) + '\n') + for line in new_table: + f.write(','.join(map(str, line)) + '\n') + print(t) + print(output_file) From aa43eaf267199a1f91de3e10afdeada339d0e05d Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Wed, 25 Sep 2024 17:07:27 +0800 Subject: [PATCH 06/20] [CI] add more models into testcase and test env of cu12 (#1558) * update * update * Update pr-run-test.yml * update * update * update * update * Update daily-run-test.yml * update * updaste * update * update * update * Update daily-run-test.yml * update * update * Update daily-run-test.yml * Update daily-run-test.yml * update * update * update * update * update * Update daily-run-test.yml * update --------- Co-authored-by: zhulin1 --- .github/scripts/eval_regression_base.py | 26 ++++-- .github/scripts/eval_regression_chat.py | 34 +++++-- .github/scripts/oc_score_assert.py | 43 +++++---- .github/scripts/oc_score_baseline.yaml | 114 +++++++++++++++++++++++- .github/workflows/daily-run-test.yml | 75 ++++++++++------ .github/workflows/pr-run-test.yml | 2 +- 6 files changed, 235 insertions(+), 59 deletions(-) diff --git a/.github/scripts/eval_regression_base.py b/.github/scripts/eval_regression_base.py index 8b4c64468..12339ecfa 100644 --- a/.github/scripts/eval_regression_base.py +++ b/.github/scripts/eval_regression_base.py @@ -8,15 +8,17 @@ race_datasets # noqa: F401, E501 from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \ models as hf_deepseek_moe_16b_base_model # noqa: F401, E501 + from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \ + models as hf_deepseek_v2_lite_model # noqa: F401, E501 # read hf models - chat models from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \ models as lmdeploy_deepseek_7b_base_model # noqa: F401, E501 from 
opencompass.configs.models.deepseek.vllm_deepseek_moe_16b_base import \ models as vllm_deepseek_moe_16b_base_model # noqa: F401, E501 - from opencompass.configs.models.gemma.hf_gemma_2b import \ - models as hf_gemma_2b_model # noqa: F401, E501 - from opencompass.configs.models.gemma.hf_gemma_7b import \ - models as hf_gemma_7b_model # noqa: F401, E501 + from opencompass.configs.models.gemma.hf_gemma2_2b import \ + models as hf_gemma2_2b_model # noqa: F401, E501 + from opencompass.configs.models.gemma.hf_gemma2_9b import \ + models as hf_gemma2_9b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \ models as hf_internlm2_5_7b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \ @@ -31,16 +33,28 @@ models as lmdeploy_internlm2_7b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \ models as lmdeploy_internlm2_base_7b_model # noqa: F401, E501 + from opencompass.configs.models.hf_llama.hf_llama2_7b import \ + models as hf_llama2_7b_model # noqa: F401, E501 + from opencompass.configs.models.hf_llama.hf_llama3_8b import \ + models as hf_llama3_8b_model # noqa: F401, E501 + from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \ + models as lmdeploy_llama3_1_8b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \ models as lmdeploy_llama3_8b_model # noqa: F401, E501 - from opencompass.configs.models.mistral.hf_mistral_7b_v0_2 import \ - models as hf_mistral_7b_v0_2_model # noqa: F401, E501 + from opencompass.configs.models.mistral.hf_mistral_7b_v0_3 import \ + models as hf_mistral_7b_v0_3_model # noqa: F401, E501 from opencompass.configs.models.mistral.vllm_mistral_7b_v0_2 import \ models as vllm_mistral_7b_v0_2_model # noqa: F401, E501 + from opencompass.configs.models.mistral.vllm_mixtral_8x7b_v0_1 import \ + models as vllm_mixtral_8x7b_v0_1_model # noqa: F401, E501 from opencompass.configs.models.qwen.hf_qwen1_5_moe_a2_7b import \ models as hf_qwen1_5_moe_a2_7b_model # noqa: F401, E501 from opencompass.configs.models.qwen.hf_qwen2_0_5b import \ models as hf_qwen2_0_5b_model # noqa: F401, E501 + from opencompass.configs.models.qwen.hf_qwen2_1_5b import \ + models as hf_qwen2_1_5b_model # noqa: F401, E501 + from opencompass.configs.models.qwen.hf_qwen2_7b import \ + models as hf_qwen2_7b_model # noqa: F401, E501 from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b import \ models as lmdeploy_qwen2_1_5b_model # noqa: F401, E501 from opencompass.configs.models.qwen.lmdeploy_qwen2_7b import \ diff --git a/.github/scripts/eval_regression_chat.py b/.github/scripts/eval_regression_chat.py index 1ee28e630..fa28562f4 100644 --- a/.github/scripts/eval_regression_chat.py +++ b/.github/scripts/eval_regression_chat.py @@ -13,20 +13,32 @@ models as hf_baichuan2_7b_chat_model # noqa: F401, E501 from opencompass.configs.models.chatglm.hf_glm4_9b_chat import \ models as hf_glm4_9b_chat_model # noqa: F401, E501 + from opencompass.configs.models.chatglm.lmdeploy_glm4_9b_chat import \ + models as lmdeploy_glm4_9b_chat_model # noqa: F401, E501 + from opencompass.configs.models.chatglm.vllm_glm4_9b_chat import \ + models as vllm_glm4_9b_chat_model # noqa: F401, E501 from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import \ models as hf_deepseek_7b_chat_model # noqa: F401, E501 from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_chat import \ models as hf_deepseek_moe_16b_chat_model # noqa: F401, E501 
+ from opencompass.configs.models.deepseek.hf_deepseek_v2_lite_chat import \ + models as hf_deepseek_v2_lite_chat_model # noqa: F401, E501 from opencompass.configs.models.deepseek.vllm_deepseek_7b_chat import \ models as vllm_deepseek_7b_chat_model # noqa: F401, E501 - from opencompass.configs.models.gemma.hf_gemma_2b_it import \ - models as hf_gemma_2b_it_model # noqa: F401, E501 - from opencompass.configs.models.gemma.hf_gemma_7b_it import \ - models as hf_gemma_7b_it_model # noqa: F401, E501 + from opencompass.configs.models.gemma.hf_gemma2_2b_it import \ + models as hf_gemma2_2b_it_model # noqa: F401, E501 + from opencompass.configs.models.gemma.hf_gemma2_9b_it import \ + models as hf_gemma2_9b_it_model # noqa: F401, E501 + from opencompass.configs.models.gemma.vllm_gemma_7b_it import \ + models as vllm_gemma_7b_it_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \ models as hf_internlm2_5_7b_chat_model # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.hf_internlm2_5_20b_chat import \ + models as hf_internlm2_5_20b_chat_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \ models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import \ + models as lmdeploy_internlm2_5_20b_chat_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import \ models as lmdeploy_internlm2_chat_1_8b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b_sft import \ @@ -37,14 +49,20 @@ models as lmdeploy_internlm2_chat_7b_sft_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.vllm_internlm2_chat_7b import \ models as vllm_internlm2_chat_7b_model # noqa: F401, E501 + from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import \ + models as hf_llama3_1_8b_instruct_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \ models as hf_llama3_8b_instruct_model # noqa: F401, E501 + from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \ + models as lmdeploy_llama3_1_8b_instruct_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \ models as lmdeploy_llama3_8b_instruct_model # noqa: F401, E501 - from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \ - models as hf_mistral_7b_instruct_v0_2_model # noqa: F401, E501 + from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_3 import \ + models as hf_mistral_7b_instruct_v0_3_model # noqa: F401, E501 from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \ models as vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501 + from opencompass.configs.models.mistral.vllm_mixtral_8x7b_instruct_v0_1 import \ + models as vllm_mixtral_8x7b_instruct_v0_1_model # noqa: F401, E501 from opencompass.configs.models.openbmb.hf_minicpm_2b_dpo_fp32 import \ models as hf_minicpm_2b_dpo_fp32_model # noqa: F401, E501 from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import \ @@ -57,6 +75,10 @@ models as hf_phi_3_mini_8k_instruct_model # noqa: F401, E501 from opencompass.configs.models.qwen.hf_qwen1_5_0_5b_chat import \ models as hf_qwen1_5_0_5b_chat_model # noqa: F401, E501 + from opencompass.configs.models.qwen.hf_qwen2_1_5b_instruct import \ + models as hf_qwen2_1_5b_instruct_model # noqa: F401, 
E501 + from opencompass.configs.models.qwen.hf_qwen2_7b_instruct import \ + models as hf_qwen2_7b_instruct_model # noqa: F401, E501 from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import \ models as lmdeploy_qwen2_1_5b_instruct_model # noqa: F401, E501 from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \ diff --git a/.github/scripts/oc_score_assert.py b/.github/scripts/oc_score_assert.py index f869b157b..6f2c0a11a 100644 --- a/.github/scripts/oc_score_assert.py +++ b/.github/scripts/oc_score_assert.py @@ -7,30 +7,35 @@ output_path = 'regression_result_daily' chat_model_list = [ - 'baichuan2-7b-chat-hf', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf', - 'deepseek-7b-chat-vllm', 'gemma-2b-it-hf', 'gemma-7b-it-hf', - 'internlm2_5-7b-chat-hf', 'internlm2_5-7b-chat-turbomind', - 'internlm2-chat-1.8b-turbomind', 'internlm2-chat-1.8b-sft-turbomind', - 'internlm2-chat-7b-turbomind', 'internlm2-chat-7b-sft-turbomind', - 'internlm2-chat-7b-vllm', 'llama-3-8b-instruct-hf', - 'llama-3-8b-instruct-turbomind', 'mistral-7b-instruct-v0.2-hf', - 'mistral-7b-instruct-v0.2-vllm', 'minicpm-2b-dpo-fp32-hf', - 'minicpm-2b-sft-bf16-hf', 'minicpm-2b-sft-fp32-hf', - 'phi-3-mini-4k-instruct-hf', 'qwen1.5-0.5b-chat-hf', + 'baichuan2-7b-chat-hf', 'glm-4-9b-chat-turbomind', 'glm-4-9b-chat-vllm', + 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf', + 'deepseek-v2-lite-chat-hf', 'deepseek-7b-chat-vllm', 'gemma2-2b-it-hf', + 'gemma2-9b-it-hf', 'gemma-7b-it-vllm', 'internlm2_5-7b-chat-hf', + 'internlm2_5-20b-chat-hf', 'internlm2_5-7b-chat-turbomind', + 'internlm2_5-20b-chat-turbomind', 'internlm2-chat-1.8b-turbomind', + 'internlm2-chat-1.8b-sft-turbomind', 'internlm2-chat-7b-turbomind', + 'internlm2-chat-7b-sft-turbomind', 'internlm2-chat-7b-vllm', + 'llama-3_1-8b-instruct-hf', 'llama-3-8b-instruct-hf', + 'llama-3_1-8b-instruct-turbomind', 'llama-3-8b-instruct-turbomind', + 'mistral-7b-instruct-v0.3-hf', 'mistral-7b-instruct-v0.2-vllm', + 'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf', + 'minicpm-2b-sft-fp32-hf', 'phi-3-mini-4k-instruct-hf', + 'qwen1.5-0.5b-chat-hf', 'qwen2-1.5b-instruct-hf', 'qwen2-7b-instruct-hf', 'qwen2-1.5b-instruct-turbomind', 'qwen2-7b-instruct-turbomind', 'qwen1.5-0.5b-chat-vllm', 'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf', 'lmdeploy-api-test' ] base_model_list = [ - 'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind', - 'deepseek-moe-16b-base-vllm', 'gemma-2b-hf', 'gemma-7b-hf', - 'internlm2_5-7b-hf', 'internlm2-7b-hf', 'internlm2-base-7b-hf', - 'internlm2_5-7b-turbomind', 'internlm2-1.8b-turbomind', - 'internlm2-7b-turbomind', 'internlm2-base-7b-hf', - 'internlm2-base-7b-turbomind', 'llama-3-8b-turbomind', - 'mistral-7b-v0.2-hf', 'mistral-7b-v0.2-vllm', 'qwen1.5-moe-a2.7b-hf', - 'qwen2-0.5b-hf', 'qwen2-1.5b-turbomind', 'qwen2-7b-turbomind', - 'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf' + 'deepseek-moe-16b-base-hf', 'deepseek-v2-lite-hf', + 'deepseek-7b-base-turbomind', 'deepseek-moe-16b-base-vllm', 'gemma2-2b-hf', + 'gemma2-9b-hf', 'internlm2_5-7b-hf', 'internlm2-7b-hf', + 'internlm2-base-7b-hf', 'internlm2-1.8b-turbomind', + 'internlm2_5-7b-turbomind', 'internlm2-7b-turbomind', + 'internlm2-base-7b-turbomind', 'llama-2-7b-hf', 'llama-3-8b-hf', + 'llama-3.1-8b-turbomind', 'llama-3-8b-turbomind', 'mistral-7b-v0.3-hf', + 'mistral-7b-v0.2-vllm', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf', + 'qwen2-1.5b-hf', 'qwen2-7b-hf', 'qwen2-1.5b-turbomind', + 'qwen2-7b-turbomind', 'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf' ] dataset_list = ['gsm8k', 
'race-middle', 'race-high'] diff --git a/.github/scripts/oc_score_baseline.yaml b/.github/scripts/oc_score_baseline.yaml index d7e765be2..9690aa2c5 100644 --- a/.github/scripts/oc_score_baseline.yaml +++ b/.github/scripts/oc_score_baseline.yaml @@ -8,6 +8,16 @@ glm-4-9b-chat-hf: race-middle: 88 race-high: 88 +glm-4-9b-chat-turbomind: + gsm8k: 69 + race-middle: 82 + race-high: 77 + +glm-4-9b-chat-vllm: + gsm8k: 73 + race-middle: 87 + race-high: 87 + deepseek-7b-chat-hf: gsm8k: 60 race-middle: 74 @@ -18,6 +28,11 @@ deepseek-moe-16b-chat-hf: race-middle: 62 race-high: 70 +deepseek-v2-lite-chat-hf: + gsm8k: 59 + race-middle: 82 + race-high: 79 + deepseek-7b-chat-vllm: gsm8k: 63 race-middle: 74 @@ -33,23 +48,48 @@ gemma-7b-it-hf: race-middle: 74 race-high: 71 +gemma-7b-it-vllm: + gsm8k: 38 + race-middle: 75 + race-high: 70 + +gemma2-2b-it-hf: + gsm8k: 62 + race-middle: 75 + race-high: 67 + +gemma2-9b-it-hf: + gsm8k: 80 + race-middle: 89 + race-high: 85 + internlm2_5-7b-chat-hf: gsm8k: 86 race-middle: 92 race-high: 93 +internlm2_5-20b-chat-hf: + gsm8k: 91 + race-middle: 95 + race-high: 91 + internlm2_5-7b-chat-turbomind: gsm8k: 87 race-middle: 92 race-high: 93 +internlm2_5-20b-chat-turbomind: + gsm8k: 91 + race-middle: 95 + race-high: 91 + internlm2-chat-1.8b-turbomind: gsm8k: 40 race-middle: 82 race-high: 83 internlm2-chat-1.8b-sft-turbomind: - gsm8k: 32 + gsm8k: 34 race-middle: 81 race-high: 83 @@ -68,11 +108,21 @@ internlm2-chat-7b-vllm: race-middle: 90 race-high: 91 +llama-3_1-8b-instruct-hf: + gsm8k: 82 + race-middle: 82 + race-high: 88 + llama-3-8b-instruct-hf: gsm8k: 77 race-middle: 85 race-high: 87 +llama-3_1-8b-instruct-turbomind: + gsm8k: 79 + race-middle: 82 + race-high: 88 + llama-3-8b-instruct-turbomind: gsm8k: 77 race-middle: 85 @@ -83,6 +133,11 @@ mistral-7b-instruct-v0.2-hf: race-middle: 82 race-high: 78 +mistral-7b-instruct-v0.3-hf: + gsm8k: 53 + race-middle: 80 + race-high: 78 + mistral-7b-instruct-v0.2-vllm: gsm8k: 49 race-middle: 81 @@ -118,6 +173,11 @@ qwen1.5-0.5b-chat-hf: race-middle: 55 race-high: 50 +qwen2-1.5b-instruct-hf: + gsm8k: 63 + race-middle: 77 + race-high: 86 + qwen2-1.5b-instruct-turbomind: gsm8k: 60 race-middle: 77 @@ -128,6 +188,11 @@ qwen2-7b-instruct-turbomind: race-middle: 87 race-high: 89 +qwen2-7b-instruct-hf: + gsm8k: 85 + race-middle: 87 + race-high: 91 + qwen1.5-0.5b-chat-vllm: gsm8k: 5 race-middle: 57 @@ -153,6 +218,11 @@ deepseek-moe-16b-base-hf: race-middle: 35 race-high: 23 +deepseek-v2-lite-hf: + gsm8k: 37 + race-middle: 56 + race-high: 62 + deepseek-7b-base-turbomind: gsm8k: 21 race-middle: 42 @@ -173,8 +243,18 @@ gemma-7b-hf: race-middle: 59 race-high: 66 +gemma2-2b-hf: + gsm8k: 8 + race-middle: 31 + race-high: 30 + +gemma2-9b-hf: + gsm8k: 20 + race-middle: 42 + race-high: 35 + internlm2_5-7b-hf: - gsm8k: 46 + gsm8k: 47 race-middle: 92 race-high: 91 @@ -208,6 +288,21 @@ internlm2-base-7b-turbomind: race-middle: 75 race-high: 81 +llama-2-7b-hf: + gsm8k: 17 + race-middle: 32 + race-high: 38 + +llama-3-8b-hf: + gsm8k: 48 + race-middle: 64 + race-high: 70 + +llama-3.1-8b-turbomind: + gsm8k: 57 + race-middle: 67 + race-high: 75 + llama-3-8b-turbomind: gsm8k: 52 race-middle: 63 @@ -218,6 +313,11 @@ mistral-7b-v0.2-hf: race-middle: 42 race-high: 60 +mistral-7b-v0.3-hf: + gsm8k: 43 + race-middle: 42 + race-high: 60 + mistral-7b-v0.2-vllm: gsm8k: 45 race-middle: 42 @@ -228,11 +328,21 @@ qwen1.5-moe-a2.7b-hf: race-middle: 78 race-high: 90 +qwen2-1.5b-hf: + gsm8k: 58 + race-middle: 65 + race-high: 78 + qwen2-0.5b-hf: gsm8k: 35 race-middle: 52 race-high: 
48 +qwen2-7b-hf: + gsm8k: 82 + race-middle: 88 + race-high: 89 + qwen2-1.5b-turbomind: gsm8k: 57 race-middle: 64 diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml index 7d7affafc..894b149e0 100644 --- a/.github/workflows/daily-run-test.yml +++ b/.github/workflows/daily-run-test.yml @@ -14,9 +14,14 @@ env: PIP_CACHE_PATH: /cpfs01/user/qa-llm-cicd/.cache/pip USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub + HUGGINGFACE_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub + HF_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub DATEASET_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/llm-evaluation-datasets HF_DATASETS_OFFLINE: 1 + HF_EVALUATE_OFFLINE: 1 TRANSFORMERS_OFFLINE: 1 + VLLM_USE_MODELSCOPE: false + LMDEPLOY_USE_MODELSCOPE: false HF_HUB_OFFLINE: 1 TRITON_PTXAS_PATH: /usr/local/cuda/bin/ptxas @@ -43,7 +48,11 @@ jobs: daily_run_test: needs: build-pypi - runs-on: self-hosted + strategy: + fail-fast: false + matrix: + cuda_env: [dsw_cu11, dsw_cu12] + runs-on: ${{ matrix.cuda_env }} environment: 'prod' timeout-minutes: 420 #7hours steps: @@ -53,22 +62,38 @@ jobs: uses: actions/download-artifact@v4 with: name: my-artifact-${{ github.run_id }} - - name: Prepare - create conda env and install torch + - name: Prepare - create conda env and install torch - cu11 + if: ${{matrix.cuda_env == 'dsw_cu11'}} run: | . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda create -y --name ${{env.CONDA_ENV}} python=3.10 - conda activate ${{env.CONDA_ENV}} - pip install opencompass*.whl - pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.5.0+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} - pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.5.5+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} - - pip install human_eval transformers protobuf pytest gguf msgspec librosa vllm_flash_attn bitsandbytes --cache-dir ${{env.PIP_CACHE_PATH}} + conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10 + conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} + pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}} + pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.6.0+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} + pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.6.1.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} + pip install human_eval transformers protobuf pytest gguf msgspec librosa vllm_flash_attn bitsandbytes modelscope --cache-dir ${{env.PIP_CACHE_PATH}} pip uninstall torch torchvision torchaudio -y pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118 FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} conda info --envs pip list + - name: Prepare - create conda env and install torch - cu12 + if: ${{matrix.cuda_env == 'dsw_cu12'}} + run: | + . 
/cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate + conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10 + conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} + pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}} + pip install lmdeploy==0.6.0 --cache-dir ${{env.PIP_CACHE_PATH}} --no-cache-dir + pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}} + pip install human_eval transformers protobuf pytest gguf msgspec librosa vllm_flash_attn bitsandbytes modelscope --cache-dir ${{env.PIP_CACHE_PATH}} + pip uninstall torch torchvision torchaudio -y + pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} + FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl + pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} + conda info --envs + pip list - name: Prepare - prepare data and hf model run: | ln -s ${{env.DATEASET_CACHE_PATH}} data @@ -77,45 +102,45 @@ jobs: - name: Run chat model test run: | . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda activate ${{env.CONDA_ENV}} + conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} conda info --envs sed -i 's/judgemodel/'$(tail -n 1 /cpfs01/shared/public/llmeval/share_info/compassjuder_ip.txt)'/g' .github/scripts/eval_regression_chat.py - python3 run.py .github/scripts/eval_regression_chat.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat --reuse - rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat/*/summary regression_result_daily + opencompass .github/scripts/eval_regression_chat.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat_${{ matrix.cuda_env }} --reuse --max-num-workers 2 + rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat_${{ matrix.cuda_env }}/*/summary regression_result_daily python -m pytest -m chat -s -v --color=yes .github/scripts/oc_score_assert.py - name: Run base model test run: | . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda activate ${{env.CONDA_ENV}} + conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} conda info --envs - python3 run.py .github/scripts/eval_regression_base.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base --reuse - rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base/*/summary regression_result_daily + opencompass .github/scripts/eval_regression_base.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base_${{ matrix.cuda_env }} --reuse --max-num-workers 2 + rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base_${{ matrix.cuda_env }}/*/summary regression_result_daily python -m pytest -m base -s -v --color=yes .github/scripts/oc_score_assert.py - name: Run command testcase run: | . 
/cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda activate ${{env.CONDA_ENV}} + conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} conda info --envs export from_tf=TRUE python tools/list_configs.py internlm2_5 mmlu - python run.py --models hf_internlm2_5_7b --datasets race_ppl --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1 --reuse - rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1/*/summary regression_result_daily + opencompass --models hf_internlm2_5_7b --datasets race_ppl --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }} --reuse --max-num-workers 2 + rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }}/*/summary regression_result_daily python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py - python run.py --models hf_internlm2_5_7b_chat --datasets race_gen -a lmdeploy --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2 --reuse - rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2/*/summary regression_result_daily + opencompass --models hf_internlm2_5_7b_chat hf_internlm2_5_1_8b_chat --datasets race_gen -a lmdeploy --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 + rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }}/*/summary regression_result_daily python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py - python run.py --datasets race_ppl --hf-type base --hf-path internlm/internlm2_5-7b --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3 --reuse - rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3/*/summary regression_result_daily + opencompass --datasets race_ppl --hf-type base --hf-path internlm/internlm2_5-7b --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3 --reuse --max-num-workers 2 + rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }}/*/summary regression_result_daily python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py - python run.py --datasets race_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4 --reuse - rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4/*/summary regression_result_daily + opencompass --datasets race_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 + rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }}/*/summary regression_result_daily python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py - name: Remove Conda Env if: always() run: | rm -rf regression_result_daily . 
/cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda env remove -y --name ${{env.CONDA_ENV}} + conda env remove -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} conda info --envs notify_to_feishu: diff --git a/.github/workflows/pr-run-test.yml b/.github/workflows/pr-run-test.yml index 6cab13786..d9fcdc3ae 100644 --- a/.github/workflows/pr-run-test.yml +++ b/.github/workflows/pr-run-test.yml @@ -51,7 +51,7 @@ jobs: conda activate ${{env.CONDA_ENV}} conda info --envs rm -rf regression_result - python3 run.py --models hf_internlm2_chat_7b --datasets siqa_gen --work-dir regression_result --debug + opencompass --models hf_internlm2_chat_7b --datasets siqa_gen --work-dir regression_result --debug - name: Get result run: | score=$(sed -n '$p' regression_result/*/summary/*.csv | awk -F ',' '{print $NF}') From 80cda1980e8725b713845675711f9e269025478d Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Wed, 25 Sep 2024 20:58:34 +0800 Subject: [PATCH 07/20] [BUG] fix followbench dataset config (#1564) * [BUG] fix followbench dataset config * [BUG] fix followbench dataset config --- .../datasets/subjective/followbench/followbench_llmeval.py | 4 ++-- .../datasets/subjective/followbench/followbench_llmeval.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/datasets/subjective/followbench/followbench_llmeval.py b/configs/datasets/subjective/followbench/followbench_llmeval.py index 0733340ed..e601bda34 100644 --- a/configs/datasets/subjective/followbench/followbench_llmeval.py +++ b/configs/datasets/subjective/followbench/followbench_llmeval.py @@ -15,7 +15,7 @@ ] data_path ='data/subjective/followbench/converted_data' -followbench_llmeval_dataset = [] +followbench_llmeval_datasets = [] for _name in subjective_all_sets: subjective_infer_cfg = dict( @@ -48,7 +48,7 @@ pred_role='BOT', ) - followbench_llmeval_dataset.append( + followbench_llmeval_datasets.append( dict( abbr=f'{_name}', type=FollowBenchDataset, diff --git a/opencompass/configs/datasets/subjective/followbench/followbench_llmeval.py b/opencompass/configs/datasets/subjective/followbench/followbench_llmeval.py index 0733340ed..e601bda34 100644 --- a/opencompass/configs/datasets/subjective/followbench/followbench_llmeval.py +++ b/opencompass/configs/datasets/subjective/followbench/followbench_llmeval.py @@ -15,7 +15,7 @@ ] data_path ='data/subjective/followbench/converted_data' -followbench_llmeval_dataset = [] +followbench_llmeval_datasets = [] for _name in subjective_all_sets: subjective_infer_cfg = dict( @@ -48,7 +48,7 @@ pred_role='BOT', ) - followbench_llmeval_dataset.append( + followbench_llmeval_datasets.append( dict( abbr=f'{_name}', type=FollowBenchDataset, From 3f833186dc8c757125420660041f30f664e7dbfc Mon Sep 17 00:00:00 2001 From: Yi Ding Date: Thu, 26 Sep 2024 16:49:52 +0800 Subject: [PATCH 08/20] [Feature] Support the reasoning from BaiLing LLM (#1541) * [Feature] Support the reasoning from BaiLing LLM This commit includes the access to BaiLing LLM and gets the reasoning. * Add the api example The example of evalute bailing api * Revise the generation arguments Based on current experiment, we update some generation arguments for better reasoning * [fix] set the batch size * Retry under flowcontrol of serverside * add dependent package into requirement.txt add dependent package retrying to clean up the pre-comment check. * correct the file names and make the file copy correct the file names. 
copy the files under configs to opencompass * fix the lint issue --------- Co-authored-by: christopher.dy --- configs/api_examples/eval_api_bailing.py | 38 ++++ .../models/bailing_api/bailing-lite-0830.py | 31 +++ .../models/bailing_api/bailing-pro-0920.py | 31 +++ .../models/bailing_api/bailing-lite-0830.py | 31 +++ .../models/bailing_api/bailing-pro-0920.py | 31 +++ opencompass/models/__init__.py | 4 +- opencompass/models/bailing_api_oc.py | 215 ++++++++++++++++++ requirements/runtime.txt | 1 + 8 files changed, 380 insertions(+), 2 deletions(-) create mode 100644 configs/api_examples/eval_api_bailing.py create mode 100644 configs/models/bailing_api/bailing-lite-0830.py create mode 100644 configs/models/bailing_api/bailing-pro-0920.py create mode 100644 opencompass/configs/models/bailing_api/bailing-lite-0830.py create mode 100644 opencompass/configs/models/bailing_api/bailing-pro-0920.py create mode 100644 opencompass/models/bailing_api_oc.py diff --git a/configs/api_examples/eval_api_bailing.py b/configs/api_examples/eval_api_bailing.py new file mode 100644 index 000000000..15101b09f --- /dev/null +++ b/configs/api_examples/eval_api_bailing.py @@ -0,0 +1,38 @@ +from mmengine.config import read_base + +from opencompass.models import BailingAPI +from opencompass.partitioners import NaivePartitioner +from opencompass.runners.local_api import LocalAPIRunner +from opencompass.tasks import OpenICLInferTask + +with read_base(): + from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets + from opencompass.configs.summarizers.medium import summarizer + +datasets = [ + *ceval_datasets, +] + +models = [ + dict( + path="Bailing-Lite-0830", + token="xxxxxx", # set your key here or in environment variable BAILING_API_KEY + url="https://bailingchat.alipay.com/chat/completions", + type=BailingAPI, + generation_kwargs={}, + query_per_second=1, + max_seq_len=4096, + ), +] + +infer = dict( + partitioner=dict(type=NaivePartitioner), + runner=dict( + type=LocalAPIRunner, + max_num_workers=2, + concurrent_users=2, + task=dict(type=OpenICLInferTask), + ), +) + +work_dir = "outputs/api_bailing/" diff --git a/configs/models/bailing_api/bailing-lite-0830.py b/configs/models/bailing_api/bailing-lite-0830.py new file mode 100644 index 000000000..1a43b4be1 --- /dev/null +++ b/configs/models/bailing_api/bailing-lite-0830.py @@ -0,0 +1,31 @@ +from opencompass.models import BailingAPI + +api_meta_template = dict( + round=[ + dict(role="HUMAN", api_role="HUMAN"), + dict(role="BOT", api_role="BOT", generate=False), + ], + reserved_roles=[dict(role="SYSTEM", api_role="SYSTEM")], +) + +models = [ + dict( + path="Bailing-Lite-0830", + token="", # set your key here or in environment variable BAILING_API_KEY + url="https://bailingchat.alipay.com/chat/completions", + type=BailingAPI, + meta_template=api_meta_template, + query_per_second=1, + max_seq_len=4096, + batch_size=1, + generation_kwargs={ + "temperature": 0.4, + "top_p": 1.0, + "top_k": -1, + "n": 1, + "logprobs": 1, + "use_beam_search": False, + }, + ), +] + diff --git a/configs/models/bailing_api/bailing-pro-0920.py b/configs/models/bailing_api/bailing-pro-0920.py new file mode 100644 index 000000000..35814bf79 --- /dev/null +++ b/configs/models/bailing_api/bailing-pro-0920.py @@ -0,0 +1,31 @@ +from opencompass.models import BailingAPI + +api_meta_template = dict( + round=[ + dict(role="HUMAN", api_role="HUMAN"), + dict(role="BOT", api_role="BOT", generate=False), + ], + reserved_roles=[dict(role="SYSTEM", api_role="SYSTEM")], +) + +models = [ + dict( 
+ path="Bailing-Pro-0920", + token="", # set your key here or in environment variable BAILING_API_KEY + url="https://bailingchat.alipay.com/chat/completions", + type=BailingAPI, + meta_template=api_meta_template, + query_per_second=1, + max_seq_len=4096, + batch_size=1, + generation_kwargs={ + "temperature": 0.4, + "top_p": 1.0, + "top_k": -1, + "n": 1, + "logprobs": 1, + "use_beam_search": False, + }, + ), +] + diff --git a/opencompass/configs/models/bailing_api/bailing-lite-0830.py b/opencompass/configs/models/bailing_api/bailing-lite-0830.py new file mode 100644 index 000000000..1a43b4be1 --- /dev/null +++ b/opencompass/configs/models/bailing_api/bailing-lite-0830.py @@ -0,0 +1,31 @@ +from opencompass.models import BailingAPI + +api_meta_template = dict( + round=[ + dict(role="HUMAN", api_role="HUMAN"), + dict(role="BOT", api_role="BOT", generate=False), + ], + reserved_roles=[dict(role="SYSTEM", api_role="SYSTEM")], +) + +models = [ + dict( + path="Bailing-Lite-0830", + token="", # set your key here or in environment variable BAILING_API_KEY + url="https://bailingchat.alipay.com/chat/completions", + type=BailingAPI, + meta_template=api_meta_template, + query_per_second=1, + max_seq_len=4096, + batch_size=1, + generation_kwargs={ + "temperature": 0.4, + "top_p": 1.0, + "top_k": -1, + "n": 1, + "logprobs": 1, + "use_beam_search": False, + }, + ), +] + diff --git a/opencompass/configs/models/bailing_api/bailing-pro-0920.py b/opencompass/configs/models/bailing_api/bailing-pro-0920.py new file mode 100644 index 000000000..35814bf79 --- /dev/null +++ b/opencompass/configs/models/bailing_api/bailing-pro-0920.py @@ -0,0 +1,31 @@ +from opencompass.models import BailingAPI + +api_meta_template = dict( + round=[ + dict(role="HUMAN", api_role="HUMAN"), + dict(role="BOT", api_role="BOT", generate=False), + ], + reserved_roles=[dict(role="SYSTEM", api_role="SYSTEM")], +) + +models = [ + dict( + path="Bailing-Pro-0920", + token="", # set your key here or in environment variable BAILING_API_KEY + url="https://bailingchat.alipay.com/chat/completions", + type=BailingAPI, + meta_template=api_meta_template, + query_per_second=1, + max_seq_len=4096, + batch_size=1, + generation_kwargs={ + "temperature": 0.4, + "top_p": 1.0, + "top_k": -1, + "n": 1, + "logprobs": 1, + "use_beam_search": False, + }, + ), +] + diff --git a/opencompass/models/__init__.py b/opencompass/models/__init__.py index 403eb5d6a..0beb963a1 100644 --- a/opencompass/models/__init__.py +++ b/opencompass/models/__init__.py @@ -3,6 +3,7 @@ from .alaya import AlayaLM # noqa: F401 from .baichuan_api import BaiChuan # noqa: F401 from .baidu_api import ERNIEBot # noqa: F401 +from .bailing_api_oc import BailingAPI # noqa: F401 from .base import BaseModel, LMTemplateParser # noqa: F401 from .base_api import APITemplateParser, BaseAPIModel # noqa: F401 from .bytedance_api import ByteDance # noqa: F401 @@ -41,8 +42,7 @@ from .stepfun_api import StepFun # noqa: F401 from .turbomind import TurboMindModel # noqa: F401 from .turbomind_tis import TurboMindTisModel # noqa: F401 -from .turbomind_with_tf_above_v4_33 import \ - TurboMindModelwithChatTemplate # noqa: F401 +from .turbomind_with_tf_above_v4_33 import TurboMindModelwithChatTemplate # noqa: F401 from .unigpt_api import UniGPT # noqa: F401 from .vllm import VLLM # noqa: F401 from .vllm_with_tf_above_v4_33 import VLLMwithChatTemplate # noqa: F401 diff --git a/opencompass/models/bailing_api_oc.py b/opencompass/models/bailing_api_oc.py new file mode 100644 index 000000000..6ff75e0d5 --- /dev/null +++ 
b/opencompass/models/bailing_api_oc.py @@ -0,0 +1,215 @@ +import concurrent +import concurrent.futures +import os +import socket +import traceback +from typing import Dict, List, Optional, Union + +import requests +from requests.adapters import HTTPAdapter +from retrying import retry +from urllib3.connection import HTTPConnection + +from opencompass.utils.prompt import PromptList + +from .base_api import BaseAPIModel + +PromptType = Union[PromptList, str] + + +class HTTPAdapterWithSocketOptions(HTTPAdapter): + def __init__(self, *args, **kwargs): + self._socket_options = HTTPConnection.default_socket_options + [ + (socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1), + (socket.SOL_TCP, socket.TCP_KEEPIDLE, 75), + (socket.SOL_TCP, socket.TCP_KEEPINTVL, 30), + (socket.SOL_TCP, socket.TCP_KEEPCNT, 120), + ] + super(HTTPAdapterWithSocketOptions, self).__init__(*args, **kwargs) + + def init_poolmanager(self, *args, **kwargs): + if self._socket_options is not None: + kwargs["socket_options"] = self._socket_options + super(HTTPAdapterWithSocketOptions, self).init_poolmanager(*args, **kwargs) + + +class BailingAPI(BaseAPIModel): + """Model wrapper around Bailing Service. + + Args: + ouput_key (str): key for prediction + query_per_second (int): The maximum queries allowed per second + between two consecutive calls of the API. Defaults to 1. + generation_kwargs: other params + retry (int): Number of retires if the API call fails. Defaults to 2. + """ + + def __init__( + self, + path: str, + token: str, + url: str, + meta_template: Optional[Dict] = None, + query_per_second: int = 1, + retry: int = 3, + generation_kwargs: Dict = {}, + max_seq_len=4096, + ): + super().__init__( + path=path, + max_seq_len=max_seq_len, + query_per_second=query_per_second, + meta_template=meta_template, + retry=retry, + generation_kwargs=generation_kwargs, + ) + + self.logger.info(f"Bailing API Model Init path: {path} url={url}") + if not token: + token = os.environ.get("BAILING_API_KEY") + if token: + self._headers = {"Authorization": f"Bearer {token}"} + else: + raise RuntimeError(f"There is not valid token.") + self._headers["Content-Type"] = "application/json" + self._url = url if url else "https://bailingchat.alipay.com/chat/completions" + self._model = path + self._sessions = [] + self._num = ( + int(os.environ.get("BAILING_API_PARALLEL_NUM")) + if os.environ.get("BAILING_API_PARALLEL_NUM") + else 1 + ) + try: + for _ in range(self._num): + adapter = HTTPAdapterWithSocketOptions() + sess = requests.Session() + sess.mount("http://", adapter) + sess.mount("https://", adapter) + self._sessions.append(sess) + except Exception as e: + self.logger.error(f"Fail to setup the session. {e}") + raise e + + def generate( + self, + inputs: Union[List[str], PromptList], + max_out_len: int = 4096, + ) -> List[str]: + """Generate results given a list of inputs. + + Args: + inputs (Union[List[str], PromptList]): A list of strings or PromptDicts. + The PromptDict should be organized in OpenCompass' API format. + max_out_len (int): The maximum length of the output. + + Returns: + List[str]: A list of generated strings. 
+ """ + with concurrent.futures.ThreadPoolExecutor( + max_workers=self._num, + ) as executor: + future_to_m = { + executor.submit( + self._generate, + self._sessions[i % self._num], + input, + max_out_len, + ): i + for i, input in enumerate(inputs) + } + results = [] + for future in concurrent.futures.as_completed(future_to_m): + m = future_to_m[future] + resp = future.result() + if resp and resp.status_code == 200: + try: + result = resp.json() + except: + results.append("") + else: + if ( + result.get("choices") + and result["choices"][0].get("message") + and result["choices"][0]["message"].get("content") + ): + results.append(result["choices"][0]["message"]["content"]) + else: + results.append("") + self.flush() + return results + + def _generate( + self, + sess, + input: Union[str, PromptList], + max_out_len: int, + ) -> str: + """Generate results given an input. + + Args: + inputs (str or PromptList): A string or PromptDict. + The PromptDict should be organized in OpenCompass' API format. + max_out_len (int): The maximum length of the output. + + Returns: + str: The generated string. + """ + if isinstance(input, str): + messages = [{"role": "user", "content": input}] + else: + messages = [] + for item in input: + content = item["prompt"] + if not content: + continue + message = {"content": content} + if item["role"] == "HUMAN": + message["role"] = "user" + elif item["role"] == "BOT": + message["role"] = "assistant" + elif item["role"] == "SYSTEM": + message["role"] = "system" + else: + message["role"] = item["role"] + messages.append(message) + request = { + "model": self._model, + "messages": messages, + "max_seq_len": max( + max_out_len if max_out_len else 4096, + self.max_seq_len if self.max_seq_len else 4096, + ), + } + request.update(self.generation_kwargs) + try: + retry_num = 0 + while retry_num < self.retry: + response = self._infer_result(request, sess) + if response.status_code == 200: + break # success + elif response.status_code == 426: + retry_num += 1 # retry + else: + raise ValueError(f"Status code = {response.status_code}") + else: + raise ValueError( + f"Exceed the maximal retry times. Last status code = {response.status_code}" + ) + except Exception as e: + self.logger.error( + f"Fail to inference request={request}; model_name={self.path}; error={e}, stack:{traceback.format_exc()}" + ) + raise e + return response + + @retry(stop_max_attempt_number=3, wait_fixed=16000) # ms + def _infer_result(self, request, sess): + response = sess.request( + "POST", + self._url, + json=request, + headers=self._headers, + timeout=500, + ) + return response diff --git a/requirements/runtime.txt b/requirements/runtime.txt index dc6389114..e7229e889 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -23,6 +23,7 @@ python-Levenshtein rank_bm25==0.2.2 rapidfuzz requests>=2.31.0 +retrying rich rouge -e git+https://github.com/Isaac-JL-Chen/rouge_chinese.git@master#egg=rouge_chinese From a7bacfdf7edeb5bea58345a91c9ba486a3195b68 Mon Sep 17 00:00:00 2001 From: Songyang Zhang Date: Thu, 26 Sep 2024 18:44:00 +0800 Subject: [PATCH 09/20] [Feature] Update CoreBench 2.0 (#1566) * [Feature] 1. Update CoreBench Base\n 2. Fix lint issue in BalingAPI * Update * Update --- configs/api_examples/eval_api_bailing.py | 8 +- configs/eval_corebench_2409_base_objective.py | 188 ++++++++++++++++++ ... 
=> eval_corebench_2409_chat_objective.py} | 26 ++- .../models/bailing_api/bailing-lite-0830.py | 25 ++- .../models/bailing_api/bailing-pro-0920.py | 25 ++- .../models/qwen2_5/lmdeploy_qwen2_5_1_5b.py | 15 ++ configs/models/qwen2_5/lmdeploy_qwen2_5_7b.py | 15 ++ .../models/bailing_api/bailing-lite-0830.py | 25 ++- .../models/bailing_api/bailing-pro-0920.py | 25 ++- .../models/qwen2_5/lmdeploy_qwen2_5_1_5b.py | 15 ++ .../models/qwen2_5/lmdeploy_qwen2_5_7b.py | 15 ++ opencompass/models/__init__.py | 3 +- opencompass/models/bailing_api_oc.py | 108 +++++----- 13 files changed, 379 insertions(+), 114 deletions(-) create mode 100644 configs/eval_corebench_2409_base_objective.py rename configs/{eval_corebench_2409_objective.py => eval_corebench_2409_chat_objective.py} (88%) create mode 100644 configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b.py create mode 100644 configs/models/qwen2_5/lmdeploy_qwen2_5_7b.py create mode 100644 opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b.py create mode 100644 opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_7b.py diff --git a/configs/api_examples/eval_api_bailing.py b/configs/api_examples/eval_api_bailing.py index 15101b09f..00640fb4f 100644 --- a/configs/api_examples/eval_api_bailing.py +++ b/configs/api_examples/eval_api_bailing.py @@ -15,9 +15,9 @@ models = [ dict( - path="Bailing-Lite-0830", - token="xxxxxx", # set your key here or in environment variable BAILING_API_KEY - url="https://bailingchat.alipay.com/chat/completions", + path='Bailing-Lite-0830', + token='xxxxxx', # set your key here or in environment variable BAILING_API_KEY + url='https://bailingchat.alipay.com/chat/completions', type=BailingAPI, generation_kwargs={}, query_per_second=1, @@ -35,4 +35,4 @@ ), ) -work_dir = "outputs/api_bailing/" +work_dir = 'outputs/api_bailing/' diff --git a/configs/eval_corebench_2409_base_objective.py b/configs/eval_corebench_2409_base_objective.py new file mode 100644 index 000000000..9c9043657 --- /dev/null +++ b/configs/eval_corebench_2409_base_objective.py @@ -0,0 +1,188 @@ +from mmengine.config import read_base +import os.path as osp +from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner +from opencompass.runners import LocalRunner +from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask + + +####################################################################### +# PART 0 Essential Configs # +####################################################################### +with read_base(): + # Datasets Part + ## Core Set + # ## Examination + from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import mmlu_datasets + from opencompass.configs.datasets.mmlu_pro.mmlu_pro_few_shot_gen_bfaf90 import \ + mmlu_pro_datasets + from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \ + cmmlu_datasets + # ## Reasoning + from opencompass.configs.datasets.bbh.bbh_gen_98fba6 import bbh_datasets + from opencompass.configs.datasets.hellaswag.hellaswag_10shot_ppl_59c85e import hellaswag_datasets + from opencompass.configs.datasets.drop.drop_gen_a2697c import drop_datasets + + # ## Math + from opencompass.configs.datasets.math.math_4shot_base_gen_43d5b6 import math_datasets + from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import gsm8k_datasets + from opencompass.configs.datasets.MathBench.mathbench_2024_few_shot_mixed_4a3fd4 import \ + mathbench_datasets + + # ## Scientific + from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_2c9cd6 import \ + gpqa_datasets + + # ## Coding + from 
opencompass.configs.datasets.humaneval.deprecated_humaneval_gen_d2537e import humaneval_datasets + from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_742f0c import sanitized_mbpp_datasets + # TODO: Add LiveCodeBench + + # ## Instruction Following + # from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets + + # Summarizer + from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups + from opencompass.configs.summarizers.groups.mmlu_pro import mmlu_pro_summary_groups + from opencompass.configs.summarizers.groups.cmmlu import cmmlu_summary_groups + from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups + from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \ + mathbench_2024_summary_groups + + # Model List + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b import models as lmdeploy_qwen2_5_1_5b_model + # from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model + # from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model + # from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import models as hf_minicpm_2b_sft_bf16_model + # from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as hf_yi_1_5_6b_chat_model + # from opencompass.configs.models.gemma.hf_gemma_2b_it import models as hf_gemma_2b_it_model + # from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as hf_yi_1_5_34b_chat_model + +####################################################################### +# PART 1 Datasets List # +####################################################################### +# datasets list for evaluation +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + + +####################################################################### +# PART 2 Datset Summarizer # +####################################################################### +# with read_base(): + +core_summary_groups = [ + { + 'name': 'core_average', + 'subsets': [ + ['mmlu', 'accuracy'], + ['mmlu_pro', 'accuracy'], + ['cmmlu', 'accuracy'], + ['bbh', 'naive_average'], + ['hellaswag', 'accuracy'], + ['drop', 'accuracy'], + ['math', 'accuracy'], + ['gsm8k', 'accuracy'], + ['mathbench-t (average)', 'naive_average'] + ['GPQA_diamond', 'accuracy'], + ['openai_humaneval', 'humaneval_pass@1'], + ['IFEval', 'Prompt-level-strict-accuracy'], + ['sanitized_mbpp', 'score'], + ['mathbench-t (average)', 'naive_average'] + ], + }, +] + +summarizer = dict( + dataset_abbrs=[ + ['mmlu', 'accuracy'], + ['mmlu_pro', 'accuracy'], + ['cmmlu', 'accuracy'], + ['bbh', 'naive_average'], + ['hellaswag', 'accuracy'], + ['drop', 'accuracy'], + ['math', 'accuracy'], + ['gsm8k', 'accuracy'], + ['mathbench-t (average)', 'naive_average'] + ['GPQA_diamond', 'accuracy'], + ['openai_humaneval', 'humaneval_pass@1'], + ['IFEval', 'Prompt-level-strict-accuracy'], + ['sanitized_mbpp', 'score'], + 'mathbench-a (average)', + 'mathbench-t (average)' + '', + ['mmlu', 'accuracy'], + ['mmlu-stem', 'accuracy'], + ['mmlu-social-science', 'accuracy'], + ['mmlu-humanities', 'accuracy'], + ['mmlu-other', 'accuracy'], + + '', + ['mmlu_pro', 'accuracy'], + ['mmlu_pro_math','accuracy'], + ['mmlu_pro_physics', 'accuracy'], + ['mmlu_pro_chemistry', 'accuracy'], + ['mmlu_pro_law', 'accuracy'], + ['mmlu_pro_engineering', 'accuracy'], + ['mmlu_pro_other', 'accuracy'], + ['mmlu_pro_economics', 'accuracy'], + ['mmlu_pro_health', 
'accuracy'], + ['mmlu_pro_psychology', 'accuracy'], + ['mmlu_pro_business', 'accuracy'], + ['mmlu_pro_biology', 'accuracy'], + ['mmlu_pro_philosophy', 'accuracy'], + ['mmlu_pro_computer_science','accuracy'], + ['mmlu_pro_history', 'accuracy'], + '', + ['cmmlu', 'accuracy'], + ['cmmlu-stem', 'accuracy'], + ['cmmlu-social-science', 'accuracy'], + ['cmmlu-humanities', 'accuracy'], + ['cmmlu-other', 'accuracy'], + ['cmmlu-china-specific', 'accuracy'], + + ], + summary_groups=sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], []), +) + + +####################################################################### +# PART 3 Models List # +####################################################################### + +models = sum([v for k, v in locals().items() if k.endswith('_model')], []) + + + +####################################################################### +# PART 4 Inference/Evaluation Configuaration # +####################################################################### + +# Local Runner +infer = dict( + partitioner=dict( + type=NumWorkerPartitioner, + num_worker=8 + ), + runner=dict( + type=LocalRunner, + max_num_workers=16, + retry=0, # Modify if needed + task=dict(type=OpenICLInferTask) + ), +) + +# eval with local runner +eval = dict( + partitioner=dict(type=NaivePartitioner, n=10), + runner=dict( + type=LocalRunner, + max_num_workers=16, + task=dict(type=OpenICLEvalTask)), +) + + +####################################################################### +# PART 5 Utils Configuaration # +####################################################################### +base_exp_dir = 'outputs/corebench_2409_objective/' +work_dir = osp.join(base_exp_dir, 'chat_objective') diff --git a/configs/eval_corebench_2409_objective.py b/configs/eval_corebench_2409_chat_objective.py similarity index 88% rename from configs/eval_corebench_2409_objective.py rename to configs/eval_corebench_2409_chat_objective.py index e14c52472..0b6735062 100644 --- a/configs/eval_corebench_2409_objective.py +++ b/configs/eval_corebench_2409_chat_objective.py @@ -18,20 +18,22 @@ # ## Reasoning from opencompass.configs.datasets.bbh.bbh_gen_4a31fa import bbh_datasets - # TODO: Add HellaSwag - # TODO: Add DROP + from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \ + hellaswag_datasets + from opencompass.configs.datasets.drop.drop_openai_simple_evals_gen_3857b0 import drop_datasets # ## Math from opencompass.configs.datasets.math.math_0shot_gen_393424 import math_datasets - # TODO: Add GSM8K - # TODO: Add MathBench + from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \ + gsm8k_datasets + from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import mathbench_datasets # ## Scientific from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets # ## Coding from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets - # TODO: Add MBPP + from opencompass.configs.datasets.mbpp.sanitized_mbpp_mdblock_gen_a447ff import sanitized_mbpp_datasets # TODO: Add LiveCodeBench # ## Instruction Following @@ -70,13 +72,17 @@ 'subsets': [ ['mmlu', 'accuracy'], ['mmlu_pro', 'accuracy'], - # ['cmmlu', 'naive_average'], ['cmmlu', 'accuracy'], ['bbh', 'score'], ['math', 'accuracy'], ['openai_humaneval', 'humaneval_pass@1'], ['GPQA_diamond', 'accuracy'], ['IFEval', 'Prompt-level-strict-accuracy'], + ['drop', 'accuracy'], + ['sanitized_mbpp', 'score'], + ['gsm8k', 'accuracy'], + ['hellaswag', 
'accuracy'], + ['mathbench-t (average)', 'naive_average'] ], }, ] @@ -92,6 +98,12 @@ ['openai_humaneval', 'humaneval_pass@1'], ['GPQA_diamond', 'accuracy'], ['IFEval', 'Prompt-level-strict-accuracy'], + ['drop', 'accuracy'], + ['sanitized_mbpp', 'score'], + ['gsm8k', 'accuracy'], + ['hellaswag', 'accuracy'], + 'mathbench-a (average)', + 'mathbench-t (average)' '', ['mmlu', 'accuracy'], @@ -204,5 +216,5 @@ ####################################################################### # PART 5 Utils Configuaration # ####################################################################### -base_exp_dir = 'outputs/corebench/' +base_exp_dir = 'outputs/corebench_2409_objective/' work_dir = osp.join(base_exp_dir, 'chat_objective') diff --git a/configs/models/bailing_api/bailing-lite-0830.py b/configs/models/bailing_api/bailing-lite-0830.py index 1a43b4be1..88053ce98 100644 --- a/configs/models/bailing_api/bailing-lite-0830.py +++ b/configs/models/bailing_api/bailing-lite-0830.py @@ -2,30 +2,29 @@ api_meta_template = dict( round=[ - dict(role="HUMAN", api_role="HUMAN"), - dict(role="BOT", api_role="BOT", generate=False), + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=False), ], - reserved_roles=[dict(role="SYSTEM", api_role="SYSTEM")], + reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')], ) models = [ dict( - path="Bailing-Lite-0830", - token="", # set your key here or in environment variable BAILING_API_KEY - url="https://bailingchat.alipay.com/chat/completions", + path='Bailing-Lite-0830', + token='', # set your key here or in environment variable BAILING_API_KEY + url='https://bailingchat.alipay.com/chat/completions', type=BailingAPI, meta_template=api_meta_template, query_per_second=1, max_seq_len=4096, batch_size=1, generation_kwargs={ - "temperature": 0.4, - "top_p": 1.0, - "top_k": -1, - "n": 1, - "logprobs": 1, - "use_beam_search": False, + 'temperature': 0.4, + 'top_p': 1.0, + 'top_k': -1, + 'n': 1, + 'logprobs': 1, + 'use_beam_search': False, }, ), ] - diff --git a/configs/models/bailing_api/bailing-pro-0920.py b/configs/models/bailing_api/bailing-pro-0920.py index 35814bf79..db69b263e 100644 --- a/configs/models/bailing_api/bailing-pro-0920.py +++ b/configs/models/bailing_api/bailing-pro-0920.py @@ -2,30 +2,29 @@ api_meta_template = dict( round=[ - dict(role="HUMAN", api_role="HUMAN"), - dict(role="BOT", api_role="BOT", generate=False), + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=False), ], - reserved_roles=[dict(role="SYSTEM", api_role="SYSTEM")], + reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')], ) models = [ dict( - path="Bailing-Pro-0920", - token="", # set your key here or in environment variable BAILING_API_KEY - url="https://bailingchat.alipay.com/chat/completions", + path='Bailing-Pro-0920', + token='', # set your key here or in environment variable BAILING_API_KEY + url='https://bailingchat.alipay.com/chat/completions', type=BailingAPI, meta_template=api_meta_template, query_per_second=1, max_seq_len=4096, batch_size=1, generation_kwargs={ - "temperature": 0.4, - "top_p": 1.0, - "top_k": -1, - "n": 1, - "logprobs": 1, - "use_beam_search": False, + 'temperature': 0.4, + 'top_p': 1.0, + 'top_k': -1, + 'n': 1, + 'logprobs': 1, + 'use_beam_search': False, }, ), ] - diff --git a/configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b.py b/configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b.py new file mode 100644 index 000000000..a2661c9fd --- /dev/null +++ b/configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b.py @@ -0,0 +1,15 
@@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='qwen2.5-1.5b-turbomind', + path='Qwen/Qwen2.5-1.5B', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen2_5/lmdeploy_qwen2_5_7b.py b/configs/models/qwen2_5/lmdeploy_qwen2_5_7b.py new file mode 100644 index 000000000..b2d7aa0c5 --- /dev/null +++ b/configs/models/qwen2_5/lmdeploy_qwen2_5_7b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='qwen2.5-7b-turbomind', + path='Qwen/Qwen2.5-7B', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/opencompass/configs/models/bailing_api/bailing-lite-0830.py b/opencompass/configs/models/bailing_api/bailing-lite-0830.py index 1a43b4be1..88053ce98 100644 --- a/opencompass/configs/models/bailing_api/bailing-lite-0830.py +++ b/opencompass/configs/models/bailing_api/bailing-lite-0830.py @@ -2,30 +2,29 @@ api_meta_template = dict( round=[ - dict(role="HUMAN", api_role="HUMAN"), - dict(role="BOT", api_role="BOT", generate=False), + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=False), ], - reserved_roles=[dict(role="SYSTEM", api_role="SYSTEM")], + reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')], ) models = [ dict( - path="Bailing-Lite-0830", - token="", # set your key here or in environment variable BAILING_API_KEY - url="https://bailingchat.alipay.com/chat/completions", + path='Bailing-Lite-0830', + token='', # set your key here or in environment variable BAILING_API_KEY + url='https://bailingchat.alipay.com/chat/completions', type=BailingAPI, meta_template=api_meta_template, query_per_second=1, max_seq_len=4096, batch_size=1, generation_kwargs={ - "temperature": 0.4, - "top_p": 1.0, - "top_k": -1, - "n": 1, - "logprobs": 1, - "use_beam_search": False, + 'temperature': 0.4, + 'top_p': 1.0, + 'top_k': -1, + 'n': 1, + 'logprobs': 1, + 'use_beam_search': False, }, ), ] - diff --git a/opencompass/configs/models/bailing_api/bailing-pro-0920.py b/opencompass/configs/models/bailing_api/bailing-pro-0920.py index 35814bf79..db69b263e 100644 --- a/opencompass/configs/models/bailing_api/bailing-pro-0920.py +++ b/opencompass/configs/models/bailing_api/bailing-pro-0920.py @@ -2,30 +2,29 @@ api_meta_template = dict( round=[ - dict(role="HUMAN", api_role="HUMAN"), - dict(role="BOT", api_role="BOT", generate=False), + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=False), ], - reserved_roles=[dict(role="SYSTEM", api_role="SYSTEM")], + reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')], ) models = [ dict( - path="Bailing-Pro-0920", - token="", # set your key here or in environment variable BAILING_API_KEY - url="https://bailingchat.alipay.com/chat/completions", + path='Bailing-Pro-0920', + token='', # set your key here or in environment variable BAILING_API_KEY + url='https://bailingchat.alipay.com/chat/completions', type=BailingAPI, meta_template=api_meta_template, query_per_second=1, max_seq_len=4096, batch_size=1, generation_kwargs={ - "temperature": 0.4, - "top_p": 1.0, - "top_k": -1, - "n": 1, - "logprobs": 1, - 
"use_beam_search": False, + 'temperature': 0.4, + 'top_p': 1.0, + 'top_k': -1, + 'n': 1, + 'logprobs': 1, + 'use_beam_search': False, }, ), ] - diff --git a/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b.py b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b.py new file mode 100644 index 000000000..a2661c9fd --- /dev/null +++ b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='qwen2.5-1.5b-turbomind', + path='Qwen/Qwen2.5-1.5B', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_7b.py b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_7b.py new file mode 100644 index 000000000..b2d7aa0c5 --- /dev/null +++ b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_7b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='qwen2.5-7b-turbomind', + path='Qwen/Qwen2.5-7B', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/opencompass/models/__init__.py b/opencompass/models/__init__.py index 0beb963a1..0f55b869c 100644 --- a/opencompass/models/__init__.py +++ b/opencompass/models/__init__.py @@ -42,7 +42,8 @@ from .stepfun_api import StepFun # noqa: F401 from .turbomind import TurboMindModel # noqa: F401 from .turbomind_tis import TurboMindTisModel # noqa: F401 -from .turbomind_with_tf_above_v4_33 import TurboMindModelwithChatTemplate # noqa: F401 +from .turbomind_with_tf_above_v4_33 import \ + TurboMindModelwithChatTemplate # noqa: F401 from .unigpt_api import UniGPT # noqa: F401 from .vllm import VLLM # noqa: F401 from .vllm_with_tf_above_v4_33 import VLLMwithChatTemplate # noqa: F401 diff --git a/opencompass/models/bailing_api_oc.py b/opencompass/models/bailing_api_oc.py index 6ff75e0d5..34e8a333a 100644 --- a/opencompass/models/bailing_api_oc.py +++ b/opencompass/models/bailing_api_oc.py @@ -7,9 +7,14 @@ import requests from requests.adapters import HTTPAdapter -from retrying import retry from urllib3.connection import HTTPConnection +try: + from retrying import retry +except ImportError: + retry = None + print('please install retrying by `pip install retrying`') + from opencompass.utils.prompt import PromptList from .base_api import BaseAPIModel @@ -18,6 +23,7 @@ class HTTPAdapterWithSocketOptions(HTTPAdapter): + def __init__(self, *args, **kwargs): self._socket_options = HTTPConnection.default_socket_options + [ (socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1), @@ -29,8 +35,9 @@ def __init__(self, *args, **kwargs): def init_poolmanager(self, *args, **kwargs): if self._socket_options is not None: - kwargs["socket_options"] = self._socket_options - super(HTTPAdapterWithSocketOptions, self).init_poolmanager(*args, **kwargs) + kwargs['socket_options'] = self._socket_options + super(HTTPAdapterWithSocketOptions, + self).init_poolmanager(*args, **kwargs) class BailingAPI(BaseAPIModel): @@ -64,31 +71,29 @@ def __init__( generation_kwargs=generation_kwargs, ) - self.logger.info(f"Bailing API Model Init path: {path} url={url}") + self.logger.info(f'Bailing API 
Model Init path: {path} url={url}') if not token: - token = os.environ.get("BAILING_API_KEY") + token = os.environ.get('BAILING_API_KEY') if token: - self._headers = {"Authorization": f"Bearer {token}"} + self._headers = {'Authorization': f'Bearer {token}'} else: - raise RuntimeError(f"There is not valid token.") - self._headers["Content-Type"] = "application/json" - self._url = url if url else "https://bailingchat.alipay.com/chat/completions" + raise RuntimeError('There is not valid token.') + self._headers['Content-Type'] = 'application/json' + self._url = url if url else \ + 'https://bailingchat.alipay.com/chat/completions' self._model = path self._sessions = [] - self._num = ( - int(os.environ.get("BAILING_API_PARALLEL_NUM")) - if os.environ.get("BAILING_API_PARALLEL_NUM") - else 1 - ) + self._num = (int(os.environ.get('BAILING_API_PARALLEL_NUM')) + if os.environ.get('BAILING_API_PARALLEL_NUM') else 1) try: for _ in range(self._num): adapter = HTTPAdapterWithSocketOptions() sess = requests.Session() - sess.mount("http://", adapter) - sess.mount("https://", adapter) + sess.mount('http://', adapter) + sess.mount('https://', adapter) self._sessions.append(sess) except Exception as e: - self.logger.error(f"Fail to setup the session. {e}") + self.logger.error(f'Fail to setup the session. {e}') raise e def generate( @@ -99,7 +104,8 @@ def generate( """Generate results given a list of inputs. Args: - inputs (Union[List[str], PromptList]): A list of strings or PromptDicts. + inputs (Union[List[str], PromptList]): + A list of strings or PromptDicts. The PromptDict should be organized in OpenCompass' API format. max_out_len (int): The maximum length of the output. @@ -107,8 +113,7 @@ def generate( List[str]: A list of generated strings. """ with concurrent.futures.ThreadPoolExecutor( - max_workers=self._num, - ) as executor: + max_workers=self._num, ) as executor: future_to_m = { executor.submit( self._generate, @@ -120,22 +125,22 @@ def generate( } results = [] for future in concurrent.futures.as_completed(future_to_m): - m = future_to_m[future] + m = future_to_m[future] # noqa F841 resp = future.result() if resp and resp.status_code == 200: try: result = resp.json() - except: - results.append("") + except Exception as e: # noqa F841 + results.append('') else: - if ( - result.get("choices") - and result["choices"][0].get("message") - and result["choices"][0]["message"].get("content") - ): - results.append(result["choices"][0]["message"]["content"]) + if (result.get('choices') + and result['choices'][0].get('message') + and result['choices'][0]['message'].get( + 'content')): + results.append( + result['choices'][0]['message']['content']) else: - results.append("") + results.append('') self.flush() return results @@ -156,27 +161,30 @@ def _generate( str: The generated string. 
""" if isinstance(input, str): - messages = [{"role": "user", "content": input}] + messages = [{'role': 'user', 'content': input}] else: messages = [] for item in input: - content = item["prompt"] + content = item['prompt'] if not content: continue - message = {"content": content} - if item["role"] == "HUMAN": - message["role"] = "user" - elif item["role"] == "BOT": - message["role"] = "assistant" - elif item["role"] == "SYSTEM": - message["role"] = "system" + message = {'content': content} + if item['role'] == 'HUMAN': + message['role'] = 'user' + elif item['role'] == 'BOT': + message['role'] = 'assistant' + elif item['role'] == 'SYSTEM': + message['role'] = 'system' else: - message["role"] = item["role"] + message['role'] = item['role'] messages.append(message) request = { - "model": self._model, - "messages": messages, - "max_seq_len": max( + 'model': + self._model, + 'messages': + messages, + 'max_seq_len': + max( max_out_len if max_out_len else 4096, self.max_seq_len if self.max_seq_len else 4096, ), @@ -191,22 +199,22 @@ def _generate( elif response.status_code == 426: retry_num += 1 # retry else: - raise ValueError(f"Status code = {response.status_code}") + raise ValueError(f'Status code = {response.status_code}') else: raise ValueError( - f"Exceed the maximal retry times. Last status code = {response.status_code}" - ) + f'Exceed the maximal retry times. Last status code ' + f'= {response.status_code}') except Exception as e: - self.logger.error( - f"Fail to inference request={request}; model_name={self.path}; error={e}, stack:{traceback.format_exc()}" - ) + self.logger.error(f'Fail to inference request={request}; ' + f'model_name={self.path}; error={e}, ' + f'stack:{traceback.format_exc()}') raise e return response @retry(stop_max_attempt_number=3, wait_fixed=16000) # ms def _infer_result(self, request, sess): response = sess.request( - "POST", + 'POST', self._url, json=request, headers=self._headers, From 7d50294117e319dfe9fc8ffbf6c5c0268329ee09 Mon Sep 17 00:00:00 2001 From: Songyang Zhang Date: Thu, 26 Sep 2024 18:56:17 +0800 Subject: [PATCH 10/20] [Feature] Update Bailing (#1567) * [Feature] 1. Update CoreBench Base\n 2. Fix lint issue in BalingAPI * Update * Update * Update --- opencompass/models/bailing_api_oc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opencompass/models/bailing_api_oc.py b/opencompass/models/bailing_api_oc.py index 34e8a333a..54e0d502f 100644 --- a/opencompass/models/bailing_api_oc.py +++ b/opencompass/models/bailing_api_oc.py @@ -211,7 +211,7 @@ def _generate( raise e return response - @retry(stop_max_attempt_number=3, wait_fixed=16000) # ms + # @retry(stop_max_attempt_number=3, wait_fixed=16000) # ms def _infer_result(self, request, sess): response = sess.request( 'POST', From e8437db98fc6a817ed101d1945077cdd421089f1 Mon Sep 17 00:00:00 2001 From: Songyang Zhang Date: Fri, 27 Sep 2024 11:15:25 +0800 Subject: [PATCH 11/20] [Feature] Update BailingLM/OpenAI verbose (#1568) * [Feature] 1. Update CoreBench Base\n 2. 
Fix lint issue in BalingAPI * Update * [Feature] Update API * Update --- configs/eval_corebench_2409_base_objective.py | 6 +++--- opencompass/models/bailing_api_oc.py | 1 - opencompass/models/openai_api.py | 4 ++++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/configs/eval_corebench_2409_base_objective.py b/configs/eval_corebench_2409_base_objective.py index 9c9043657..d5d7a3879 100644 --- a/configs/eval_corebench_2409_base_objective.py +++ b/configs/eval_corebench_2409_base_objective.py @@ -81,7 +81,7 @@ ['drop', 'accuracy'], ['math', 'accuracy'], ['gsm8k', 'accuracy'], - ['mathbench-t (average)', 'naive_average'] + ['mathbench-t (average)', 'naive_average'], ['GPQA_diamond', 'accuracy'], ['openai_humaneval', 'humaneval_pass@1'], ['IFEval', 'Prompt-level-strict-accuracy'], @@ -101,7 +101,7 @@ ['drop', 'accuracy'], ['math', 'accuracy'], ['gsm8k', 'accuracy'], - ['mathbench-t (average)', 'naive_average'] + ['mathbench-t (average)', 'naive_average'], ['GPQA_diamond', 'accuracy'], ['openai_humaneval', 'humaneval_pass@1'], ['IFEval', 'Prompt-level-strict-accuracy'], @@ -185,4 +185,4 @@ # PART 5 Utils Configuaration # ####################################################################### base_exp_dir = 'outputs/corebench_2409_objective/' -work_dir = osp.join(base_exp_dir, 'chat_objective') +work_dir = osp.join(base_exp_dir, 'base_objective') diff --git a/opencompass/models/bailing_api_oc.py b/opencompass/models/bailing_api_oc.py index 54e0d502f..d4368a363 100644 --- a/opencompass/models/bailing_api_oc.py +++ b/opencompass/models/bailing_api_oc.py @@ -13,7 +13,6 @@ from retrying import retry except ImportError: retry = None - print('please install retrying by `pip install retrying`') from opencompass.utils.prompt import PromptList diff --git a/opencompass/models/openai_api.py b/opencompass/models/openai_api.py index 4a07dee3f..aff2579a6 100644 --- a/opencompass/models/openai_api.py +++ b/opencompass/models/openai_api.py @@ -601,6 +601,10 @@ def _generate(self, input: PromptList | str, max_out_len: int, if self.verbose: self.logger.info( 'Successfully get response from OpenAI API') + try: + self.logger.info(responses) + except Exception as e: # noqa F841 + pass return responses.choices[0].message.content except Exception as e: self.logger.error(e) From 85a28874aacf14dd215eb6b3212c7307adacbb43 Mon Sep 17 00:00:00 2001 From: Yi Ding Date: Fri, 27 Sep 2024 11:56:57 +0800 Subject: [PATCH 12/20] [BUG]: Fix Bailing API configs (#1570) --- opencompass/models/bailing_api_oc.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/opencompass/models/bailing_api_oc.py b/opencompass/models/bailing_api_oc.py index d4368a363..8e107556c 100644 --- a/opencompass/models/bailing_api_oc.py +++ b/opencompass/models/bailing_api_oc.py @@ -77,6 +77,9 @@ def __init__( self._headers = {'Authorization': f'Bearer {token}'} else: raise RuntimeError('There is not valid token.') + else: + self._headers = {'Authorization': f'Bearer {token}'} + self._headers['Content-Type'] = 'application/json' self._url = url if url else \ 'https://bailingchat.alipay.com/chat/completions' From 7528b8ab8a9b80210e2c51b7257895bdd2ac49ae Mon Sep 17 00:00:00 2001 From: shijinpjlab Date: Sun, 29 Sep 2024 19:24:58 +0800 Subject: [PATCH 13/20] [Feature] Add dingo test (#1529) * add qa dingo * update * change name qa to dingo * eval model: llm_base * update path * change name and move path * add eval_dingo * update import * add for pip * add dingo package * change import place * update import place * fix lint fail * isort * double 
quoted --------- Co-authored-by: sj --- configs/datasets/dingo/dingo_gen.py | 34 ++++++++ configs/eval_dingo.py | 7 ++ .../configs/datasets/dingo/dingo_gen.py | 34 ++++++++ opencompass/datasets/__init__.py | 1 + opencompass/datasets/dingo.py | 84 +++++++++++++++++++ requirements/extra.txt | 1 + 6 files changed, 161 insertions(+) create mode 100644 configs/datasets/dingo/dingo_gen.py create mode 100644 configs/eval_dingo.py create mode 100644 opencompass/configs/datasets/dingo/dingo_gen.py create mode 100644 opencompass/datasets/dingo.py diff --git a/configs/datasets/dingo/dingo_gen.py b/configs/datasets/dingo/dingo_gen.py new file mode 100644 index 000000000..c36f6cdcc --- /dev/null +++ b/configs/datasets/dingo/dingo_gen.py @@ -0,0 +1,34 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import DingoDataset, DingoEvaluator + + +dingo_paths = [ + './data/dingo/en_192.csv', + './data/dingo/zh_170.csv', +] + +dingo_datasets = [] +for path in dingo_paths: + dingo_reader_cfg = dict(input_columns='input', output_column=None) + dingo_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[dict(role='HUMAN', prompt='{input}')])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + dingo_eval_cfg = dict(evaluator=dict(type=DingoEvaluator), pred_role='BOT') + + dingo_datasets.append( + dict( + abbr='dingo_' + path.split('/')[-1].split('.csv')[0], + type=DingoDataset, + path=path, + reader_cfg=dingo_reader_cfg, + infer_cfg=dingo_infer_cfg, + eval_cfg=dingo_eval_cfg, + )) + +datasets = dingo_datasets diff --git a/configs/eval_dingo.py b/configs/eval_dingo.py new file mode 100644 index 000000000..3e0ecb86b --- /dev/null +++ b/configs/eval_dingo.py @@ -0,0 +1,7 @@ +from mmengine.config import read_base + +with read_base(): + from .models.hf_internlm.hf_internlm_7b import models + from .datasets.dingo.dingo_gen import datasets + +work_dir = './outputs/eval_dingo' diff --git a/opencompass/configs/datasets/dingo/dingo_gen.py b/opencompass/configs/datasets/dingo/dingo_gen.py new file mode 100644 index 000000000..c36f6cdcc --- /dev/null +++ b/opencompass/configs/datasets/dingo/dingo_gen.py @@ -0,0 +1,34 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import DingoDataset, DingoEvaluator + + +dingo_paths = [ + './data/dingo/en_192.csv', + './data/dingo/zh_170.csv', +] + +dingo_datasets = [] +for path in dingo_paths: + dingo_reader_cfg = dict(input_columns='input', output_column=None) + dingo_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[dict(role='HUMAN', prompt='{input}')])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + dingo_eval_cfg = dict(evaluator=dict(type=DingoEvaluator), pred_role='BOT') + + dingo_datasets.append( + dict( + abbr='dingo_' + path.split('/')[-1].split('.csv')[0], + type=DingoDataset, + path=path, + reader_cfg=dingo_reader_cfg, + infer_cfg=dingo_infer_cfg, + eval_cfg=dingo_eval_cfg, + )) + +datasets = dingo_datasets diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index a1f201efd..8f178242c 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ 
-33,6 +33,7 @@ from .csl import * # noqa: F401, F403 from .custom import * # noqa: F401, F403 from .cvalues import * # noqa: F401, F403 +from .dingo import * # noqa: F401, F403 from .drcd import * # noqa: F401, F403 from .drop import * # noqa: F401, F403 from .drop_simple_eval import * # noqa: F401, F403 diff --git a/opencompass/datasets/dingo.py b/opencompass/datasets/dingo.py new file mode 100644 index 000000000..753d78ddb --- /dev/null +++ b/opencompass/datasets/dingo.py @@ -0,0 +1,84 @@ +# flake8: nodingo +# yapf: disable +import csv +import json +import os +import time +from typing import List + +from datasets import Dataset + +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class DingoDataset(BaseDataset): + + @staticmethod + def load(path: str): + raw_data = [] + with open(path, encoding='utf-8') as f: + reader = csv.reader(f, delimiter=';') + for row in reader: + if len(row) < 1: + row = [''] + raw_data.append({'input': row[0]}) + return Dataset.from_list(raw_data) + + +@LOAD_DATASET.register_module() +class DingoLongDataset(BaseDataset): + + @staticmethod + def load(path: str): + raw_data = [] + with open(path, 'r', encoding='utf-8') as f: + for line in f: + raw_data.append({'input': json.loads(line).get('input')}) + return Dataset.from_list(raw_data) + + +@ICL_EVALUATORS.register_module() +class DingoEvaluator(BaseEvaluator): + + def score(self, origin_prompt: List, predictions: List) -> dict: + try: + # from dingo.model.model import Model + from dingo.exec import Executor + from dingo.io import InputArgs + except Exception: + raise ModuleNotFoundError( + '=========== ' + 'dingo register fail. please try: pip install dingo-python.' 
+ ' ===========') + + current_time = time.strftime('%Y%m%d_%H%M%S', time.localtime()) + file_data = [{'prompt': pmt, 'prediction': prd} + for pmt, prd in zip(origin_prompt, predictions)] + file_name = 'dingo_file_' + current_time + '.jsonl' + with open(file_name, 'a', encoding='utf-8') as f: + for d in file_data: + json.dump(d, f, ensure_ascii=False) + f.write('\n') + + input_data = { + 'eval_models': ['llm_base'], + 'input_path': file_name, + 'output_path': './outputs/dingo/', + 'dataset': 'local', + 'datasource': 'local', + 'data_format': 'jsonl', + 'column_prompt': ['prompt'], + 'column_content': ['prediction'], + } + # Model.apply_config(input_data["custom_config_path"]) + input_args = InputArgs(**input_data) + executor = Executor.exec_map['local'](input_args) + result = executor.execute() + summary = result[0].to_dict() + + os.remove(file_name) + return summary diff --git a/requirements/extra.txt b/requirements/extra.txt index 218348344..efeef772e 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -1,6 +1,7 @@ # Alpaca-eval alpaca-eval==0.6 cn2an +dingo-python # Icl topk retriever faiss_gpu==1.7.2 # Humaneval, Humaneval X From 763d7755b6a22bcf4ac1d579966829125d4dbc61 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Mon, 30 Sep 2024 15:13:26 +0800 Subject: [PATCH 14/20] [BUG]GaokaoBench dataset fix (#1583) --- opencompass/datasets/GaokaoBench.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opencompass/datasets/GaokaoBench.py b/opencompass/datasets/GaokaoBench.py index 383845356..d3cd31a00 100644 --- a/opencompass/datasets/GaokaoBench.py +++ b/opencompass/datasets/GaokaoBench.py @@ -16,7 +16,7 @@ class GaokaoBenchDataset(BaseDataset): @staticmethod def load(path: str, name: str): - data = get_data_path(path, local_mode=True) + path = get_data_path(path, local_mode=True) if environ.get('DATASET_SOURCE') == 'ModelScope': from modelscope import MsDataset return MsDataset.load(path, subset_name=name, split='test') From bbdca5eb4cb08c24a386c22bf677d1856485f5f4 Mon Sep 17 00:00:00 2001 From: x54-729 <45304952+x54-729@users.noreply.github.com> Date: Mon, 30 Sep 2024 15:46:06 +0800 Subject: [PATCH 15/20] [BUG] Fix eos token handling and add comments for InternTrain (#1569) Co-authored-by: x54-729 --- opencompass/models/interntrain.py | 71 ++++++++++++++++++++++++++++--- 1 file changed, 65 insertions(+), 6 deletions(-) diff --git a/opencompass/models/interntrain.py b/opencompass/models/interntrain.py index d6c233cdb..6d904acf7 100644 --- a/opencompass/models/interntrain.py +++ b/opencompass/models/interntrain.py @@ -79,6 +79,50 @@ def initialize_model(self): @MODELS.register_module() class InternTrain(BaseModel): + """Model wrapper for InternTrain. + + Args: + path (str): The name or path to HuggingFace's model. + module_path (str): Path of InternTrain repository. + max_seq_len (int): The maximum length of the input sequence. Defaults + to 2048. + tokenizer_only (bool): If True, only the tokenizer will be initialized. + Defaults to False. + tokenizer_path (str): The path to the tokenizer. Defaults to None. + tokenizer_type: InternTrain's tokenizer type. Defaults to 'InternLM'. + model_config (str, dict, optional): Config of model. There are several + options for this parameter: + + - filename (str): The config items are defined in a python file + so the model will load configs from this file. + - config (dict): The configuration items are defined in a dict + and the model will be initialized from ```model_config```. 
+ - None: The config is loaded from ```path```. In this case, + please make sure that ```path``` contains a config file named + ``model_config.pt``. + + Defaults to None. + model_type: Type of model. Defaults to 'InternTrain' + ckpt_type: The type of load function in InternTrain when checkpoints + are loaded. Defaults to None, which means load the checkpoint + directlywith pipeline merged. + meta_template (Dict, optional): The model's meta prompt + template if needed, in case the requirement of injecting or + wrapping of any meta instructions. + model_dtype: The model's dtype. If None, will use dtype defined in + ```model_config```. Defaults to None. + generation_kwargs (Dict, optional): The generation kwargs for the + model. Defaults to dict(). + sync_rank (bool): Whether to sync inputs between ranks. Do not use this + if you are not familiar with this behavior. Check `sync_inputs` + function for more details. Defaults to False. + mode (str, optional): The method of input truncation when input length + exceeds max_seq_len. 'mid' represents the part of input to + truncate. Defaults to 'none'. + end_str (str, optional): Whether to trim generated strings with end_str + if the model has special ending strings that are not handled well. + Defaults to None. + """ def __init__(self, path: str, @@ -87,14 +131,15 @@ def __init__(self, tokenizer_only: bool = False, tokenizer_path: Optional[str] = None, tokenizer_type: str = 'INTERNLM', - model_config: Optional[str] = None, + model_config: Optional[Union[str, Dict]] = None, model_type: str = 'INTERNLM2', ckpt_type: Optional[str] = None, meta_template: Optional[Dict] = None, model_dtype: Optional[str] = None, generation_kwargs={}, sync_rank: bool = False, - mode='none'): + mode='none', + end_str: Optional[str] = None): super().__init__(path=path, max_seq_len=max_seq_len, tokenizer_only=tokenizer_only, @@ -146,6 +191,7 @@ def __init__(self, bos_token_id=self.tokenizer.bos_id, pad_token_id=self.tokenizer.bos_id, eos_token_id=eos_token_ids) + self.end_str = end_str def _load_model(self, path: str, @@ -287,8 +333,10 @@ def generate(self, max_length=tokens.shape[1] + max_out_len, **self.generation_kwargs) # bsz, num_return_sequences, max_length outputs = outputs[:, 0, tokens.shape[1]:] - output_text = self.batch_decode(outputs, - stopping_criteria=stopping_criteria) + output_text = self.batch_decode( + outputs, + eos_token_ids=self.generator.eos_token_id, + stopping_criteria=stopping_criteria) return output_text @@ -407,11 +455,22 @@ def batch_encode(self, return torch.LongTensor(tokens).cuda() - def batch_decode(self, outputs, stopping_criteria: List[str] = []): + def batch_decode(self, + outputs, + eos_token_ids: List[int], + stopping_criteria: List[str] = []): # outputs: bsz, seq_len output_text = [] + outputs = outputs.tolist() for output in outputs: - text = self.tokenizer.decode(output.tolist()) + # cut off by eos_token_ids + eos_idx = len(output) + for eos_id in eos_token_ids: + if eos_id in output: + eos_idx = min(output.index(eos_id), eos_idx) + text = self.tokenizer.decode(output[:eos_idx]) + if self.end_str is not None: + text = text.split(self.end_str)[0] for stop_word in stopping_criteria: text = text.split(stop_word)[0] output_text.append(text) From 22a4e7651180f0940ea7173e58e8121abe46ca11 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Mon, 30 Sep 2024 16:57:41 +0800 Subject: [PATCH 16/20] [BUMP] Bump version to 0.3.3 (#1581) --- opencompass/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/opencompass/__init__.py b/opencompass/__init__.py index d1daced0e..80eb7f98f 100644 --- a/opencompass/__init__.py +++ b/opencompass/__init__.py @@ -1 +1 @@ -__version__ = '0.3.2.post1' +__version__ = '0.3.3' From 89abcba486b8c1e6c6c8b93b6ed856a0d0bb3554 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Wed, 2 Oct 2024 12:30:38 +0800 Subject: [PATCH 17/20] [CI] Fix testcase failure (#1582) * update * Update oc_score_baseline.yaml * Update daily-run-test.yml * Update daily-run-test.yml * Update daily-run-test.yml * Update daily-run-test.yml --------- Co-authored-by: zhulin1 --- .github/scripts/oc_score_assert.py | 3 +-- .github/scripts/oc_score_baseline.yaml | 12 ++++++------ .github/workflows/daily-run-test.yml | 8 ++++---- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/.github/scripts/oc_score_assert.py b/.github/scripts/oc_score_assert.py index 6f2c0a11a..c01ef6864 100644 --- a/.github/scripts/oc_score_assert.py +++ b/.github/scripts/oc_score_assert.py @@ -7,8 +7,7 @@ output_path = 'regression_result_daily' chat_model_list = [ - 'baichuan2-7b-chat-hf', 'glm-4-9b-chat-turbomind', 'glm-4-9b-chat-vllm', - 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf', + 'baichuan2-7b-chat-hf', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf', 'deepseek-v2-lite-chat-hf', 'deepseek-7b-chat-vllm', 'gemma2-2b-it-hf', 'gemma2-9b-it-hf', 'gemma-7b-it-vllm', 'internlm2_5-7b-chat-hf', 'internlm2_5-20b-chat-hf', 'internlm2_5-7b-chat-turbomind', diff --git a/.github/scripts/oc_score_baseline.yaml b/.github/scripts/oc_score_baseline.yaml index 9690aa2c5..809dfea45 100644 --- a/.github/scripts/oc_score_baseline.yaml +++ b/.github/scripts/oc_score_baseline.yaml @@ -244,14 +244,14 @@ gemma-7b-hf: race-high: 66 gemma2-2b-hf: - gsm8k: 8 - race-middle: 31 - race-high: 30 + gsm8k: 33 + race-middle: 56 + race-high: 58 gemma2-9b-hf: - gsm8k: 20 - race-middle: 42 - race-high: 35 + gsm8k: 70 + race-middle: 82 + race-high: 84 internlm2_5-7b-hf: gsm8k: 47 diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml index 894b149e0..42ada2f08 100644 --- a/.github/workflows/daily-run-test.yml +++ b/.github/workflows/daily-run-test.yml @@ -123,16 +123,16 @@ jobs: conda info --envs export from_tf=TRUE python tools/list_configs.py internlm2_5 mmlu - opencompass --models hf_internlm2_5_7b --datasets race_ppl --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }} --reuse --max-num-workers 2 + opencompass --models hf_internlm2_5_7b --datasets race_ppl --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }} --reuse rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }}/*/summary regression_result_daily python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py - opencompass --models hf_internlm2_5_7b_chat hf_internlm2_5_1_8b_chat --datasets race_gen -a lmdeploy --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 + opencompass --models hf_internlm2_5_7b_chat --datasets race_gen -a lmdeploy --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }} --reuse rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }}/*/summary regression_result_daily python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py - opencompass --datasets 
race_ppl --hf-type base --hf-path internlm/internlm2_5-7b --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3 --reuse --max-num-workers 2 + opencompass --datasets race_ppl --hf-type base --hf-path internlm/internlm2_5-7b --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }} --reuse rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }}/*/summary regression_result_daily python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py - opencompass --datasets race_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 + opencompass --datasets race_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }} --reuse rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }}/*/summary regression_result_daily python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py - name: Remove Conda Env From 4d6349dfe14f81dc5eea68704c9597f1866a0d51 Mon Sep 17 00:00:00 2001 From: x54-729 <45304952+x54-729@users.noreply.github.com> Date: Tue, 8 Oct 2024 11:34:04 +0800 Subject: [PATCH 18/20] [FIX] fix interntrain get_loglikelihood (#1584) --- opencompass/models/interntrain.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/opencompass/models/interntrain.py b/opencompass/models/interntrain.py index 6d904acf7..e846aae2f 100644 --- a/opencompass/models/interntrain.py +++ b/opencompass/models/interntrain.py @@ -288,7 +288,7 @@ def _convert_dtype(self, default_dtype, model_dtype=None): else: raise NotImplementedError(f'Unknown model dtype {model_dtype}') - def get_token_len(self, prompt: str) -> int: + def get_token_len(self, prompt: str, use_bos=None, use_eos=None) -> int: """Get lengths of the tokenized strings. 
Args: @@ -297,7 +297,7 @@ def get_token_len(self, prompt: str) -> int: Returns: int: Length of the input tokens """ - tokens = self.tokenizer(prompt, use_bos=True, use_eos=True) + tokens = self.tokenizer(prompt, use_bos=use_bos, use_eos=use_eos) return len(tokens) def generate(self, @@ -391,7 +391,7 @@ def get_loglikelihood(self, input_texts: List[str], for input_text, cont in zip(input_texts, conts) ] replaced_lens = [ - len(self.encode(input_text)[0]) for input_text in replaced_texts + self.get_token_len(input_text) for input_text in replaced_texts ] loglikelihoods = [] for nloss, nlen, rlen in zip(loss, lens, replaced_lens): From d2ab51abbd628b3b2c260c403ffc069c4d0a43ee Mon Sep 17 00:00:00 2001 From: Songyang Zhang Date: Wed, 9 Oct 2024 17:09:48 +0800 Subject: [PATCH 19/20] [Bug] Fix pre-commit hook (#1592) --- .github/workflows/lint.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index ae9a9bd2f..bc6d36a7e 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -17,7 +17,7 @@ jobs: python-version: '3.10' - name: Install pre-commit hook run: | - pip install pre-commit mmengine + pip install pre-commit==3.8.0 mmengine pre-commit install - name: Linting run: pre-commit run --all-files From b52ba65c267c4d8bf05cd57ed3386a2d466887db Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Wed, 9 Oct 2024 22:58:06 +0800 Subject: [PATCH 20/20] [Feature] Integrate lmdeploy pipeline api (#1198) * integrate lmdeploy's pipeline api * fix linting * update user guide * rename * update * update * update * rollback class name * update * remove unused code * update * update * fix ci check * compatibility * remove concurrency * Update configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py * Update docs/zh_cn/advanced_guides/evaluation_lmdeploy.md * [Bug] fix lint --------- Co-authored-by: Songyang Zhang Co-authored-by: tonysy --- .../eval_internlm_chat_lmdeploy_pytorch.py | 69 ------ configs/eval_internlm_chat_lmdeploy_tis.py | 41 ---- configs/eval_internlm_chat_turbomind_tis.py | 40 ---- configs/eval_internlm_turbomind_tis.py | 28 --- .../hf_internlm/lmdeploy_internlm2_chat_7b.py | 17 +- .../en/advanced_guides/evaluation_lmdeploy.md | 88 ++++++++ .../advanced_guides/evaluation_turbomind.md | 78 ------- .../advanced_guides/evaluation_lmdeploy.md | 86 ++++++++ .../advanced_guides/evaluation_turbomind.md | 75 ------- .../hf_internlm/lmdeploy_internlm2_chat_7b.py | 17 +- opencompass/models/__init__.py | 3 - opencompass/models/lmdeploy_pytorch.py | 188 ---------------- opencompass/models/lmdeploy_tis.py | 200 ------------------ opencompass/models/turbomind_tis.py | 135 ------------ .../models/turbomind_with_tf_above_v4_33.py | 128 ++++------- opencompass/utils/run.py | 11 +- 16 files changed, 249 insertions(+), 955 deletions(-) delete mode 100644 configs/eval_internlm_chat_lmdeploy_pytorch.py delete mode 100644 configs/eval_internlm_chat_lmdeploy_tis.py delete mode 100644 configs/eval_internlm_chat_turbomind_tis.py delete mode 100644 configs/eval_internlm_turbomind_tis.py create mode 100644 docs/en/advanced_guides/evaluation_lmdeploy.md delete mode 100644 docs/en/advanced_guides/evaluation_turbomind.md create mode 100644 docs/zh_cn/advanced_guides/evaluation_lmdeploy.md delete mode 100644 docs/zh_cn/advanced_guides/evaluation_turbomind.md delete mode 100644 opencompass/models/lmdeploy_pytorch.py delete mode 100644 opencompass/models/lmdeploy_tis.py delete mode 100644 opencompass/models/turbomind_tis.py diff --git 
a/configs/eval_internlm_chat_lmdeploy_pytorch.py b/configs/eval_internlm_chat_lmdeploy_pytorch.py deleted file mode 100644 index 4ea1f84c2..000000000 --- a/configs/eval_internlm_chat_lmdeploy_pytorch.py +++ /dev/null @@ -1,69 +0,0 @@ -from mmengine.config import read_base -from opencompass.models import LmdeployPytorchModel - - -with read_base(): - # choose a list of datasets - from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets - from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets - from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets - from opencompass.configs.datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets - from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets - from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets - from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets - from opencompass.configs.datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets - # and output the results in a choosen format - from opencompass.configs.summarizers.medium import summarizer - - -datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) - - -meta_template = dict( - round=[ - dict(role='HUMAN', begin='<|User|>:', end='\n'), - dict(role='BOT', begin='<|Bot|>:', end='\n', generate=True), - ], - eos_token_id=103028) - -# config for internlm-chat-7b -internlm_chat_7b = dict( - type=LmdeployPytorchModel, - abbr='internlm-chat-7b-pytorch', - path='internlm/internlm-chat-7b', - engine_config=dict(session_len=2048, - max_batch_size=16), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=100), - max_out_len=100, - max_seq_len=2048, - batch_size=16, - concurrency=16, - meta_template=meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), - end_str='', -) - -# config for internlm-chat-20b -internlm_chat_20b = dict( - type=LmdeployPytorchModel, - abbr='internlm-chat-20b-pytorch', - path='internlm/internlm-chat-20b', - engine_config=dict(session_len=2048, - max_batch_size=8), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=100), - max_out_len=100, - max_seq_len=2048, - batch_size=8, - concurrency=8, - meta_template=meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), - end_str='', - ) - -models = [internlm_chat_20b] diff --git a/configs/eval_internlm_chat_lmdeploy_tis.py b/configs/eval_internlm_chat_lmdeploy_tis.py deleted file mode 100644 index 8f5470d52..000000000 --- a/configs/eval_internlm_chat_lmdeploy_tis.py +++ /dev/null @@ -1,41 +0,0 @@ -from mmengine.config import read_base -from opencompass.models.lmdeploy_tis import LmdeployTisModel - -with read_base(): - # choose a list of datasets - from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets - from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets - from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets - from opencompass.configs.datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets - from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets - from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets - from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets - from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets - from 
opencompass.configs.datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets - # and output the results in a choosen format - from opencompass.configs.summarizers.medium import summarizer - -datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) - -meta_template = dict( - round=[ - dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'), - dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True), - ], - eos_token_id=92542 -) - -models = [ - dict( - type=LmdeployTisModel, - abbr='internlm-chat-20b-lmdeploy-tis', - path='internlm/internlm-chat-20b', - tis_addr='0.0.0.0:33337', - max_out_len=100, - max_seq_len=2048, - batch_size=8, - meta_template=meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), - end_str='<|im_end|>', - ) -] diff --git a/configs/eval_internlm_chat_turbomind_tis.py b/configs/eval_internlm_chat_turbomind_tis.py deleted file mode 100644 index 01f42000f..000000000 --- a/configs/eval_internlm_chat_turbomind_tis.py +++ /dev/null @@ -1,40 +0,0 @@ -from mmengine.config import read_base -from opencompass.models.turbomind_tis import TurboMindTisModel - -with read_base(): - # choose a list of datasets - from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets - from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets - from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets - from opencompass.configs.datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets - from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets - from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets - from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets - from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets - from opencompass.configs.datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets - # and output the results in a choosen format - from opencompass.configs.summarizers.medium import summarizer - -datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) - - -meta_template = dict( - round=[ - dict(role='HUMAN', begin='<|User|>:', end='\n'), - dict(role='BOT', begin='<|Bot|>:', end='\n', generate=True), - ], - eos_token_id=103028) - -models = [ - dict( - type=TurboMindTisModel, - abbr='internlm-chat-20b-turbomind', - path='internlm', - tis_addr='0.0.0.0:33337', - max_out_len=100, - max_seq_len=2048, - batch_size=8, - meta_template=meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), - ) -] diff --git a/configs/eval_internlm_turbomind_tis.py b/configs/eval_internlm_turbomind_tis.py deleted file mode 100644 index 98914fa47..000000000 --- a/configs/eval_internlm_turbomind_tis.py +++ /dev/null @@ -1,28 +0,0 @@ -from mmengine.config import read_base -from opencompass.models.turbomind_tis import TurboMindTisModel - -with read_base(): - # choose a list of datasets - from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets - from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets - from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets - from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets - from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets - from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets - # and output the results 
in a choosen format - from opencompass.configs.summarizers.medium import summarizer - -datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) - -models = [ - dict( - type=TurboMindTisModel, - abbr='internlm-chat-20b-turbomind', - path='internlm', - tis_addr='0.0.0.0:33337', - max_out_len=100, - max_seq_len=2048, - batch_size=8, - run_cfg=dict(num_gpus=1, num_procs=1), - ) -] diff --git a/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py b/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py index 60097e373..38ea39d7d 100644 --- a/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py +++ b/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py @@ -1,15 +1,24 @@ from opencompass.models import TurboMindModelwithChatTemplate + models = [ dict( type=TurboMindModelwithChatTemplate, - abbr='internlm2-chat-7b-turbomind', + abbr=f'internlm2-chat-7b-lmdeploy', path='internlm/internlm2-chat-7b', - engine_config=dict(session_len=8192, max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + # inference backend of LMDeploy. It can be either 'turbomind' or 'pytorch'. + # If the model is not supported by 'turbomind', it will fallback to + # 'pytorch' + backend='turbomind', + # For the detailed engine config and generation config, please refer to + # https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/messages.py + engine_config=dict(tp=1), + gen_config=dict(do_sample=False), max_seq_len=8192, max_out_len=4096, - batch_size=16, + # the max number of prompts that LMDeploy receives + # in `generate` function + batch_size=5000, run_cfg=dict(num_gpus=1), ) ] diff --git a/docs/en/advanced_guides/evaluation_lmdeploy.md b/docs/en/advanced_guides/evaluation_lmdeploy.md new file mode 100644 index 000000000..bfacd4881 --- /dev/null +++ b/docs/en/advanced_guides/evaluation_lmdeploy.md @@ -0,0 +1,88 @@ +# Evaluation with LMDeploy + +We now support evaluation of models accelerated by the [LMDeploy](https://github.com/InternLM/lmdeploy). LMDeploy is a toolkit designed for compressing, deploying, and serving LLM. It has a remarkable inference performance. We now illustrate how to evaluate a model with the support of LMDeploy in OpenCompass. + +## Setup + +### Install OpenCompass + +Please follow the [instructions](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) to install the OpenCompass and prepare the evaluation datasets. + +### Install LMDeploy + +Install lmdeploy via pip (python 3.8+) + +```shell +pip install lmdeploy +``` + +The default prebuilt package is compiled on CUDA 12. However, if CUDA 11+ is required, you can install lmdeploy by: + +```shell +export LMDEPLOY_VERSION=0.6.0 +export PYTHON_VERSION=310 +pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 +``` + +## Evaluation + +When evaluating a model, it is necessary to prepare an evaluation configuration that specifies information such as the evaluation dataset, the model, and inference parameters. 
+ +Taking [internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b) as an example, the evaluation config is as follows: + +```python +# configure the dataset +from mmengine.config import read_base + + +with read_base(): + # choose a list of datasets + from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets + from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets + from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets + from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \ + gsm8k_datasets + # and output the results in a chosen format + from .summarizers.medium import summarizer + +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + +# configure lmdeploy +from opencompass.models import TurboMindModelwithChatTemplate + + + +# configure the model +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr=f'internlm2-chat-7b-lmdeploy', + # model path, which can be the address of a model repository on the Hugging Face Hub or a local path + path='internlm/internlm2-chat-7b', + # inference backend of LMDeploy. It can be either 'turbomind' or 'pytorch'. + # If the model is not supported by 'turbomind', it will fallback to + # 'pytorch' + backend='turbomind', + # For the detailed engine config and generation config, please refer to + # https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/messages.py + engine_config=dict(tp=1), + gen_config=dict(do_sample=False), + # the max size of the context window + max_seq_len=7168, + # the max number of new tokens + max_out_len=1024, + # the max number of prompts that LMDeploy receives + # in `generate` function + batch_size=5000, + run_cfg=dict(num_gpus=1), + ) +] +``` + +Place the aforementioned configuration in a file, such as "configs/eval_internlm2_lmdeploy.py". Then, in the home folder of OpenCompass, start evaluation by the following command: + +```shell +python run.py configs/eval_internlm2_lmdeploy.py -w outputs +``` + +You are expected to get the evaluation results after the inference and evaluation. diff --git a/docs/en/advanced_guides/evaluation_turbomind.md b/docs/en/advanced_guides/evaluation_turbomind.md deleted file mode 100644 index c1299f0b3..000000000 --- a/docs/en/advanced_guides/evaluation_turbomind.md +++ /dev/null @@ -1,78 +0,0 @@ -# Evaluation with LMDeploy - -We now support evaluation of models accelerated by the [LMDeploy](https://github.com/InternLM/lmdeploy). LMDeploy is a toolkit designed for compressing, deploying, and serving LLM. **TurboMind** is an efficient inference engine proposed by LMDeploy. OpenCompass is compatible with TurboMind. We now illustrate how to evaluate a model with the support of TurboMind in OpenCompass. - -## Setup - -### Install OpenCompass - -Please follow the [instructions](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) to install the OpenCompass and prepare the evaluation datasets. - -### Install LMDeploy - -Install lmdeploy via pip (python 3.8+) - -```shell -pip install lmdeploy -``` - -## Evaluation - -OpenCompass integrates turbomind's python API for evaluation. - -We take the InternLM-20B as example. 
Firstly, we prepare the evaluation config `configs/eval_internlm_turbomind.py`: - -```python -from mmengine.config import read_base -from opencompass.models.turbomind import TurboMindModel - - -with read_base(): - # choose a list of datasets - from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets - from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets - from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets - from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets - from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets - from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets - # and output the results in a chosen format - from .summarizers.medium import summarizer - -datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) - -# config for internlm-20b model -internlm_20b = dict( - type=TurboMindModel, - abbr='internlm-20b-turbomind', - path="internlm/internlm-20b", # this path should be same as in huggingface - engine_config=dict(session_len=2048, - max_batch_size=8, - rope_scaling_factor=1.0), - gen_config=dict(top_k=1, top_p=0.8, - temperature=1.0, - max_new_tokens=100), - max_out_len=100, - max_seq_len=2048, - batch_size=8, - concurrency=8, - run_cfg=dict(num_gpus=1, num_procs=1), - end_str='' - ) - -models = [internlm_20b] -``` - -Then, in the home folder of OpenCompass, start evaluation by the following command: - -```shell -python run.py configs/eval_internlm_turbomind.py -w outputs/turbomind/internlm-20b -``` - -You are expected to get the evaluation results after the inference and evaluation. - -**Note**: - -- If you want to pass more arguments for `engine_config`和`gen_config` in the evaluation config file, please refer to [TurbomindEngineConfig](https://lmdeploy.readthedocs.io/en/latest/inference/pipeline.html#turbomindengineconfig) - and [GenerationConfig](https://lmdeploy.readthedocs.io/en/latest/inference/pipeline.html#generationconfig) -- If you evaluate the InternLM Chat model, please use configuration file `eval_internlm_chat_turbomind.py` -- If you evaluate the InternLM 7B model, please modify `eval_internlm_turbomind.py` or `eval_internlm_chat_turbomind.py` by changing to the setting `models = [internlm_7b]` in the last line. 
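Both the removed TurboMind guide above and the new LMDeploy guides route evaluation through LMDeploy's `pipeline` API, which `TurboMindModelwithChatTemplate` now wraps. A minimal, illustrative sketch of driving that pipeline directly — assuming lmdeploy >= 0.6 is installed and the `internlm/internlm2-chat-7b` checkpoint from the docs is reachable; this snippet is not part of the patch itself — can serve as a smoke test before launching a full OpenCompass run:

```python
# Illustrative sketch only: exercise the LMDeploy pipeline that the new
# TurboMindModelwithChatTemplate wrapper builds internally.
# Assumes lmdeploy>=0.6; the prompt below is a made-up example.
from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline

pipe = pipeline(
    'internlm/internlm2-chat-7b',                    # HF Hub id or a local path
    backend_config=TurbomindEngineConfig(tp=1, session_len=7168),
)
# Greedy decoding, mirroring gen_config=dict(do_sample=False) in the configs above.
outputs = pipe(
    ['Briefly explain what perplexity measures.'],
    gen_config=GenerationConfig(do_sample=False, max_new_tokens=64),
)
print(outputs[0].text)
```

If the checkpoint is not supported by the turbomind backend, substituting `PytorchEngineConfig` for `TurbomindEngineConfig` mirrors the `backend='pytorch'` fallback described in the new guides.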
diff --git a/docs/zh_cn/advanced_guides/evaluation_lmdeploy.md b/docs/zh_cn/advanced_guides/evaluation_lmdeploy.md new file mode 100644 index 000000000..158399641 --- /dev/null +++ b/docs/zh_cn/advanced_guides/evaluation_lmdeploy.md @@ -0,0 +1,86 @@ +# 使用 LMDeploy 加速评测 + +我们支持在评测大语言模型时,使用 [LMDeploy](https://github.com/InternLM/lmdeploy) 作为推理加速引擎。LMDeploy 是涵盖了 LLM 和 VLM 任务的全套轻量化、部署和服务解决方案,拥有卓越的推理性能。本教程将介绍如何使用 LMDeploy 加速对模型的评测。 + +## 环境配置 + +### 安装 OpenCompass + +请根据 OpenCompass [安装指南](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) 来安装算法库和准备数据集。 + +### 安装 LMDeploy + +使用 pip 安装 LMDeploy (python 3.8+): + +```shell +pip install lmdeploy +``` + +LMDeploy 预编译包默认基于 CUDA 12 编译。如果需要在 CUDA 11+ 下安装 LMDeploy,请执行以下命令: + +```shell +export LMDEPLOY_VERSION=0.6.0 +export PYTHON_VERSION=310 +pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 +``` + +## 评测 + +在评测一个模型时,需要准备一份评测配置,指明评测集、模型和推理参数等信息。 + +以 [internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b) 模型为例,相关的配置信息如下: + +```python +# configure the dataset +from mmengine.config import read_base + + +with read_base(): + # choose a list of datasets + from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets + from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets + from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets + from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \ + gsm8k_datasets + # and output the results in a chosen format + from .summarizers.medium import summarizer + +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + +# configure lmdeploy +from opencompass.models import TurboMindModelwithChatTemplate + + + +# configure the model +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr=f'internlm2-chat-7b-lmdeploy', + # model path, which can be the address of a model repository on the Hugging Face Hub or a local path + path='internlm/internlm2-chat-7b', + # inference backend of LMDeploy. It can be either 'turbomind' or 'pytorch'. 
+ # If the model is not supported by 'turbomind', it will fallback to + # 'pytorch' + backend='turbomind', + # For the detailed engine config and generation config, please refer to + # https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/messages.py + engine_config=dict(tp=1), + gen_config=dict(do_sample=False), + # the max size of the context window + max_seq_len=7168, + # the max number of new tokens + max_out_len=1024, + # the max number of prompts that LMDeploy receives + # in `generate` function + batch_size=32, + run_cfg=dict(num_gpus=1), + ) +] +``` + +把上述配置放在文件中,比如 "configs/eval_internlm2_lmdeploy.py"。然后,在 OpenCompass 的项目目录下,执行如下命令可得到评测结果: + +```shell +python run.py configs/eval_internlm2_lmdeploy.py -w outputs +``` diff --git a/docs/zh_cn/advanced_guides/evaluation_turbomind.md b/docs/zh_cn/advanced_guides/evaluation_turbomind.md deleted file mode 100644 index a7c37b758..000000000 --- a/docs/zh_cn/advanced_guides/evaluation_turbomind.md +++ /dev/null @@ -1,75 +0,0 @@ -# 评测 LMDeploy 模型 - -我们支持评测使用 [LMDeploy](https://github.com/InternLM/lmdeploy) 加速过的大语言模型。LMDeploy 由 MMDeploy 和 MMRazor 团队联合开发,是涵盖了 LLM 任务的全套轻量化、部署和服务解决方案。 **TurboMind** 是 LMDeploy 推出的高效推理引擎。OpenCompass 对 TurboMind 进行了适配,本教程将介绍如何使用 OpenCompass 来对 TurboMind 加速后的模型进行评测。 - -## 环境配置 - -### 安装 OpenCompass - -请根据 OpenCompass [安装指南](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) 来安装算法库和准备数据集。 - -### 安装 LMDeploy - -使用 pip 安装 LMDeploy (python 3.8+): - -```shell -pip install lmdeploy -``` - -## 评测 - -OpenCompass 支持分别通过 turbomind python API 评测数据集。 - -下文以 InternLM-20B 模型为例,介绍如何评测。首先我们准备好测试配置文件`configs/eval_internlm_turbomind.py`: - -```python -from mmengine.config import read_base -from opencompass.models.turbomind import TurboMindModel - - -with read_base(): - # choose a list of datasets - from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets - from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets - from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets - from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets - from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets - from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets - # and output the results in a chosen format - from .summarizers.medium import summarizer - -datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) - -# config for internlm-20b model -internlm_20b = dict( - type=TurboMindModel, - abbr='internlm-20b-turbomind', - path="internlm/internlm-20b", # 注意路径与huggingface保持一致 - engine_config=dict(session_len=2048, - max_batch_size=8, - rope_scaling_factor=1.0), - gen_config=dict(top_k=1, top_p=0.8, - temperature=1.0, - max_new_tokens=100), - max_out_len=100, - max_seq_len=2048, - batch_size=8, - concurrency=8, - run_cfg=dict(num_gpus=1, num_procs=1), - end_str='' - ) - -models = [internlm_20b] -``` - -然后,在 OpenCompass 的项目目录下,执行如下命令可得到评测结果: - -```shell -python run.py configs/eval_internlm_turbomind.py -w outputs/turbomind/internlm-20b -``` - -**注:** - -- 如果想在测评配置文件中`engine_config`和`gen_config`字段传递更多参数,请参考[TurbomindEngineConfig](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/pipeline.html#turbomindengineconfig) 和 [GenerationConfig](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/pipeline.html#generationconfig) -- 如果评测 InternLM Chat 模型,请使用配置文件 `eval_internlm_chat_turbomind.py` -- 如果评测 InternLM 7B 模型,请修改 `eval_internlm_turbomind.py` 或者 `eval_internlm_chat_turbomind.py`。将`models`字段配置为`models = [internlm_7b]` 。 diff --git 
a/opencompass/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py b/opencompass/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py index 60097e373..38ea39d7d 100644 --- a/opencompass/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py +++ b/opencompass/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py @@ -1,15 +1,24 @@ from opencompass.models import TurboMindModelwithChatTemplate + models = [ dict( type=TurboMindModelwithChatTemplate, - abbr='internlm2-chat-7b-turbomind', + abbr=f'internlm2-chat-7b-lmdeploy', path='internlm/internlm2-chat-7b', - engine_config=dict(session_len=8192, max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + # inference backend of LMDeploy. It can be either 'turbomind' or 'pytorch'. + # If the model is not supported by 'turbomind', it will fallback to + # 'pytorch' + backend='turbomind', + # For the detailed engine config and generation config, please refer to + # https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/messages.py + engine_config=dict(tp=1), + gen_config=dict(do_sample=False), max_seq_len=8192, max_out_len=4096, - batch_size=16, + # the max number of prompts that LMDeploy receives + # in `generate` function + batch_size=5000, run_cfg=dict(num_gpus=1), ) ] diff --git a/opencompass/models/__init__.py b/opencompass/models/__init__.py index 0f55b869c..580402d46 100644 --- a/opencompass/models/__init__.py +++ b/opencompass/models/__init__.py @@ -25,8 +25,6 @@ from .krgpt_api import KrGPT # noqa: F401 from .lightllm_api import LightllmAPI, LightllmChatAPI # noqa: F401 from .llama2 import Llama2, Llama2Chat # noqa: F401 -from .lmdeploy_pytorch import LmdeployPytorchModel # noqa: F401 -from .lmdeploy_tis import LmdeployTisModel # noqa: F401 from .minimax_api import MiniMax, MiniMaxChatCompletionV2 # noqa: F401 from .mistral_api import Mistral # noqa: F401 from .mixtral import Mixtral # noqa: F401 @@ -41,7 +39,6 @@ from .sensetime_api import SenseTime # noqa: F401 from .stepfun_api import StepFun # noqa: F401 from .turbomind import TurboMindModel # noqa: F401 -from .turbomind_tis import TurboMindTisModel # noqa: F401 from .turbomind_with_tf_above_v4_33 import \ TurboMindModelwithChatTemplate # noqa: F401 from .unigpt_api import UniGPT # noqa: F401 diff --git a/opencompass/models/lmdeploy_pytorch.py b/opencompass/models/lmdeploy_pytorch.py deleted file mode 100644 index 80924c276..000000000 --- a/opencompass/models/lmdeploy_pytorch.py +++ /dev/null @@ -1,188 +0,0 @@ -from concurrent.futures import ThreadPoolExecutor -from typing import Dict, List, Optional, Union - -from opencompass.models.base import BaseModel -from opencompass.utils.logging import get_logger -from opencompass.utils.prompt import PromptList - -PromptType = Union[PromptList, str] - - -def valid_str(string, coding='utf-8'): - """decode text according to its encoding type.""" - invalid_chars = [b'\xef\xbf\xbd'] - bstr = bytes(string, coding) - for invalid_char in invalid_chars: - bstr = bstr.replace(invalid_char, b'') - ret = bstr.decode(encoding=coding, errors='ignore') - return ret - - -class LmdeployPytorchModel(BaseModel): - """Model wrapper for lmdeploy pytorch engine through python API. - - Args: - path (str): path of the supported pytorch model. - max_seq_len (int): The maximum allowed sequence length of a model. - Note that the length of prompt + generated tokens shall not exceed - this value. Defaults to 2048. 
- meta_template (Dict, optional): The model's meta prompt - template if needed, in case the requirement of injecting or - wrapping of any meta instructions. - engine_config (Dict, optional): The engine config to set - arguments like session_len, max_batch_size for TurboMind. - gen_config (Dict, optional): Generation config to set - arguments like top_k, top_p, temperature. - end_str (str, optional): Whether to trim generated strings with end_str - if the model has special ending strings that are not handled well. - Defaults to None. - """ - - def __init__(self, - path: str, - concurrency: int = 8, - max_seq_len: int = 2048, - meta_template: Optional[Dict] = None, - engine_config: Optional[Dict] = None, - gen_config: Optional[Dict] = None, - end_str: Optional[str] = None): - super().__init__(path=path, - max_seq_len=max_seq_len, - meta_template=meta_template) - from lmdeploy.pytorch import engine as tm - from lmdeploy.version import version_info - - if engine_config is not None: - from lmdeploy.messages import PytorchEngineConfig - engine_config = PytorchEngineConfig(**engine_config) - # set thread_safe - if hasattr(engine_config, 'thread_safe'): - engine_config.thread_safe = True - - if gen_config is not None: - from lmdeploy.messages import GenerationConfig - gen_config = GenerationConfig(**gen_config) - - self.logger = get_logger() - tm_model = tm.Engine(path, engine_config) - self.tokenizer = tm_model.tokenizer - self.generators = [ - tm_model.create_instance() for i in range(concurrency) - ] - self.generator_ids = [i + 1 for i in range(concurrency)] - - from transformers import GenerationConfig - try: - generation_config = GenerationConfig.from_pretrained(path) - except Exception: - generation_config = None - if generation_config and hasattr(generation_config, 'eos_token_id'): - if gen_config.stop_words is None: - stop_words = [] - if isinstance(generation_config.eos_token_id, int): - stop_words.append(generation_config.eos_token_id) - else: - assert isinstance(generation_config.eos_token_id, list) - for token_id in generation_config.eos_token_id: - stop_words.append(token_id) - gen_config.stop_words = stop_words - if version_info >= (0, 6, 0): - gen_config.stop_token_ids = stop_words - self.gen_config = gen_config - self.end_str = end_str - self.major_version, self.minor_version = version_info[:2] - - def generate( - self, - inputs: List[str], - max_out_len: int = 512, - ) -> List[str]: - """Generate results given a list of inputs. - - Args: - inputs (List[str]): A list of prompts - max_out_len (int): The maximum length of the output. - - Returns: - List[str]: A list of generated strings. - """ - assert isinstance( - inputs, List), f'List(str) is expected, but got {type(inputs)}' - - # split inputs into batches - batch_size = len(self.generators) - batch_inputs = [ - inputs[i:i + batch_size] for i in range(0, len(inputs), batch_size) - ] - - results = [] - for batch_input in batch_inputs: - with ThreadPoolExecutor() as executor: - _results = list( - executor.map( - self._generate, - self.generators[:len(batch_input)], - self.generator_ids[:len(batch_input)], - batch_input, - [self.gen_config] * len(batch_input), - [self.end_str] * len(batch_input), - )) - results += _results - return results - - def get_token_len(self, prompt: str) -> int: - input_ids = self.tokenizer.encode(prompt) - return len(input_ids) - - def wait(self): - """Wait till the next query can be sent. - - Applicable in both single-thread and multi-thread environments. 
- """ - return self.token_bucket.get_token() - - def _generate(self, - generator, - session_id, - prompt: PromptType, - gen_config=None, - end_str: Optional[str] = None) -> str: - """Generate results given a list of inputs. - - Args: - prompt (PromptType): A string or PromptDict. - The PromptDict should be organized in OpenCompass' - API format. - gen_config (GenerationConfig, optional): Generation - config to set arguments like top_k, top_p, temperature. - end_str (str, optional): Whether to trim generated strings - with end_str if the model has special ending strings - that are not handled well. - Defaults to None. - Returns: - str: The generated string. - """ - assert type( - prompt) is str, 'We only support string for TurboMind Python API' - input_ids = self.tokenizer.encode(prompt) - if self.major_version >= 0 and self.minor_version >= 4: - outputs = generator.infer(session_id, - input_ids, - gen_config=gen_config) - output_ids = outputs.token_ids - else: - _, output_ids, _ = generator.infer(session_id, - input_ids, - gen_config=gen_config) - - # stop engine - if hasattr(generator, 'end'): - generator.end(session_id) - # decode output - response_all = self.tokenizer.decode(output_ids) - # trim output - if end_str: - response_all = response_all.split(end_str)[0] - # remove invalid characters - response_all = valid_str(response_all) - return response_all diff --git a/opencompass/models/lmdeploy_tis.py b/opencompass/models/lmdeploy_tis.py deleted file mode 100644 index 9c92ef18a..000000000 --- a/opencompass/models/lmdeploy_tis.py +++ /dev/null @@ -1,200 +0,0 @@ -import threading -from concurrent.futures import ThreadPoolExecutor -from functools import partial -from queue import Queue -from typing import Dict, List, Optional, Union - -import numpy as np - -from opencompass.models.base import BaseModel, LMTemplateParser -from opencompass.utils.logging import get_logger -from opencompass.utils.prompt import PromptList - -PromptType = Union[PromptList, str] - - -def valid_str(string, coding='utf-8'): - """decode text according to its encoding type.""" - invalid_chars = [b'\xef\xbf\xbd'] - bstr = bytes(string, coding) - for invalid_char in invalid_chars: - bstr = bstr.replace(invalid_char, b'') - ret = bstr.decode(encoding=coding, errors='ignore') - return ret - - -def prepare_tensor(name, input_tensor): - """Create grpcclient's InferInput instance according to a given tensor.""" - import tritonclient.grpc as grpcclient - from tritonclient.utils import np_to_triton_dtype - t = grpcclient.InferInput(name, list(input_tensor.shape), - np_to_triton_dtype(input_tensor.dtype)) - t.set_data_from_numpy(input_tensor) - return t - - -def stream_callback(que, result, error): - """callback function invoked by triton client.""" - que.put((result, error)) - - -class LmdeployTisModel(BaseModel): - """Model wrapper for LMDeploy Python Backend Triton Inference Server gRPC - API. - - Args: - path (str): The name of OpenAI's model. - tis_addr (str): The address (ip:port format) of turbomind's - triton inference server - max_seq_len (int): The maximum allowed sequence length of a model. - Note that the length of prompt + generated tokens shall not exceed - this value. Defaults to 2048. - meta_template (Dict, optional): The model's meta prompt - template if needed, in case the requirement of injecting or - wrapping of any meta instructions. 
- """ - - is_api: bool = True - - def __init__(self, - path: str, - tis_addr: str = '0.0.0.0:33337', - max_seq_len: int = 2048, - meta_template: Optional[Dict] = None, - end_str: Optional[str] = None): - super().__init__(path=path, - max_seq_len=max_seq_len, - meta_template=meta_template) - from lmdeploy.tokenizer import Tokenizer - - self.logger = get_logger() - self.template_parser = LMTemplateParser(meta_template) - self.eos_token_id = None - if meta_template and 'eos_token_id' in meta_template: - self.eos_token_id = meta_template['eos_token_id'] - self.tis_addr = tis_addr - self.tokenizer = Tokenizer(path) - self.end_str = end_str - - def generate( - self, - inputs: List[str or PromptList], - max_out_len: int = 512, - temperature: float = 1.0, - ) -> List[str]: - """Generate results given a list of inputs. - - Args: - inputs (List[str or PromptList]): A list of strings or PromptDicts. - The PromptDict should be organized in OpenCompass' - API format. - max_out_len (int): The maximum length of the output. - temperature (float): What sampling temperature to use, - between 0 and 2. Higher values like 0.8 will make the output - more random, while lower values like 0.2 will make it more - focused and deterministic. Defaults to 0.7. - - Returns: - List[str]: A list of generated strings. - """ - - with ThreadPoolExecutor() as executor: - results = list( - executor.map(self._generate, inputs, - [max_out_len] * len(inputs), - [temperature] * len(inputs), - [self.end_str] * len(inputs))) - return results - - def wait(self): - """Wait till the next query can be sent. - - Applicable in both single-thread and multi-thread environments. - """ - return self.token_bucket.get_token() - - def get_token_len(self, prompt: str) -> int: - input_ids = self.tokenizer.encode(prompt) - return len(input_ids) - - def _call_triton_server(self, prompt, tis_addr, session_id, - request_output_len, temperature, res_que): - import tritonclient.grpc as grpcclient - - with grpcclient.InferenceServerClient(tis_addr) as client: - inputs = [ - prepare_tensor('prompt', - np.array([prompt.encode()], dtype=np.object_)), - prepare_tensor('max_tokens', - np.array([request_output_len], dtype=np.int32)), - prepare_tensor('temperature', - np.array([temperature], dtype=np.float_)), - prepare_tensor('top_p', np.array([1.0], dtype=np.float_)), - prepare_tensor('top_k', np.array([1], dtype=np.int32)), - prepare_tensor('ignore_eos', np.array([False], - dtype=np.bool_)), - prepare_tensor('stream', np.array([True], dtype=np.bool_)), - ] - - # async_stream - client.start_stream(partial(stream_callback, res_que)) - client.async_stream_infer('lmdeploy_model', - inputs, - sequence_id=session_id, - sequence_start=True, - sequence_end=True) - - res_que.put(None) - return - - def _process_result(self, que): - text = '' - while True: - res = que.get() - if res is not None: - result, err = res - if err is not None: - print(err) - else: - res = result.as_numpy('response').item().decode() - text += res - else: - return text - - def _generate(self, - prompt: str or PromptList, - max_out_len: int, - temperature: float, - end_str: Optional[str] = None) -> str: - """Generate results given a list of inputs. - - Args: - prompt (str or PromptList): A string or PromptDict. - The PromptDict should be organized in OpenCompass' - API format. - max_out_len (int): The maximum length of the output. - temperature (float): What sampling temperature to use, - between 0 and 2. 
Higher values like 0.8 will make the output - more random, while lower values like 0.2 will make it more - focused and deterministic. - - Returns: - str: The generated string. - """ - assert type( - prompt - ) is str, 'We only support string for LMDeploy Python Backend TIS API' - - res_que = Queue() - - self._call_triton_server(prompt=prompt, - tis_addr=self.tis_addr, - session_id=threading.currentThread().ident, - request_output_len=max_out_len, - temperature=temperature, - res_que=res_que) - text = self._process_result(res_que) - response = valid_str(text) - if end_str: - response = response.split(end_str)[0] - return response diff --git a/opencompass/models/turbomind_tis.py b/opencompass/models/turbomind_tis.py deleted file mode 100644 index 8541b9de5..000000000 --- a/opencompass/models/turbomind_tis.py +++ /dev/null @@ -1,135 +0,0 @@ -import logging -import threading -from concurrent.futures import ThreadPoolExecutor -from typing import Dict, List, Optional, Union - -from opencompass.models.base import BaseModel, LMTemplateParser -from opencompass.utils.logging import get_logger -from opencompass.utils.prompt import PromptList - -PromptType = Union[PromptList, str] - - -def valid_str(string, coding='utf-8'): - """decode text according to its encoding type.""" - invalid_chars = [b'\xef\xbf\xbd'] - bstr = bytes(string, coding) - for invalid_char in invalid_chars: - bstr = bstr.replace(invalid_char, b'') - ret = bstr.decode(encoding=coding, errors='ignore') - return ret - - -class TurboMindTisModel(BaseModel): - """Model wrapper for TurboMind Triton Inference Server gRPC API. - - Args: - path (str): The name of OpenAI's model. - tis_addr (str): The address (ip:port format) of turbomind's - triton inference server - max_seq_len (int): The maximum allowed sequence length of a model. - Note that the length of prompt + generated tokens shall not exceed - this value. Defaults to 2048. - meta_template (Dict, optional): The model's meta prompt - template if needed, in case the requirement of injecting or - wrapping of any meta instructions. - """ - - is_api: bool = True - - def __init__( - self, - path: str, - tis_addr: str = '0.0.0.0:33337', - max_seq_len: int = 2048, - meta_template: Optional[Dict] = None, - ): - super().__init__(path=path, - max_seq_len=max_seq_len, - meta_template=meta_template) - from lmdeploy.serve.turbomind.utils import Preprocessor - self.preprocess = Preprocessor(tis_addr) - self.logger = get_logger() - self.template_parser = LMTemplateParser(meta_template) - self.eos_token_id = None - if meta_template and 'eos_token_id' in meta_template: - self.eos_token_id = meta_template['eos_token_id'] - self.tis_addr = tis_addr - - def generate( - self, - inputs: List[PromptType], - max_out_len: int = 512, - temperature: float = 1.0, - ) -> List[str]: - """Generate results given a list of inputs. - - Args: - inputs (List[PromptType]): A list of strings or PromptDicts. - The PromptDict should be organized in OpenCompass' - API format. - max_out_len (int): The maximum length of the output. - temperature (float): What sampling temperature to use, - between 0 and 2. Higher values like 0.8 will make the output - more random, while lower values like 0.2 will make it more - focused and deterministic. Defaults to 0.7. - - Returns: - List[str]: A list of generated strings. 
- """ - - with ThreadPoolExecutor() as executor: - results = list( - executor.map(self._generate, inputs, - [max_out_len] * len(inputs), - [temperature] * len(inputs))) - return results - - def get_token_len(self, prompt: str) -> int: - input_ids, _ = self.preprocess(prompt) - return input_ids.shape[-1] - - def wait(self): - """Wait till the next query can be sent. - - Applicable in both single-thread and multi-thread environments. - """ - return self.token_bucket.get_token() - - def _generate(self, prompt: PromptType, max_out_len: int, - temperature: float) -> str: - """Generate results given a list of inputs. - - Args: - prompt (PromptType): A string or PromptDict. - The PromptDict should be organized in OpenCompass' - API format. - max_out_len (int): The maximum length of the output. - temperature (float): What sampling temperature to use, - between 0 and 2. Higher values like 0.8 will make the output - more random, while lower values like 0.2 will make it more - focused and deterministic. - - Returns: - str: The generated string. - """ - assert type( - prompt) is str, 'We only support string for TurboMind RPC API' - - from lmdeploy.serve.turbomind.chatbot import Chatbot - chatbot = Chatbot(self.tis_addr, - temperature=temperature, - capability='completion', - top_k=1, - log_level=logging.ERROR) - - for status, text, n_token in chatbot.stream_infer( - session_id=threading.currentThread().ident, - prompt=prompt, - request_output_len=max_out_len, - sequence_start=True, - sequence_end=True): - continue - response = valid_str(text) - response = response.replace('', '') - return response diff --git a/opencompass/models/turbomind_with_tf_above_v4_33.py b/opencompass/models/turbomind_with_tf_above_v4_33.py index 48706671f..ab6801c9c 100644 --- a/opencompass/models/turbomind_with_tf_above_v4_33.py +++ b/opencompass/models/turbomind_with_tf_above_v4_33.py @@ -1,7 +1,6 @@ # flake8: noqa # yapf: disable import copy -from concurrent.futures import ThreadPoolExecutor from typing import Dict, List, Optional, Union from opencompass.models.base import BaseModel @@ -31,38 +30,32 @@ def __init__( self, path: str, tokenizer_only: bool = False, + backend: str = 'turbomind', engine_config: Dict = {}, gen_config: Dict = {}, - concurrency: int = 8, max_seq_len: int = None, meta_template: Optional[Dict] = None, fastchat_template: Optional[str] = None, stop_words: List[str] = [], ): - from lmdeploy.messages import TurbomindEngineConfig - from lmdeploy.turbomind import TurboMind - from lmdeploy.version import version_info - from transformers import AutoTokenizer - self.logger = get_logger() self.path = path self.tokenizer_only = tokenizer_only self.template_parser = _get_meta_template(meta_template) self.max_seq_len = _get_possible_max_seq_len(max_seq_len, path) - self.origin_tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True) + from lmdeploy import version_info + from transformers import AutoTokenizer + self.version_info = version_info + self.tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True) if not tokenizer_only: DEFAULT_ENGING_CONFIG = {'session_len': self.max_seq_len} _engine_config = DEFAULT_ENGING_CONFIG.copy() _engine_config.update(engine_config) - engine_config = TurbomindEngineConfig(**_engine_config) - tm_model = TurboMind.from_pretrained(path, engine_config=engine_config) - self.tokenizer = tm_model.tokenizer - self.generators = [tm_model.create_instance() for i in range(concurrency)] - self.generator_ids = [i + 1 for i in range(concurrency)] - 
self.concurrency = concurrency + self.pipe = self._build_pipe(path, backend, _engine_config) + else: + self.pipe = None self.gen_config = gen_config - self.version_info = version_info self.fastchat_template = fastchat_template self.stop_words = list(set(stop_words + self._get_potential_stop_words(path))) self.logger.info(f'using stop words: {self.stop_words}') @@ -76,23 +69,23 @@ def _get_potential_stop_words(self, path: Optional[str]): generation_config = None if generation_config and hasattr(generation_config, 'eos_token_id'): if isinstance(generation_config.eos_token_id, int): - potential_stop_words.append(self.origin_tokenizer.decode(generation_config.eos_token_id)) + potential_stop_words.append(self.tokenizer.decode(generation_config.eos_token_id)) else: assert isinstance(generation_config.eos_token_id, list) for token_id in generation_config.eos_token_id: - potential_stop_words.append(self.origin_tokenizer.decode(token_id)) - if self.origin_tokenizer.eos_token is not None: - potential_stop_words.append(self.origin_tokenizer.eos_token) + potential_stop_words.append(self.tokenizer.decode(token_id)) + if self.tokenizer.eos_token is not None: + potential_stop_words.append(self.tokenizer.eos_token) potential_stop_words = list(set(potential_stop_words)) potential_stop_words = [s for s in potential_stop_words if s] return potential_stop_words def generate(self, inputs: List[str], - max_out_len: int = 512, + max_out_len: int, stopping_criteria: List[str] = [], do_sample: Optional[bool] = None, - temperature: int = 1, + temperature: float = 1.0, **kwargs) -> List[str]: """Generate results given a list of inputs. @@ -104,93 +97,45 @@ def generate(self, List[str]: A list of generated strings. """ assert isinstance(inputs, List), f'List(str) is expected, but got {type(inputs)}' - messages = _convert_chat_messages(inputs) if self.fastchat_template: messages = _format_with_fast_chat_template(messages, self.fastchat_template) else: - messages = [self.origin_tokenizer.apply_chat_template(m, add_generation_prompt=True, tokenize=False) for m in messages] - - # split messages into batches - batch_messages = [messages[i:i + self.concurrency] for i in range(0, len(messages), self.concurrency)] + messages = [self.tokenizer.apply_chat_template(m, add_generation_prompt=True, tokenize=False) for m in messages] stop_words = list(set(self.stop_words + stopping_criteria)) - encode_stop_words = [] - if stop_words is not None and len(stop_words) > 0: - for words in stop_words: - encode_stop_words += self.tokenizer.encode(words, add_bos=False) DEFAULT_GEN_CONFIG = { 'max_new_tokens': max_out_len, 'min_new_tokens': 1, - 'top_k': 1, - 'stop_words': encode_stop_words, + 'stop_words': stop_words, } gen_config = copy.deepcopy(DEFAULT_GEN_CONFIG) gen_config.update(self.gen_config) if do_sample: - gen_config['top_k'] = 1000 + gen_config['top_k'] = 40 gen_config['temperature'] = temperature + else: + if self.version_info >= (0, 6, 0): + gen_config['do_sample'] = False + else: + gen_config['top_k'] = 1 - from lmdeploy.messages import GenerationConfig + from lmdeploy import GenerationConfig + gen_config = {k: v for k, v in gen_config.items() if hasattr(GenerationConfig, k)} gen_config = GenerationConfig(**gen_config) - if self.version_info >= (0, 6, 0): - gen_config.stop_words = stop_words - gen_config.convert_stop_bad_words_to_ids(self.tokenizer) results = [] - for batch_message in batch_messages: - n = len(batch_message) - with ThreadPoolExecutor() as executor: - _results = list( - executor.map( - self._generate, - 
self.generators[:n], - self.generator_ids[:n], - batch_message, - [gen_config] * n, - )) - results += _results + outputs = self.pipe(messages, gen_config=gen_config, do_preprocess=False) + for output in outputs: + text = self.tokenizer.decode(output.token_ids) + results.append(text) for s in stop_words: results = [r.split(s)[0] for r in results] return results - def _generate(self, - generator, - session_id, - prompt: PromptType, - gen_config=None) -> str: - """Generate results given a list of inputs. - - Args: - prompt (PromptType): A string or PromptDict. - The PromptDict should be organized in OpenCompass' - API format. - gen_config (GenerationConfig, optional): Generation - config to set arguments like top_k, top_p, temperature. - Returns: - str: The generated string. - """ - assert type(prompt) is str, 'We only support string for TurboMind Python API' - - input_ids = self.tokenizer.encode(prompt, add_bos=False) - for outputs in generator.stream_infer(session_id=session_id, - input_ids=[input_ids], - gen_config=gen_config, - sequence_start=True, - sequence_end=True, - step=0, - stream_output=False): - if self.version_info >= (0, 4, 0): - output_ids = outputs.token_ids - else: - _, output_ids, _ = outputs - response = self.tokenizer.decode(output_ids) - response = valid_str(response) - return response - def get_token_len(self, prompt: str) -> int: """Get lengths of the tokenized strings. @@ -201,5 +146,20 @@ def get_token_len(self, prompt: str) -> int: int: Length of the input tokens """ m = _convert_chat_messages([prompt])[0] - t = self.origin_tokenizer.apply_chat_template(m, add_generation_prompt=True, return_dict=True) + t = self.tokenizer.apply_chat_template(m, add_generation_prompt=True, return_dict=True) return len(t['input_ids']) + + def _build_pipe(self, model_path, backend, engine_config): + from lmdeploy import (PytorchEngineConfig, TurbomindEngineConfig, + pipeline) + + assert backend in ['pytorch', 'turbomind'], \ + f'unsupported backend type: {backend}' + + if backend == 'turbomind': + filtered = {k: v for k, v in engine_config.items() if hasattr(TurbomindEngineConfig, k)} + backend_config = TurbomindEngineConfig(**filtered) + else: + filtered = {k: v for k, v in engine_config.items() if hasattr(PytorchEngineConfig, k)} + backend_config = PytorchEngineConfig(**filtered) + return pipeline(model_path, backend_config=backend_config, log_level='INFO', max_log_len=10) diff --git a/opencompass/utils/run.py b/opencompass/utils/run.py index 67c465941..025efc4b3 100644 --- a/opencompass/utils/run.py +++ b/opencompass/utils/run.py @@ -9,7 +9,7 @@ from opencompass.datasets.custom import make_custom_dataset_config from opencompass.models import (VLLM, HuggingFace, HuggingFaceBaseModel, HuggingFaceCausalLM, HuggingFaceChatGLM3, - HuggingFacewithChatTemplate, TurboMindModel, + HuggingFacewithChatTemplate, TurboMindModelwithChatTemplate, VLLMwithChatTemplate) from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner @@ -233,7 +233,7 @@ def change_accelerator(models, accelerator): model_accels = [] for model in models: logger.info(f'Transforming {model["abbr"]} to {accelerator}') - # change HuggingFace model to VLLM or TurboMindModel + # change HuggingFace model to VLLM or LMDeploy if model['type'] in [HuggingFace, HuggingFaceCausalLM, HuggingFaceChatGLM3, f'{HuggingFaceBaseModel.__module__}.{HuggingFaceBaseModel.__name__}']: gen_args = dict() if model.get('generation_kwargs') is not None: @@ -254,10 +254,10 @@ def change_accelerator(models, accelerator): if accelerator 
== 'lmdeploy': logger.info(f'Transforming {model["abbr"]} to {accelerator}') - mod = TurboMindModel + mod = TurboMindModelwithChatTemplate acc_model = dict( type=f'{mod.__module__}.{mod.__name__}', - abbr=model['abbr'].replace('hf', 'turbomind') if '-hf' in model['abbr'] else model['abbr'] + '-turbomind', + abbr=model['abbr'].replace('hf', 'lmdeploy') if '-hf' in model['abbr'] else model['abbr'] + '-lmdeploy', path=model['path'], engine_config=dict(session_len=model['max_seq_len'], max_batch_size=model['batch_size'], @@ -270,7 +270,6 @@ def change_accelerator(models, accelerator): max_out_len=model['max_out_len'], max_seq_len=model['max_seq_len'], batch_size=model['batch_size'], - concurrency=model['batch_size'], run_cfg=model['run_cfg'], ) for item in ['meta_template']: @@ -312,7 +311,7 @@ def change_accelerator(models, accelerator): mod = TurboMindModelwithChatTemplate acc_model = dict( type=f'{mod.__module__}.{mod.__name__}', - abbr=model['abbr'].replace('hf', 'turbomind') if '-hf' in model['abbr'] else model['abbr'] + '-turbomind', + abbr=model['abbr'].replace('hf', 'lmdeploy') if '-hf' in model['abbr'] else model['abbr'] + '-lmdeploy', path=model['path'], engine_config=dict(max_batch_size=model.get('batch_size', 16), tp=model['run_cfg']['num_gpus']), gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9),
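The `change_accelerator` branch above backs the `-a lmdeploy` command-line flag (the CI workflow earlier in this series invokes it as `opencompass --models hf_internlm2_5_7b_chat --datasets race_gen -a lmdeploy ...`). As a rough sketch — the abbr and path below are hypothetical placeholders, and only the fields visible in this diff are listed — the converted entry for a `HuggingFacewithChatTemplate` model takes this shape:

```python
# Sketch of the config dict produced for accelerator == 'lmdeploy'.
# Values are illustrative placeholders, not actual output of change_accelerator.
from opencompass.models import TurboMindModelwithChatTemplate

mod = TurboMindModelwithChatTemplate
lmdeploy_model = dict(
    type=f'{mod.__module__}.{mod.__name__}',
    # 'hf' in the original abbr is replaced with 'lmdeploy'
    abbr='internlm2_5-7b-chat-lmdeploy',
    path='internlm/internlm2_5-7b-chat',
    engine_config=dict(max_batch_size=16, tp=1),
    gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9),
)
```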