diff --git a/.github/scripts/eval_regression_base.py b/.github/scripts/eval_regression_base.py index 8b4c64468..12339ecfa 100644 --- a/.github/scripts/eval_regression_base.py +++ b/.github/scripts/eval_regression_base.py @@ -8,15 +8,17 @@ race_datasets # noqa: F401, E501 from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \ models as hf_deepseek_moe_16b_base_model # noqa: F401, E501 + from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \ + models as hf_deepseek_v2_lite_model # noqa: F401, E501 # read hf models - chat models from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \ models as lmdeploy_deepseek_7b_base_model # noqa: F401, E501 from opencompass.configs.models.deepseek.vllm_deepseek_moe_16b_base import \ models as vllm_deepseek_moe_16b_base_model # noqa: F401, E501 - from opencompass.configs.models.gemma.hf_gemma_2b import \ - models as hf_gemma_2b_model # noqa: F401, E501 - from opencompass.configs.models.gemma.hf_gemma_7b import \ - models as hf_gemma_7b_model # noqa: F401, E501 + from opencompass.configs.models.gemma.hf_gemma2_2b import \ + models as hf_gemma2_2b_model # noqa: F401, E501 + from opencompass.configs.models.gemma.hf_gemma2_9b import \ + models as hf_gemma2_9b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \ models as hf_internlm2_5_7b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \ @@ -31,16 +33,28 @@ models as lmdeploy_internlm2_7b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \ models as lmdeploy_internlm2_base_7b_model # noqa: F401, E501 + from opencompass.configs.models.hf_llama.hf_llama2_7b import \ + models as hf_llama2_7b_model # noqa: F401, E501 + from opencompass.configs.models.hf_llama.hf_llama3_8b import \ + models as hf_llama3_8b_model # noqa: F401, E501 + from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \ + models as lmdeploy_llama3_1_8b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \ models as lmdeploy_llama3_8b_model # noqa: F401, E501 - from opencompass.configs.models.mistral.hf_mistral_7b_v0_2 import \ - models as hf_mistral_7b_v0_2_model # noqa: F401, E501 + from opencompass.configs.models.mistral.hf_mistral_7b_v0_3 import \ + models as hf_mistral_7b_v0_3_model # noqa: F401, E501 from opencompass.configs.models.mistral.vllm_mistral_7b_v0_2 import \ models as vllm_mistral_7b_v0_2_model # noqa: F401, E501 + from opencompass.configs.models.mistral.vllm_mixtral_8x7b_v0_1 import \ + models as vllm_mixtral_8x7b_v0_1_model # noqa: F401, E501 from opencompass.configs.models.qwen.hf_qwen1_5_moe_a2_7b import \ models as hf_qwen1_5_moe_a2_7b_model # noqa: F401, E501 from opencompass.configs.models.qwen.hf_qwen2_0_5b import \ models as hf_qwen2_0_5b_model # noqa: F401, E501 + from opencompass.configs.models.qwen.hf_qwen2_1_5b import \ + models as hf_qwen2_1_5b_model # noqa: F401, E501 + from opencompass.configs.models.qwen.hf_qwen2_7b import \ + models as hf_qwen2_7b_model # noqa: F401, E501 from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b import \ models as lmdeploy_qwen2_1_5b_model # noqa: F401, E501 from opencompass.configs.models.qwen.lmdeploy_qwen2_7b import \ diff --git a/.github/scripts/eval_regression_chat.py b/.github/scripts/eval_regression_chat.py index 1ee28e630..fa28562f4 100644 --- a/.github/scripts/eval_regression_chat.py +++ b/.github/scripts/eval_regression_chat.py 
@@ -13,20 +13,32 @@ models as hf_baichuan2_7b_chat_model # noqa: F401, E501 from opencompass.configs.models.chatglm.hf_glm4_9b_chat import \ models as hf_glm4_9b_chat_model # noqa: F401, E501 + from opencompass.configs.models.chatglm.lmdeploy_glm4_9b_chat import \ + models as lmdeploy_glm4_9b_chat_model # noqa: F401, E501 + from opencompass.configs.models.chatglm.vllm_glm4_9b_chat import \ + models as vllm_glm4_9b_chat_model # noqa: F401, E501 from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import \ models as hf_deepseek_7b_chat_model # noqa: F401, E501 from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_chat import \ models as hf_deepseek_moe_16b_chat_model # noqa: F401, E501 + from opencompass.configs.models.deepseek.hf_deepseek_v2_lite_chat import \ + models as hf_deepseek_v2_lite_chat_model # noqa: F401, E501 from opencompass.configs.models.deepseek.vllm_deepseek_7b_chat import \ models as vllm_deepseek_7b_chat_model # noqa: F401, E501 - from opencompass.configs.models.gemma.hf_gemma_2b_it import \ - models as hf_gemma_2b_it_model # noqa: F401, E501 - from opencompass.configs.models.gemma.hf_gemma_7b_it import \ - models as hf_gemma_7b_it_model # noqa: F401, E501 + from opencompass.configs.models.gemma.hf_gemma2_2b_it import \ + models as hf_gemma2_2b_it_model # noqa: F401, E501 + from opencompass.configs.models.gemma.hf_gemma2_9b_it import \ + models as hf_gemma2_9b_it_model # noqa: F401, E501 + from opencompass.configs.models.gemma.vllm_gemma_7b_it import \ + models as vllm_gemma_7b_it_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \ models as hf_internlm2_5_7b_chat_model # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.hf_internlm2_5_20b_chat import \ + models as hf_internlm2_5_20b_chat_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \ models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import \ + models as lmdeploy_internlm2_5_20b_chat_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import \ models as lmdeploy_internlm2_chat_1_8b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b_sft import \ @@ -37,14 +49,20 @@ models as lmdeploy_internlm2_chat_7b_sft_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.vllm_internlm2_chat_7b import \ models as vllm_internlm2_chat_7b_model # noqa: F401, E501 + from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import \ + models as hf_llama3_1_8b_instruct_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \ models as hf_llama3_8b_instruct_model # noqa: F401, E501 + from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \ + models as lmdeploy_llama3_1_8b_instruct_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \ models as lmdeploy_llama3_8b_instruct_model # noqa: F401, E501 - from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \ - models as hf_mistral_7b_instruct_v0_2_model # noqa: F401, E501 + from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_3 import \ + models as hf_mistral_7b_instruct_v0_3_model # noqa: F401, E501 from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \ models as 
vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501 + from opencompass.configs.models.mistral.vllm_mixtral_8x7b_instruct_v0_1 import \ + models as vllm_mixtral_8x7b_instruct_v0_1_model # noqa: F401, E501 from opencompass.configs.models.openbmb.hf_minicpm_2b_dpo_fp32 import \ models as hf_minicpm_2b_dpo_fp32_model # noqa: F401, E501 from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import \ @@ -57,6 +75,10 @@ models as hf_phi_3_mini_8k_instruct_model # noqa: F401, E501 from opencompass.configs.models.qwen.hf_qwen1_5_0_5b_chat import \ models as hf_qwen1_5_0_5b_chat_model # noqa: F401, E501 + from opencompass.configs.models.qwen.hf_qwen2_1_5b_instruct import \ + models as hf_qwen2_1_5b_instruct_model # noqa: F401, E501 + from opencompass.configs.models.qwen.hf_qwen2_7b_instruct import \ + models as hf_qwen2_7b_instruct_model # noqa: F401, E501 from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import \ models as lmdeploy_qwen2_1_5b_instruct_model # noqa: F401, E501 from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \ diff --git a/.github/scripts/oc_score_assert.py b/.github/scripts/oc_score_assert.py index f869b157b..c01ef6864 100644 --- a/.github/scripts/oc_score_assert.py +++ b/.github/scripts/oc_score_assert.py @@ -8,29 +8,33 @@ chat_model_list = [ 'baichuan2-7b-chat-hf', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf', - 'deepseek-7b-chat-vllm', 'gemma-2b-it-hf', 'gemma-7b-it-hf', - 'internlm2_5-7b-chat-hf', 'internlm2_5-7b-chat-turbomind', - 'internlm2-chat-1.8b-turbomind', 'internlm2-chat-1.8b-sft-turbomind', - 'internlm2-chat-7b-turbomind', 'internlm2-chat-7b-sft-turbomind', - 'internlm2-chat-7b-vllm', 'llama-3-8b-instruct-hf', - 'llama-3-8b-instruct-turbomind', 'mistral-7b-instruct-v0.2-hf', - 'mistral-7b-instruct-v0.2-vllm', 'minicpm-2b-dpo-fp32-hf', - 'minicpm-2b-sft-bf16-hf', 'minicpm-2b-sft-fp32-hf', - 'phi-3-mini-4k-instruct-hf', 'qwen1.5-0.5b-chat-hf', + 'deepseek-v2-lite-chat-hf', 'deepseek-7b-chat-vllm', 'gemma2-2b-it-hf', + 'gemma2-9b-it-hf', 'gemma-7b-it-vllm', 'internlm2_5-7b-chat-hf', + 'internlm2_5-20b-chat-hf', 'internlm2_5-7b-chat-turbomind', + 'internlm2_5-20b-chat-turbomind', 'internlm2-chat-1.8b-turbomind', + 'internlm2-chat-1.8b-sft-turbomind', 'internlm2-chat-7b-turbomind', + 'internlm2-chat-7b-sft-turbomind', 'internlm2-chat-7b-vllm', + 'llama-3_1-8b-instruct-hf', 'llama-3-8b-instruct-hf', + 'llama-3_1-8b-instruct-turbomind', 'llama-3-8b-instruct-turbomind', + 'mistral-7b-instruct-v0.3-hf', 'mistral-7b-instruct-v0.2-vllm', + 'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf', + 'minicpm-2b-sft-fp32-hf', 'phi-3-mini-4k-instruct-hf', + 'qwen1.5-0.5b-chat-hf', 'qwen2-1.5b-instruct-hf', 'qwen2-7b-instruct-hf', 'qwen2-1.5b-instruct-turbomind', 'qwen2-7b-instruct-turbomind', 'qwen1.5-0.5b-chat-vllm', 'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf', 'lmdeploy-api-test' ] base_model_list = [ - 'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind', - 'deepseek-moe-16b-base-vllm', 'gemma-2b-hf', 'gemma-7b-hf', - 'internlm2_5-7b-hf', 'internlm2-7b-hf', 'internlm2-base-7b-hf', - 'internlm2_5-7b-turbomind', 'internlm2-1.8b-turbomind', - 'internlm2-7b-turbomind', 'internlm2-base-7b-hf', - 'internlm2-base-7b-turbomind', 'llama-3-8b-turbomind', - 'mistral-7b-v0.2-hf', 'mistral-7b-v0.2-vllm', 'qwen1.5-moe-a2.7b-hf', - 'qwen2-0.5b-hf', 'qwen2-1.5b-turbomind', 'qwen2-7b-turbomind', - 'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf' + 'deepseek-moe-16b-base-hf', 'deepseek-v2-lite-hf', + 'deepseek-7b-base-turbomind', 
'deepseek-moe-16b-base-vllm', 'gemma2-2b-hf', + 'gemma2-9b-hf', 'internlm2_5-7b-hf', 'internlm2-7b-hf', + 'internlm2-base-7b-hf', 'internlm2-1.8b-turbomind', + 'internlm2_5-7b-turbomind', 'internlm2-7b-turbomind', + 'internlm2-base-7b-turbomind', 'llama-2-7b-hf', 'llama-3-8b-hf', + 'llama-3.1-8b-turbomind', 'llama-3-8b-turbomind', 'mistral-7b-v0.3-hf', + 'mistral-7b-v0.2-vllm', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf', + 'qwen2-1.5b-hf', 'qwen2-7b-hf', 'qwen2-1.5b-turbomind', + 'qwen2-7b-turbomind', 'qwen1.5-0.5b-vllm', 'yi-1.5-6b-hf', 'yi-1.5-9b-hf' ] dataset_list = ['gsm8k', 'race-middle', 'race-high'] diff --git a/.github/scripts/oc_score_baseline.yaml b/.github/scripts/oc_score_baseline.yaml index d7e765be2..809dfea45 100644 --- a/.github/scripts/oc_score_baseline.yaml +++ b/.github/scripts/oc_score_baseline.yaml @@ -8,6 +8,16 @@ glm-4-9b-chat-hf: race-middle: 88 race-high: 88 +glm-4-9b-chat-turbomind: + gsm8k: 69 + race-middle: 82 + race-high: 77 + +glm-4-9b-chat-vllm: + gsm8k: 73 + race-middle: 87 + race-high: 87 + deepseek-7b-chat-hf: gsm8k: 60 race-middle: 74 @@ -18,6 +28,11 @@ deepseek-moe-16b-chat-hf: race-middle: 62 race-high: 70 +deepseek-v2-lite-chat-hf: + gsm8k: 59 + race-middle: 82 + race-high: 79 + deepseek-7b-chat-vllm: gsm8k: 63 race-middle: 74 @@ -33,23 +48,48 @@ gemma-7b-it-hf: race-middle: 74 race-high: 71 +gemma-7b-it-vllm: + gsm8k: 38 + race-middle: 75 + race-high: 70 + +gemma2-2b-it-hf: + gsm8k: 62 + race-middle: 75 + race-high: 67 + +gemma2-9b-it-hf: + gsm8k: 80 + race-middle: 89 + race-high: 85 + internlm2_5-7b-chat-hf: gsm8k: 86 race-middle: 92 race-high: 93 +internlm2_5-20b-chat-hf: + gsm8k: 91 + race-middle: 95 + race-high: 91 + internlm2_5-7b-chat-turbomind: gsm8k: 87 race-middle: 92 race-high: 93 +internlm2_5-20b-chat-turbomind: + gsm8k: 91 + race-middle: 95 + race-high: 91 + internlm2-chat-1.8b-turbomind: gsm8k: 40 race-middle: 82 race-high: 83 internlm2-chat-1.8b-sft-turbomind: - gsm8k: 32 + gsm8k: 34 race-middle: 81 race-high: 83 @@ -68,11 +108,21 @@ internlm2-chat-7b-vllm: race-middle: 90 race-high: 91 +llama-3_1-8b-instruct-hf: + gsm8k: 82 + race-middle: 82 + race-high: 88 + llama-3-8b-instruct-hf: gsm8k: 77 race-middle: 85 race-high: 87 +llama-3_1-8b-instruct-turbomind: + gsm8k: 79 + race-middle: 82 + race-high: 88 + llama-3-8b-instruct-turbomind: gsm8k: 77 race-middle: 85 @@ -83,6 +133,11 @@ mistral-7b-instruct-v0.2-hf: race-middle: 82 race-high: 78 +mistral-7b-instruct-v0.3-hf: + gsm8k: 53 + race-middle: 80 + race-high: 78 + mistral-7b-instruct-v0.2-vllm: gsm8k: 49 race-middle: 81 @@ -118,6 +173,11 @@ qwen1.5-0.5b-chat-hf: race-middle: 55 race-high: 50 +qwen2-1.5b-instruct-hf: + gsm8k: 63 + race-middle: 77 + race-high: 86 + qwen2-1.5b-instruct-turbomind: gsm8k: 60 race-middle: 77 @@ -128,6 +188,11 @@ qwen2-7b-instruct-turbomind: race-middle: 87 race-high: 89 +qwen2-7b-instruct-hf: + gsm8k: 85 + race-middle: 87 + race-high: 91 + qwen1.5-0.5b-chat-vllm: gsm8k: 5 race-middle: 57 @@ -153,6 +218,11 @@ deepseek-moe-16b-base-hf: race-middle: 35 race-high: 23 +deepseek-v2-lite-hf: + gsm8k: 37 + race-middle: 56 + race-high: 62 + deepseek-7b-base-turbomind: gsm8k: 21 race-middle: 42 @@ -173,8 +243,18 @@ gemma-7b-hf: race-middle: 59 race-high: 66 +gemma2-2b-hf: + gsm8k: 33 + race-middle: 56 + race-high: 58 + +gemma2-9b-hf: + gsm8k: 70 + race-middle: 82 + race-high: 84 + internlm2_5-7b-hf: - gsm8k: 46 + gsm8k: 47 race-middle: 92 race-high: 91 @@ -208,6 +288,21 @@ internlm2-base-7b-turbomind: race-middle: 75 race-high: 81 +llama-2-7b-hf: + gsm8k: 17 + race-middle: 
32 + race-high: 38 + +llama-3-8b-hf: + gsm8k: 48 + race-middle: 64 + race-high: 70 + +llama-3.1-8b-turbomind: + gsm8k: 57 + race-middle: 67 + race-high: 75 + llama-3-8b-turbomind: gsm8k: 52 race-middle: 63 @@ -218,6 +313,11 @@ mistral-7b-v0.2-hf: race-middle: 42 race-high: 60 +mistral-7b-v0.3-hf: + gsm8k: 43 + race-middle: 42 + race-high: 60 + mistral-7b-v0.2-vllm: gsm8k: 45 race-middle: 42 @@ -228,11 +328,21 @@ qwen1.5-moe-a2.7b-hf: race-middle: 78 race-high: 90 +qwen2-1.5b-hf: + gsm8k: 58 + race-middle: 65 + race-high: 78 + qwen2-0.5b-hf: gsm8k: 35 race-middle: 52 race-high: 48 +qwen2-7b-hf: + gsm8k: 82 + race-middle: 88 + race-high: 89 + qwen2-1.5b-turbomind: gsm8k: 57 race-middle: 64 diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml index 7d7affafc..42ada2f08 100644 --- a/.github/workflows/daily-run-test.yml +++ b/.github/workflows/daily-run-test.yml @@ -14,9 +14,14 @@ env: PIP_CACHE_PATH: /cpfs01/user/qa-llm-cicd/.cache/pip USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub + HUGGINGFACE_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub + HF_HUB_CACHE: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub DATEASET_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/llm-evaluation-datasets HF_DATASETS_OFFLINE: 1 + HF_EVALUATE_OFFLINE: 1 TRANSFORMERS_OFFLINE: 1 + VLLM_USE_MODELSCOPE: false + LMDEPLOY_USE_MODELSCOPE: false HF_HUB_OFFLINE: 1 TRITON_PTXAS_PATH: /usr/local/cuda/bin/ptxas @@ -43,7 +48,11 @@ jobs: daily_run_test: needs: build-pypi - runs-on: self-hosted + strategy: + fail-fast: false + matrix: + cuda_env: [dsw_cu11, dsw_cu12] + runs-on: ${{ matrix.cuda_env }} environment: 'prod' timeout-minutes: 420 #7hours steps: @@ -53,22 +62,38 @@ jobs: uses: actions/download-artifact@v4 with: name: my-artifact-${{ github.run_id }} - - name: Prepare - create conda env and install torch + - name: Prepare - create conda env and install torch - cu11 + if: ${{matrix.cuda_env == 'dsw_cu11'}} run: | . 
/cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda create -y --name ${{env.CONDA_ENV}} python=3.10 - conda activate ${{env.CONDA_ENV}} - pip install opencompass*.whl - pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.5.0+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} - pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.5.5+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} - - pip install human_eval transformers protobuf pytest gguf msgspec librosa vllm_flash_attn bitsandbytes --cache-dir ${{env.PIP_CACHE_PATH}} + conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10 + conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} + pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}} + pip install /cpfs01/user/qa-llm-cicd/packages/lmdeploy-0.6.0+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} + pip install /cpfs01/user/qa-llm-cicd/packages/vllm-0.6.1.post1+cu118-cp310-cp310-manylinux1_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} + pip install human_eval transformers protobuf pytest gguf msgspec librosa vllm_flash_attn bitsandbytes modelscope --cache-dir ${{env.PIP_CACHE_PATH}} pip uninstall torch torchvision torchaudio -y pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118 FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2+cu118-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} conda info --envs pip list + - name: Prepare - create conda env and install torch - cu12 + if: ${{matrix.cuda_env == 'dsw_cu12'}} + run: | + . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate + conda create -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} python=3.10 + conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} + pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}} + pip install lmdeploy==0.6.0 --cache-dir ${{env.PIP_CACHE_PATH}} --no-cache-dir + pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}} + pip install human_eval transformers protobuf pytest gguf msgspec librosa vllm_flash_attn bitsandbytes modelscope --cache-dir ${{env.PIP_CACHE_PATH}} + pip uninstall torch torchvision torchaudio -y + pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --cache-dir ${{env.PIP_CACHE_PATH}} + FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /cpfs01/user/qa-llm-cicd/packages/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl + pip install /cpfs01/user/qa-llm-cicd/packages/xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl --cache-dir ${{env.PIP_CACHE_PATH}} + conda info --envs + pip list - name: Prepare - prepare data and hf model run: | ln -s ${{env.DATEASET_CACHE_PATH}} data @@ -77,45 +102,45 @@ jobs: - name: Run chat model test run: | . 
/cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda activate ${{env.CONDA_ENV}} + conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} conda info --envs sed -i 's/judgemodel/'$(tail -n 1 /cpfs01/shared/public/llmeval/share_info/compassjuder_ip.txt)'/g' .github/scripts/eval_regression_chat.py - python3 run.py .github/scripts/eval_regression_chat.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat --reuse - rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat/*/summary regression_result_daily + opencompass .github/scripts/eval_regression_chat.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat_${{ matrix.cuda_env }} --reuse --max-num-workers 2 + rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/chat_${{ matrix.cuda_env }}/*/summary regression_result_daily python -m pytest -m chat -s -v --color=yes .github/scripts/oc_score_assert.py - name: Run base model test run: | . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda activate ${{env.CONDA_ENV}} + conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} conda info --envs - python3 run.py .github/scripts/eval_regression_base.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base --reuse - rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base/*/summary regression_result_daily + opencompass .github/scripts/eval_regression_base.py --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base_${{ matrix.cuda_env }} --reuse --max-num-workers 2 + rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/base_${{ matrix.cuda_env }}/*/summary regression_result_daily python -m pytest -m base -s -v --color=yes .github/scripts/oc_score_assert.py - name: Run command testcase run: | . 
/cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda activate ${{env.CONDA_ENV}} + conda activate ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} conda info --envs export from_tf=TRUE python tools/list_configs.py internlm2_5 mmlu - python run.py --models hf_internlm2_5_7b --datasets race_ppl --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1 --reuse - rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1/*/summary regression_result_daily + opencompass --models hf_internlm2_5_7b --datasets race_ppl --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }} --reuse + rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd1_${{ matrix.cuda_env }}/*/summary regression_result_daily python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py - python run.py --models hf_internlm2_5_7b_chat --datasets race_gen -a lmdeploy --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2 --reuse - rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2/*/summary regression_result_daily + opencompass --models hf_internlm2_5_7b_chat --datasets race_gen -a lmdeploy --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }} --reuse + rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd2_${{ matrix.cuda_env }}/*/summary regression_result_daily python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py - python run.py --datasets race_ppl --hf-type base --hf-path internlm/internlm2_5-7b --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3 --reuse - rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3/*/summary regression_result_daily + opencompass --datasets race_ppl --hf-type base --hf-path internlm/internlm2_5-7b --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }} --reuse + rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd3_${{ matrix.cuda_env }}/*/summary regression_result_daily python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py - python run.py --datasets race_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4 --reuse - rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4/*/summary regression_result_daily + opencompass --datasets race_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }} --reuse + rm regression_result_daily -f && ln -s /cpfs01/user/qa-llm-cicd/report/${{ github.run_id }}/cmd4_${{ matrix.cuda_env }}/*/summary regression_result_daily python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py - name: Remove Conda Env if: always() run: | rm -rf regression_result_daily . 
/cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate - conda env remove -y --name ${{env.CONDA_ENV}} + conda env remove -y --name ${{env.CONDA_ENV}}_${{ matrix.cuda_env }} conda info --envs notify_to_feishu: diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index ae9a9bd2f..bc6d36a7e 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -17,7 +17,7 @@ jobs: python-version: '3.10' - name: Install pre-commit hook run: | - pip install pre-commit mmengine + pip install pre-commit==3.8.0 mmengine pre-commit install - name: Linting run: pre-commit run --all-files diff --git a/.github/workflows/pr-run-test.yml b/.github/workflows/pr-run-test.yml index 6cab13786..d9fcdc3ae 100644 --- a/.github/workflows/pr-run-test.yml +++ b/.github/workflows/pr-run-test.yml @@ -51,7 +51,7 @@ jobs: conda activate ${{env.CONDA_ENV}} conda info --envs rm -rf regression_result - python3 run.py --models hf_internlm2_chat_7b --datasets siqa_gen --work-dir regression_result --debug + opencompass --models hf_internlm2_chat_7b --datasets siqa_gen --work-dir regression_result --debug - name: Get result run: | score=$(sed -n '$p' regression_result/*/summary/*.csv | awk -F ',' '{print $NF}') diff --git a/README.md b/README.md index ffcca3fd4..2da411958 100644 --- a/README.md +++ b/README.md @@ -34,17 +34,6 @@ English | [简体中文](README_zh-CN.md) > > **Star Us**, You will receive all release notifications from GitHub without any delay ~ ⭐️ -## 📣 OpenCompass 2.0 - -We are thrilled to introduce OpenCompass 2.0, an advanced suite featuring three key components: [CompassKit](https://github.com/open-compass), [CompassHub](https://hub.opencompass.org.cn/home), and [CompassRank](https://rank.opencompass.org.cn/home). -![oc20](https://github.com/tonysy/opencompass/assets/7881589/90dbe1c0-c323-470a-991e-2b37ab5350b2) - -**CompassRank** has been significantly enhanced into the leaderboards that now incorporates both open-source benchmarks and proprietary benchmarks. This upgrade allows for a more comprehensive evaluation of models across the industry. - -**CompassHub** presents a pioneering benchmark browser interface, designed to simplify and expedite the exploration and utilization of an extensive array of benchmarks for researchers and practitioners alike. To enhance the visibility of your own benchmark within the community, we warmly invite you to contribute it to CompassHub. You may initiate the submission process by clicking [here](https://hub.opencompass.org.cn/dataset-submit). - -**CompassKit** is a powerful collection of evaluation toolkits specifically tailored for Large Language Models and Large Vision-language Models. It provides an extensive set of tools to assess and measure the performance of these complex models effectively. Welcome to try our toolkits for in your research and products. -
Star History @@ -70,6 +59,8 @@ Just like a compass guides us on our journey, OpenCompass will guide you through ## 🚀 What's New +- **\[2024.09.19\]** We now support [Qwen2.5](https://huggingface.co/Qwen) (0.5B to 72B) with multiple backends (huggingface/vllm/lmdeploy). Feel free to give them a try! 🔥🔥🔥 +- **\[2024.09.17\]** We now support OpenAI o1 (`o1-mini-2024-09-12` and `o1-preview-2024-09-12`). Feel free to give them a try! 🔥🔥🔥 - **\[2024.09.05\]** We now support answer extraction through model post-processing to provide a more accurate representation of the model's capabilities. As part of this update, we have integrated [XFinder](https://github.com/IAAR-Shanghai/xFinder) as our first post-processing model. For more detailed information, please refer to the [documentation](opencompass/utils/postprocessors/xfinder/README.md), and give it a try! 🔥🔥🔥 - **\[2024.08.20\]** OpenCompass now supports the [SciCode](https://github.com/scicode-bench/SciCode): A Research Coding Benchmark Curated by Scientists. 🔥🔥🔥 - **\[2024.08.16\]** OpenCompass now supports the brand new long-context language model evaluation benchmark — [RULER](https://arxiv.org/pdf/2404.06654). RULER provides an evaluation of long-context including retrieval, multi-hop tracing, aggregation, and question answering through flexible configurations. Check out the [RULER](configs/datasets/ruler/README.md) evaluation config now! 🔥🔥🔥 @@ -79,33 +70,9 @@ Just like a compass guides us on our journey, OpenCompass will guide you through - **\[2024.07.17\]** We are excited to announce the release of NeedleBench's [technical report](http://arxiv.org/abs/2407.11963). We invite you to visit our [support documentation](https://opencompass.readthedocs.io/en/latest/advanced_guides/needleinahaystack_eval.html) for detailed evaluation guidelines. 🔥🔥🔥 - **\[2024.07.04\]** OpenCompass now supports InternLM2.5, which has **outstanding reasoning capability**, **1M Context window and** and **stronger tool use**, you can try the models in [OpenCompass Config](https://github.com/open-compass/opencompass/tree/main/configs/models/hf_internlm) and [InternLM](https://github.com/InternLM/InternLM) .🔥🔥🔥. - **\[2024.06.20\]** OpenCompass now supports one-click switching between inference acceleration backends, enhancing the efficiency of the evaluation process. In addition to the default HuggingFace inference backend, it now also supports popular backends [LMDeploy](https://github.com/InternLM/lmdeploy) and [vLLM](https://github.com/vllm-project/vllm). This feature is available via a simple command-line switch and through deployment APIs. For detailed usage, see the [documentation](docs/en/advanced_guides/accelerator_intro.md).🔥🔥🔥. -- **\[2024.05.08\]** We supported the evaluation of 4 MoE models: [Mixtral-8x22B-v0.1](configs/models/mixtral/hf_mixtral_8x22b_v0_1.py), [Mixtral-8x22B-Instruct-v0.1](configs/models/mixtral/hf_mixtral_8x22b_instruct_v0_1.py), [Qwen1.5-MoE-A2.7B](configs/models/qwen/hf_qwen1_5_moe_a2_7b.py), [Qwen1.5-MoE-A2.7B-Chat](configs/models/qwen/hf_qwen1_5_moe_a2_7b_chat.py). Try them out now! -- **\[2024.04.30\]** We supported evaluating a model's compression efficiency by calculating its Bits per Character (BPC) metric on an [external corpora](configs/datasets/llm_compression/README.md) ([official paper](https://github.com/hkust-nlp/llm-compression-intelligence)). Check out the [llm-compression](configs/eval_llm_compression.py) evaluation config now! 
🔥🔥🔥 -- **\[2024.04.29\]** We report the performance of several famous LLMs on the common benchmarks, welcome to [documentation](https://opencompass.readthedocs.io/en/latest/user_guides/corebench.html) for more information! 🔥🔥🔥. -- **\[2024.04.26\]** We deprecated the multi-madality evaluating function from OpenCompass, related implement has moved to [VLMEvalKit](https://github.com/open-compass/VLMEvalKit), welcome to use! 🔥🔥🔥. -- **\[2024.04.26\]** We supported the evaluation of [ArenaHard](configs/eval_subjective_arena_hard.py) welcome to try!🔥🔥🔥. -- **\[2024.04.22\]** We supported the evaluation of [LLaMA3](configs/models/hf_llama/hf_llama3_8b.py) 和 [LLaMA3-Instruct](configs/models/hf_llama/hf_llama3_8b_instruct.py), welcome to try! 🔥🔥🔥 -- **\[2024.02.29\]** We supported the MT-Bench, AlpacalEval and AlignBench, more information can be found [here](https://opencompass.readthedocs.io/en/latest/advanced_guides/subjective_evaluation.html) -- **\[2024.01.30\]** We release OpenCompass 2.0. Click [CompassKit](https://github.com/open-compass), [CompassHub](https://hub.opencompass.org.cn/home), and [CompassRank](https://rank.opencompass.org.cn/home) for more information ! > [More](docs/en/notes/news.md) -## ✨ Introduction - -![image](https://github.com/open-compass/opencompass/assets/22607038/f45fe125-4aed-4f8c-8fe8-df4efb41a8ea) - -OpenCompass is a one-stop platform for large model evaluation, aiming to provide a fair, open, and reproducible benchmark for large model evaluation. Its main features include: - -- **Comprehensive support for models and datasets**: Pre-support for 20+ HuggingFace and API models, a model evaluation scheme of 70+ datasets with about 400,000 questions, comprehensively evaluating the capabilities of the models in five dimensions. - -- **Efficient distributed evaluation**: One line command to implement task division and distributed evaluation, completing the full evaluation of billion-scale models in just a few hours. - -- **Diversified evaluation paradigms**: Support for zero-shot, few-shot, and chain-of-thought evaluations, combined with standard or dialogue-type prompt templates, to easily stimulate the maximum performance of various models. - -- **Modular design with high extensibility**: Want to add new models or datasets, customize an advanced task division strategy, or even support a new cluster management system? Everything about OpenCompass can be easily expanded! - -- **Experiment management and reporting mechanism**: Use config files to fully record each experiment, and support real-time reporting of results. - ## 📊 Leaderboard We provide [OpenCompass Leaderboard](https://rank.opencompass.org.cn/home) for the community to rank all public models and API models. If you would like to join the evaluation, please provide the model repository URL or a standard API interface to the email address `opencompass@pjlab.org.cn`. @@ -226,6 +193,8 @@ After ensuring that OpenCompass is installed correctly according to the above st # Python scripts opencompass ./configs/eval_api_demo.py + + # You can use o1_mini_2024_09_12/o1_preview_2024_09_12 for o1 models, we set max_completion_tokens=8192 as default. ``` - Accelerated Evaluation @@ -257,6 +226,16 @@ After ensuring that OpenCompass is installed correctly according to the above st opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat ``` + If you want to use multiple GPUs to evaluate the model in data parallel, you can use `--max-num-worker`. 
+ + ```bash + CUDA_VISIBLE_DEVICES=0,1 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat --max-num-worker 2 + ``` + +> \[!TIP\] +> +> `--hf-num-gpus` is used for model parallelism (HuggingFace format), while `--max-num-worker` is used for data parallelism; a combined sketch is shown below. + > \[!TIP\] > > configuration with `_ppl` is designed for base model typically. @@ -266,6 +245,33 @@ Through the command line or configuration files, OpenCompass also supports evalu
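The sketch below combines the two flags from the tip above: each worker shards the model across two GPUs while two workers split the dataset between them. The dataset, model path, and flag spellings (`--hf-num-gpus`, `--max-num-worker`) are reused from the examples above; the four visible GPUs are an assumption, so adjust the values to your machine.

```bash
# Hedged sketch: model parallelism + data parallelism on an assumed 4-GPU node.
# Two data-parallel workers (--max-num-worker 2), each holding one model replica
# sharded over two GPUs (--hf-num-gpus 2).
CUDA_VISIBLE_DEVICES=0,1,2,3 opencompass \
    --datasets demo_gsm8k_chat_gen \
    --hf-type chat \
    --hf-path internlm/internlm2_5-1_8b-chat \
    --hf-num-gpus 2 \
    --max-num-worker 2
```

Under these assumptions, OpenCompass should schedule two inference tasks at a time, each occupying two of the visible GPUs.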

🔝Back to top

+## 📣 OpenCompass 2.0 + +We are thrilled to introduce OpenCompass 2.0, an advanced suite featuring three key components: [CompassKit](https://github.com/open-compass), [CompassHub](https://hub.opencompass.org.cn/home), and [CompassRank](https://rank.opencompass.org.cn/home). +![oc20](https://github.com/tonysy/opencompass/assets/7881589/90dbe1c0-c323-470a-991e-2b37ab5350b2) + +**CompassRank** has been significantly enhanced into the leaderboards that now incorporate both open-source benchmarks and proprietary benchmarks. This upgrade allows for a more comprehensive evaluation of models across the industry. + +**CompassHub** presents a pioneering benchmark browser interface, designed to simplify and expedite the exploration and utilization of an extensive array of benchmarks for researchers and practitioners alike. To enhance the visibility of your own benchmark within the community, we warmly invite you to contribute it to CompassHub. You may initiate the submission process by clicking [here](https://hub.opencompass.org.cn/dataset-submit). + +**CompassKit** is a powerful collection of evaluation toolkits specifically tailored for Large Language Models and Large Vision-language Models. It provides an extensive set of tools to assess and measure the performance of these complex models effectively. Welcome to try our toolkits in your research and products. + +## ✨ Introduction + +![image](https://github.com/open-compass/opencompass/assets/22607038/f45fe125-4aed-4f8c-8fe8-df4efb41a8ea) + +OpenCompass is a one-stop platform for large model evaluation, aiming to provide a fair, open, and reproducible benchmark for large model evaluation. Its main features include: + +- **Comprehensive support for models and datasets**: Pre-support for 20+ HuggingFace and API models, a model evaluation scheme of 70+ datasets with about 400,000 questions, comprehensively evaluating the capabilities of the models in five dimensions. + +- **Efficient distributed evaluation**: One line command to implement task division and distributed evaluation, completing the full evaluation of billion-scale models in just a few hours. + +- **Diversified evaluation paradigms**: Support for zero-shot, few-shot, and chain-of-thought evaluations, combined with standard or dialogue-type prompt templates, to easily stimulate the maximum performance of various models. + +- **Modular design with high extensibility**: Want to add new models or datasets, customize an advanced task division strategy, or even support a new cluster management system? Everything about OpenCompass can be easily expanded! + +- **Experiment management and reporting mechanism**: Use config files to fully record each experiment, and support real-time reporting of results. + ## 📖 Dataset Support @@ -588,7 +594,7 @@ Through the command line or configuration files, OpenCompass also supports evalu ## 🔜 Roadmap - [x] Subjective Evaluation - - [ ] Release CompassAreana + - [x] Release CompassArena. - [x] Subjective evaluation. - [x] Long-context - [x] Long-context evaluation with extensive datasets. @@ -597,10 +603,10 @@ Through the command line or configuration files, OpenCompass also supports evalu - [ ] Coding evaluation leaderboard. - [x] Non-python language evaluation service. - [x] Agent - - [ ] Support various agenet framework. + - [ ] Support various agent frameworks. - [x] Evaluation of tool use of the LLMs. - [x] Robustness - - [x] Support various attack method + - [x] Support various attack methods. 
## 👷‍♂️ Contributing diff --git a/README_zh-CN.md b/README_zh-CN.md index 20a131b3d..90de53f9e 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -34,16 +34,6 @@ > > **收藏项目**,你将能第一时间获取 OpenCompass 的最新动态~⭐️ -## 📣 OpenCompass 2.0 - -我们很高兴发布 OpenCompass 司南 2.0 大模型评测体系,它主要由三大核心模块构建而成:[CompassKit](https://github.com/open-compass)、[CompassHub](https://hub.opencompass.org.cn/home)以及[CompassRank](https://rank.opencompass.org.cn/home)。 - -**CompassRank** 系统进行了重大革新与提升,现已成为一个兼容并蓄的排行榜体系,不仅囊括了开源基准测试项目,还包含了私有基准测试。此番升级极大地拓宽了对行业内各类模型进行全面而深入测评的可能性。 - -**CompassHub** 创新性地推出了一个基准测试资源导航平台,其设计初衷旨在简化和加快研究人员及行业从业者在多样化的基准测试库中进行搜索与利用的过程。为了让更多独具特色的基准测试成果得以在业内广泛传播和应用,我们热忱欢迎各位将自定义的基准数据贡献至CompassHub平台。只需轻点鼠标,通过访问[这里](https://hub.opencompass.org.cn/dataset-submit),即可启动提交流程。 - -**CompassKit** 是一系列专为大型语言模型和大型视觉-语言模型打造的强大评估工具合集,它所提供的全面评测工具集能够有效地对这些复杂模型的功能性能进行精准测量和科学评估。在此,我们诚挚邀请您在学术研究或产品研发过程中积极尝试运用我们的工具包,以助您取得更加丰硕的研究成果和产品优化效果。 -
Star History @@ -69,6 +59,8 @@ ## 🚀 最新进展 +- **\[2024.09.19\]** 现已支持[Qwen2.5](https://huggingface.co/Qwen)(0.5B to 72B) ,可以使用多种推理后端(huggingface/vllm/lmdeploy), 欢迎尝试! 🔥🔥🔥 +- **\[2024.09.05\]** 现已支持OpenAI o1 模型(`o1-mini-2024-09-12` and `o1-preview-2024-09-12`), 欢迎尝试! 🔥🔥🔥 - **\[2024.09.05\]** OpenCompass 现在支持通过模型后处理来进行答案提取,以更准确地展示模型的能力。作为此次更新的一部分,我们集成了 [XFinder](https://github.com/IAAR-Shanghai/xFinder) 作为首个后处理模型。具体信息请参阅 [文档](opencompass/utils/postprocessors/xfinder/README.md),欢迎尝试! 🔥🔥🔥 - **\[2024.08.20\]** OpenCompass 现已支持 [SciCode](https://github.com/scicode-bench/SciCode): A Research Coding Benchmark Curated by Scientists。 🔥🔥🔥 - **\[2024.08.16\]** OpenCompass 现已支持全新的长上下文语言模型评估基准——[RULER](https://arxiv.org/pdf/2404.06654)。RULER 通过灵活的配置,提供了对长上下文包括检索、多跳追踪、聚合和问答等多种任务类型的评测,欢迎访问[RULER](configs/datasets/ruler/README.md)。🔥🔥🔥 @@ -77,37 +69,10 @@ - **\[2024.07.17\]** 我们发布了CompassBench-202407榜单的示例数据和评测规则,敬请访问 [CompassBench](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/compassbench_intro.html) 获取更多信息。 🔥🔥🔥 - **\[2024.07.17\]** 我们正式发布 NeedleBench 的[技术报告](http://arxiv.org/abs/2407.11963)。诚邀您访问我们的[帮助文档](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/needleinahaystack_eval.html)进行评估。🔥🔥🔥 - **\[2024.07.04\]** OpenCompass 现已支持 InternLM2.5, 它拥有卓越的推理性能、有效支持百万字超长上下文以及工具调用能力整体升级,欢迎访问[OpenCompass Config](https://github.com/open-compass/opencompass/tree/main/configs/models/hf_internlm) 和 [InternLM](https://github.com/InternLM/InternLM) .🔥🔥🔥. -- **\[2024.06.20\]** OpenCompass 现已支持一键切换推理加速后端,助力评测过程更加高效。除了默认的HuggingFace推理后端外,还支持了常用的 [LMDeploy](https://github.com/InternLM/lmdeploy) 和 [vLLM](https://github.com/vllm-project/vllm) ,支持命令行一键切换和部署 API 加速服务两种方式,详细使用方法见[文档](docs/zh_cn/advanced_guides/accelerator_intro.md)。 - 欢迎试用!🔥🔥🔥. -- **\[2024.05.08\]** 我们支持了以下四个MoE模型的评测配置文件: [Mixtral-8x22B-v0.1](configs/models/mixtral/hf_mixtral_8x22b_v0_1.py), [Mixtral-8x22B-Instruct-v0.1](configs/models/mixtral/hf_mixtral_8x22b_instruct_v0_1.py), [Qwen1.5-MoE-A2.7B](configs/models/qwen/hf_qwen1_5_moe_a2_7b.py), [Qwen1.5-MoE-A2.7B-Chat](configs/models/qwen/hf_qwen1_5_moe_a2_7b_chat.py) 。欢迎试用! -- **\[2024.04.30\]** 我们支持了计算模型在给定[数据集](configs/datasets/llm_compression/README.md)上的压缩率(Bits per Character)的评测方法([官方文献](https://github.com/hkust-nlp/llm-compression-intelligence))。欢迎试用[llm-compression](configs/eval_llm_compression.py)评测集! 🔥🔥🔥 -- **\[2024.04.26\]** 我们报告了典型LLM在常用基准测试上的表现,欢迎访问[文档](https://opencompass.readthedocs.io/zh-cn/latest/user_guides/corebench.html)以获取更多信息!🔥🔥🔥. -- **\[2024.04.26\]** 我们废弃了 OpenCompass 进行多模态大模型评测的功能,相关功能转移至 [VLMEvalKit](https://github.com/open-compass/VLMEvalKit),推荐使用!🔥🔥🔥. -- **\[2024.04.26\]** 我们支持了 [ArenaHard评测](configs/eval_subjective_arena_hard.py) 欢迎试用!🔥🔥🔥. -- **\[2024.04.22\]** 我们支持了 [LLaMA3](configs/models/hf_llama/hf_llama3_8b.py) 和 [LLaMA3-Instruct](configs/models/hf_llama/hf_llama3_8b_instruct.py) 的评测,欢迎试用!🔥🔥🔥. -- **\[2024.02.29\]** 我们支持了MT-Bench、AlpacalEval和AlignBench,更多信息可以在[这里](https://opencompass.readthedocs.io/en/latest/advanced_guides/subjective_evaluation.html)找到。 -- **\[2024.01.30\]** 我们发布了OpenCompass 2.0。更多信息,请访问[CompassKit](https://github.com/open-compass)、[CompassHub](https://hub.opencompass.org.cn/home)和[CompassRank](https://rank.opencompass.org.cn/home)。 +- **\[2024.06.20\]** OpenCompass 现已支持一键切换推理加速后端,助力评测过程更加高效。除了默认的HuggingFace推理后端外,还支持了常用的 [LMDeploy](https://github.com/InternLM/lmdeploy) 和 [vLLM](https://github.com/vllm-project/vllm) ,支持命令行一键切换和部署 API 加速服务两种方式,详细使用方法见[文档](docs/zh_cn/advanced_guides/accelerator_intro.md)。欢迎试用!🔥🔥🔥. 
> [更多](docs/zh_cn/notes/news.md) -## ✨ 介绍 - -![image](https://github.com/open-compass/opencompass/assets/22607038/30bcb2e2-3969-4ac5-9f29-ad3f4abb4f3b) - -OpenCompass 是面向大模型评测的一站式平台。其主要特点如下: - -- **开源可复现**:提供公平、公开、可复现的大模型评测方案 - -- **全面的能力维度**:五大维度设计,提供 70+ 个数据集约 40 万题的的模型评测方案,全面评估模型能力 - -- **丰富的模型支持**:已支持 20+ HuggingFace 及 API 模型 - -- **分布式高效评测**:一行命令实现任务分割和分布式评测,数小时即可完成千亿模型全量评测 - -- **多样化评测范式**:支持零样本、小样本及思维链评测,结合标准型或对话型提示词模板,轻松激发各种模型最大性能 - -- **灵活化拓展**:想增加新模型或数据集?想要自定义更高级的任务分割策略,甚至接入新的集群管理系统?OpenCompass 的一切均可轻松扩展! - ## 📊 性能榜单 我们将陆续提供开源模型和 API 模型的具体性能榜单,请见 [OpenCompass Leaderboard](https://rank.opencompass.org.cn/home) 。如需加入评测,请提供模型仓库地址或标准的 API 接口至邮箱 `opencompass@pjlab.org.cn`. @@ -224,6 +189,9 @@ humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ce # Python 脚本 opencompass ./configs/eval_api_demo.py + + + # 现已支持 o1_mini_2024_09_12/o1_preview_2024_09_12 模型, 默认情况下 max_completion_tokens=8192. ``` - ### 推理后端 @@ -251,12 +219,56 @@ humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ce opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat ``` + 如果你想在多块 GPU 上使用模型进行推理,您可以使用 `--max-num-worker` 参数。 + + ```bash + CUDA_VISIBLE_DEVICES=0,1 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat --max-num-worker 2 + ``` + +> \[!TIP\] +> +> `--hf-num-gpus` 用于 模型并行(huggingface 格式),`--max-num-worker` 用于数据并行。 + +> \[!TIP\] +> +> configuration with `_ppl` is designed for base model typically. +> 配置带 `_ppl` 的配置设计给基础模型使用。 +> 配置带 `_gen` 的配置可以同时用于基础模型和对话模型。 + 通过命令行或配置文件,OpenCompass 还支持评测 API 或自定义模型,以及更多样化的评测策略。请阅读[快速开始](https://opencompass.readthedocs.io/zh_CN/latest/get_started/quick_start.html)了解如何运行一个评测任务。 更多教程请查看我们的[文档](https://opencompass.readthedocs.io/zh_CN/latest/index.html)。

🔝返回顶部

+## 📣 OpenCompass 2.0 + +我们很高兴发布 OpenCompass 司南 2.0 大模型评测体系,它主要由三大核心模块构建而成:[CompassKit](https://github.com/open-compass)、[CompassHub](https://hub.opencompass.org.cn/home)以及[CompassRank](https://rank.opencompass.org.cn/home)。 + +**CompassRank** 系统进行了重大革新与提升,现已成为一个兼容并蓄的排行榜体系,不仅囊括了开源基准测试项目,还包含了私有基准测试。此番升级极大地拓宽了对行业内各类模型进行全面而深入测评的可能性。 + +**CompassHub** 创新性地推出了一个基准测试资源导航平台,其设计初衷旨在简化和加快研究人员及行业从业者在多样化的基准测试库中进行搜索与利用的过程。为了让更多独具特色的基准测试成果得以在业内广泛传播和应用,我们热忱欢迎各位将自定义的基准数据贡献至CompassHub平台。只需轻点鼠标,通过访问[这里](https://hub.opencompass.org.cn/dataset-submit),即可启动提交流程。 + +**CompassKit** 是一系列专为大型语言模型和大型视觉-语言模型打造的强大评估工具合集,它所提供的全面评测工具集能够有效地对这些复杂模型的功能性能进行精准测量和科学评估。在此,我们诚挚邀请您在学术研究或产品研发过程中积极尝试运用我们的工具包,以助您取得更加丰硕的研究成果和产品优化效果。 + +## ✨ 介绍 + +![image](https://github.com/open-compass/opencompass/assets/22607038/30bcb2e2-3969-4ac5-9f29-ad3f4abb4f3b) + +OpenCompass 是面向大模型评测的一站式平台。其主要特点如下: + +- **开源可复现**:提供公平、公开、可复现的大模型评测方案 + +- **全面的能力维度**:五大维度设计,提供 70+ 个数据集约 40 万题的的模型评测方案,全面评估模型能力 + +- **丰富的模型支持**:已支持 20+ HuggingFace 及 API 模型 + +- **分布式高效评测**:一行命令实现任务分割和分布式评测,数小时即可完成千亿模型全量评测 + +- **多样化评测范式**:支持零样本、小样本及思维链评测,结合标准型或对话型提示词模板,轻松激发各种模型最大性能 + +- **灵活化拓展**:想增加新模型或数据集?想要自定义更高级的任务分割策略,甚至接入新的集群管理系统?OpenCompass 的一切均可轻松扩展! + ## 📖 数据集支持
@@ -582,7 +594,7 @@ humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ce - [x] 主观评测 - [x] 发布主观评测榜单 - - [ ] 发布主观评测数据集 + - [x] 发布主观评测数据集 - [x] 长文本 - [x] 支持广泛的长文本评测集 - [ ] 发布长文本评测榜单 diff --git a/configs/api_examples/eval_api_bailing.py b/configs/api_examples/eval_api_bailing.py new file mode 100644 index 000000000..00640fb4f --- /dev/null +++ b/configs/api_examples/eval_api_bailing.py @@ -0,0 +1,38 @@ +from mmengine.config import read_base + +from opencompass.models import BailingAPI +from opencompass.partitioners import NaivePartitioner +from opencompass.runners.local_api import LocalAPIRunner +from opencompass.tasks import OpenICLInferTask + +with read_base(): + from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets + from opencompass.configs.summarizers.medium import summarizer + +datasets = [ + *ceval_datasets, +] + +models = [ + dict( + path='Bailing-Lite-0830', + token='xxxxxx', # set your key here or in environment variable BAILING_API_KEY + url='https://bailingchat.alipay.com/chat/completions', + type=BailingAPI, + generation_kwargs={}, + query_per_second=1, + max_seq_len=4096, + ), +] + +infer = dict( + partitioner=dict(type=NaivePartitioner), + runner=dict( + type=LocalAPIRunner, + max_num_workers=2, + concurrent_users=2, + task=dict(type=OpenICLInferTask), + ), +) + +work_dir = 'outputs/api_bailing/' diff --git a/configs/datasets/MathBench/mathbench_2024_few_shot_mixed_4a3fd4.py b/configs/datasets/MathBench/mathbench_2024_few_shot_mixed_4a3fd4.py new file mode 100644 index 000000000..e7f2859e9 --- /dev/null +++ b/configs/datasets/MathBench/mathbench_2024_few_shot_mixed_4a3fd4.py @@ -0,0 +1,81 @@ +from mmengine.config import read_base +from copy import deepcopy +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer +from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator +from opencompass.datasets import MathBenchDataset, mathbench_postprocess +from opencompass.utils.text_postprocessors import first_option_postprocess + +with read_base(): + from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets + +# Max for this dataset is 4 +num_shot = 4 +# Generate reasoning path or not, only for single choice +with_reasoning = True +# Use circular evaluation or not +with_circular_eval = True +# Use PPL mode in single choice test or not +use_ppl_single_choice = True + +assert 0 <= num_shot <= 4 +if num_shot == 0: + prompts = zero_shot_prompts +else: + prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()} + +mathbench_datasets = [] +for _split in mathbench_sets: + for _name in mathbench_sets[_split]: + if 'single_choice' in _name: + if with_reasoning and not use_ppl_single_choice: + template_round = prompts[_name + '_with_reasoning'] + else: + template_round = prompts[_name] + else: + template_round = prompts[_name] + + if 'single_choice' in _name: + pred_postprocessor = dict(type=first_option_postprocess, options='ABCD') + else: + pred_postprocessor = dict(type=mathbench_postprocess, name=_name) + + if 'single_choice' in _name and with_circular_eval: + evaluator = dict(type=CircularEvaluator) + else: + evaluator = dict(type=AccEvaluator) + + # assemble the final config + mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer') + if use_ppl_single_choice and 'single_choice' in _name: + template = {} + for 
answer in ['A', 'B', 'C', 'D']: + one_template_round = deepcopy(template_round) + one_template_round[-1]['prompt'] = one_template_round[-1]['prompt'].format(answer=answer) + template[answer] = dict(round=one_template_round) + mathbench_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=template), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer), + ) + else: + mathbench_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=2048, stopping_criteria=['Question:']), + ) + mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor) + + mathbench_datasets.append( + dict( + abbr='mathbench-' + _split + '-' + _name, + type=MathBenchDataset, + path=f'data/mathbench_v1/{_split}', + name=_name, + with_circular=with_circular_eval, + reader_cfg=mathbench_reader_cfg, + infer_cfg=mathbench_infer_cfg, + eval_cfg=mathbench_eval_cfg, + ) + ) diff --git a/configs/datasets/MathBench/mathbench_2024_gen_50a320.py b/configs/datasets/MathBench/mathbench_2024_gen_50a320.py new file mode 100644 index 000000000..2d20f4506 --- /dev/null +++ b/configs/datasets/MathBench/mathbench_2024_gen_50a320.py @@ -0,0 +1,81 @@ +from mmengine.config import read_base +from copy import deepcopy +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer +from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator +from opencompass.datasets import MathBenchDataset, math_postprocess_v2 +from opencompass.utils.text_postprocessors import first_option_postprocess + +with read_base(): + from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets + +# Max for this dataset is 4 +num_shot = 0 +# Generate reasoning path or not, only for single choice +with_reasoning = True +# Use circular evaluation or not +with_circular_eval = True +# Use PPL mode in single choice test or not +use_ppl_single_choice = False + +assert 0 <= num_shot <= 4 +if num_shot == 0: + prompts = zero_shot_prompts +else: + prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()} + +mathbench_datasets = [] +for _split in mathbench_sets: + for _name in mathbench_sets[_split]: + if 'single_choice' in _name: + if with_reasoning: + template_round = prompts[_name + '_with_reasoning'] + else: + template_round = prompts[_name] + else: + template_round = prompts[_name] + + if 'single_choice' in _name: + pred_postprocessor = dict(type=first_option_postprocess, options='ABCD') + else: + pred_postprocessor = dict(type=math_postprocess_v2) + + if 'single_choice' in _name and with_circular_eval: + evaluator = dict(type=CircularEvaluator) + else: + evaluator = dict(type=AccEvaluator) + + # assemble the final config + mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer') + if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning: + template = {} + for answer in ['A', 'B', 'C', 'D']: + one_template_round = deepcopy(template_round) + one_template_round['round'][-1]['prompt'] = one_template_round['round'][-1]['prompt'].format(answer=answer) + template[answer] = dict(round=one_template_round) + mathbench_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=template), + retriever=dict(type=ZeroRetriever), + 
inferencer=dict(type=PPLInferencer), + ) + else: + mathbench_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=2048), + ) + mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor) + + mathbench_datasets.append( + dict( + abbr='mathbench-' + _split + '-' + _name, + type=MathBenchDataset, + path=f'data/mathbench_v1/{_split}', + name=_name, + with_circular=with_circular_eval, + reader_cfg=mathbench_reader_cfg, + infer_cfg=mathbench_infer_cfg, + eval_cfg=mathbench_eval_cfg, + ) + ) diff --git a/configs/datasets/MathBench/mathbench_prompt.py b/configs/datasets/MathBench/mathbench_prompt.py index 069528ee4..8052dab62 100644 --- a/configs/datasets/MathBench/mathbench_prompt.py +++ b/configs/datasets/MathBench/mathbench_prompt.py @@ -11,6 +11,12 @@ 'single_choice_en': [ dict(role='HUMAN', prompt='Question: Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nThe answer is:'), ], + 'cloze_en': [ + dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'), + ], + 'cloze_cn': [ + dict(role='HUMAN', prompt='{question}\n请一步一步推理,并在最后用\\boxed{}给出你的答案。'), + ] } few_shot_prompts = { diff --git a/configs/datasets/dingo/dingo_gen.py b/configs/datasets/dingo/dingo_gen.py new file mode 100644 index 000000000..c36f6cdcc --- /dev/null +++ b/configs/datasets/dingo/dingo_gen.py @@ -0,0 +1,34 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import DingoDataset, DingoEvaluator + + +dingo_paths = [ + './data/dingo/en_192.csv', + './data/dingo/zh_170.csv', +] + +dingo_datasets = [] +for path in dingo_paths: + dingo_reader_cfg = dict(input_columns='input', output_column=None) + dingo_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[dict(role='HUMAN', prompt='{input}')])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + dingo_eval_cfg = dict(evaluator=dict(type=DingoEvaluator), pred_role='BOT') + + dingo_datasets.append( + dict( + abbr='dingo_' + path.split('/')[-1].split('.csv')[0], + type=DingoDataset, + path=path, + reader_cfg=dingo_reader_cfg, + infer_cfg=dingo_infer_cfg, + eval_cfg=dingo_eval_cfg, + )) + +datasets = dingo_datasets diff --git a/configs/datasets/gaokao_math/README.md b/configs/datasets/gaokao_math/README.md new file mode 100644 index 000000000..08253add1 --- /dev/null +++ b/configs/datasets/gaokao_math/README.md @@ -0,0 +1,108 @@ +# GaoKao MATH Answer Evaluation Dataset +A dataset for testing the performance of the model in the GaoKao MATH Answer Extraction task. +Now support the following format of GAOKAO math questions: +1. '单选题':Single choice question +2. '多选题':Multiple choice question +3. '填空题':Fill in the blank question, can be multiple blanks +4. '解答题':Answer question, can be multiple answers + +Sample data: +```json +[ + { + "id": "3b270bc4-570a-4d77-b122-a2fc372f7d6a", + "question": "过椭圆${x^2\\over {16}} +{ y^2 \\over {4}}=1$ %内一点$M(2,1)$ %引一条弦,使该弦被点$M$ %平分,则这条弦所在直线的方程为( ).\nA. $x+2y-4=0$ %\nB. $x-2y-4=0$ %\nC. $x+2y+4=0$ %\nD. 
$x-2y+4=0$ %\n\n", + "response": "本题主要考查直线与圆锥曲线.设所求直线与椭圆的一个交点为$A(x,y)$ %,由于中点$M(2,1)$ %,所以另一个交点$B$ %为$(4-x,2-y)$ %.因为$A$ %,$B$ %两点都在椭圆上,所以$x^2+4y^2=16$ %,$(4-x)^2+4(2-y)^2=16$ %,两式相减,整理可得$x+2y-4=0$ %.由于过$A$ %,$B$ %两点的直线只有一条,所以这条弦所在直线的方程为$x+2y-4=0$ %.故本题正确答案为A.\n答案是:A", + "extract_answer": "A", + "question_type": "单选题" + }, + { + "id": "d60e42d7-30ee-44f9-a94d-aff6a8127750", + "question": "若函数$f(x)$ 具有下列性质:1.定义域为$(-1,1)$ ;2.对于任意的$x,y\\in(-1,1)$ ,都有$f(x)+f(y)=f\\left({\\dfrac{x+y}{1+xy}}\\right)$ ;3.当$-1< x< 0$ 时,$f(x)>0$ ,则称函数$f(x)$ 为$δ$ 的函数$.$ 若函数$f(x)$ 为$δ$ 的函数,则以下结论正确的是$(\\quad)$\nA. $\nB. x)$ 为奇函数\nC. $\nD. x)$ 为偶函数\nE. $\nF. x)$ 为单调递减函数\nG. $\nH. x)$ 为单调递增函数\n\n", + "response": "函数$f(x)$ 为$δ$ 的函数,令$x=y=0$ ,则$f(0)+f(0)=f(0)$ ,即$f(0)=0$ ,令$y=-x$ ,则$f(x)+f(-x)=f\\left(\\dfrac{x-x}{1-{x}^{2}}\\right)=f(0)=0$ ,则$f(-x)=-f(x)$ ,即函数$f(x)$ 是奇函数,设$-1< x< y< 1$ ,则$f(x)-f(y)=f(x)+f(-y)=f\\left(\\dfrac{x-y}{1-xy}\\right)$ ,$∵-1< x< y< 1$ ,$∴-1< \\dfrac{x-y}{1-xy}< 0$ ,则$f\\left(\\dfrac{x-y}{1-xy}\\right)>0$ ,即$f(x)-f(y)>0$ ,则$f(x)>f(y)$ ,即$f(x)$ 在$(-1,1)$ 上是减函数.故选$AC.$ 本题考查函数的奇偶性和单调性的判断,注意运用定义法,考查运算能力和推理能力,属于中档题.可令$x=y=0$ ,求得$f(0)=0$ ,再令$y=-x$ 可得$f(-x)=-f(x)$ ,可得$f(x)$ 的奇偶性;再令$-1< x< y< 1$ ,运用单调性的定义,结合其偶性的定义可得其单调性.\n答案是:A; C", + "extract_answer": "A, C", + "question_type": "多选题" + }, + { + "id": "31b3f702-e60c-4a20-9a40-73bd72b92d1e", + "question": "请完成以下题目(1)曲线$$y=-5\\text{e}^{x}+3$$在点$$(0,-2)$$处的切线方程为___.(2)若曲线$$f(x)=x \\sin x+1$$在$$x=\\dfrac{ \\pi }{2}$$处的切线与直线$$ax+2y+1=0$$相互垂直,则实数$$a=$$___.\n\n", + "response": "(1)由$$y=-5\\text{e}^{x}+3$$,得$$y'=-5\\text{e}^{x}$$,所以切线的斜率$$k=y'|_{x=0}=-5$$,所以切线方程为$$y+2=-5(x-0)$$,即$$5x+y+2=0$$.(2)因为$$f'(x)= \\sin x+x \\cos x$$,所以$$f'\\left(\\dfrac{ \\pi }{2}\\right)= \\sin \\dfrac{ \\pi }{2}+\\dfrac{ \\pi }{2}\\cdot \\cos \\dfrac{ \\pi }{2}=1$$.又直线$$ax+2y+1=0$$的斜率为$$-\\dfrac{a}{2}$$,所以根据题意得$$1\\times \\left(-\\dfrac{a}{2}\\right)=-1$$,解得$$a=2$$.\n答案是:(1)$$5x+y+2=0$$ (2)$$2$$", + "extract_answer": "['(1)$$5x+y+2=0$$ (2)$$2$$']", + "question_type": "填空题" + }, + { + "id": "16878941-1772-4290-bc61-00b193d5cf70", + "question": "已知函数$f\\left( x \\right)=\\left| 2x-1 \\right|$.(1)若不等式$f\\left( x+\\frac{1}{2} \\right)\\ge 2m+1\\left( m > 0 \\right)$的解集为$\\left( -\\infty ,-2 \\right]\\bigcup \\left[ 2,+\\infty \\right)$,求实数$m$的值;(2)若不等式$f\\left( x \\right)\\le {{2}^{y}}+\\frac{a}{{{2}^{y}}}+\\left| 2x+3 \\right|$对任意的实数$x,y\\in R$恒成立,求实数$a$的最小值.\n\n", + "response": "(1)直接写出不等式,解含有绝对值的函数不等式即可;(2)这是恒成立求参的问题,根据绝对值三角不等式得到左侧函数的最值,再结合均值不等式得最值.(1)由条件得$\\left| 2x \\right|\\le 2m+1$得$-m-\\frac{1}{2}\\le x\\le m+\\frac{1}{2}$,所以$m=\\frac{3}{2}$.(2)原不等式等价于$\\left| 2x-1 \\right|-\\left| 2x+3 \\right|\\le {{2}^{y}}+\\frac{a}{{{2}^{y}}}$,而$\\left| 2x-1 \\right|-\\left| 2x+3 \\right|\\le \\left| \\left( 2x-1 \\right)-\\left( 2x+3 \\right) \\right|=4$,所以${{2}^{y}}+\\frac{a}{{{2}^{y}}}\\ge 4$,则$a\\ge {{\\left[ {{2}^{y}}\\left( 4-{{2}^{y}} \\right) \\right]}_{\\text{max}}}=4$,当且仅当$y=1$时取得.\n答案是:(1) $m=\\frac{3}{2}$;(2) 最小值为$a=4$.", + "extract_answer": [ + "(1) $m=\\frac{3}{2}$;(2) 最小值为$a=4$." + ], + "question_type": "解答题" + } +] +``` +## How to use + +### 1. Prepare the dataset +```bash +cd opencompass +cp -rf /cpfs01/shared/public/liuhongwei/data/gaokao_math_dataset/gaokao_math ./data +``` +📢:If you want to evaluate your own gaokao math data, replace the `test_v2.jsonl` with your own data, but follow the format above. + +### 2. 
Set the evaluation model + +Open `opencompass.datasets.gaokao_math.gaokao_math_gen_9b076f` and set the model name and API URL for evaluation; multiple URLs are supported for acceleration. + +```python +... + +gaokao_math_eval_cfg = dict( + evaluator=dict(type=GaoKaoMATHEvaluator, model_name='EVALUATE_MODEL_NAME', url=['http://0.0.0.0:23333/v1', 'http://...'])) + +... + +``` +We recommend the `Qwen2.5-72B-Instruct` model for evaluation. + + +### 3. Set the extractor model and run the evaluation + +```python +from mmengine.config import read_base +from opencompass.models import HuggingFacewithChatTemplate + + +with read_base(): + from opencompass.datasets.gaokao_math.gaokao_math_gen_9b076f import gaokao_math_datasets + + +trained_qwen2_1_5b_model = [ # trained extractor model + dict( + type=HuggingFacewithChatTemplate, + abbr='gaokao_math_extractor_1_5b_v02', + path='/cpfs01/shared/public/liuhongwei/models/gaokao_math_trained/gaokao_math_extractor_1_5b_v02', + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=1), + ) +] + +datasets = sum([v for k, v in locals().items() if k.endswith("_datasets")], []) +models = sum([v for k, v in locals().items() if k.endswith("_model")], []) + +... +``` + +### 4. Run the evaluation + +```bash +python run.py eval.py --dump-eval-details # eval and dump the evaluation details to the `results` folder +``` + + +### 5. Evaluation results + +| Evaluator / Extractor | Qwen2.5-72B-Instruct | gaokao_math_extractor_1.5b_v0.2 | +|-----------------------|-----------------------|----------------------------------| +| Qwen2.5-72B-Instruct (ACC) | 95.85 | 95.2 | diff --git a/configs/datasets/gaokao_math/gaokao_math_gen_f5fd28.py b/configs/datasets/gaokao_math/gaokao_math_gen_f5fd28.py new file mode 100644 index 000000000..80ae4264f --- /dev/null +++ b/configs/datasets/gaokao_math/gaokao_math_gen_f5fd28.py @@ -0,0 +1,48 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import GaoKaoMATHDataset, GaoKaoMATHEvaluator + + +MATH_CN_PROMPT=""" +你是一个数学阅卷专家,任务是从给定的回答句子中提取精确的关键答案。你必须只提供提取的关键答案,不包括任何额外的文字。 +— +我将为你提供一个问题、回答句子和问题类型。回答句子是对所提供问题的回应。利用提供的信息,你必须准确而精确地确定并从回答句子中提取预期的关键答案。请不要对问题发表主观看法。 + +对于单选题,答案应该是选项字母,例如 "A"; +对于多选题,答案应该是一个选项字母的列表,例如 ["A"] 或 ["A", "B", "C"]; +对于填空题,答案应该是一个填入空白处的答案列表,列表的数量应该与问题中的空白数量相同,例如 ["$$\\frac{{1}}{{2}}$$"] 或 ["$$\\frac{{1}}{{2}}$$", "2"]。 +对于问答题,类似填空题,为每个小问抽出相应答案,例如 ["$$\\frac{{1}}{{2}}$$"] 或 ["$$\\frac{{1}}{{2}}$$", "2"]。 + +如果回答句子提供了多个不同的答案,请仔细判断后面提供的答案是否是对前面答案的修正或修改。如果是这样,提取这个修正或修改后的答案作为最终答案。相反,如果回答句子在多个答案之间波动而没有明确的最终答案,你应该输出 [No valid answer]。 +— +问题类型: {question_type} +原始问题: {question} +回答: {response} +提取的关键答案: +""" + +gaokao_math_reader_cfg = dict(input_columns=['question', 'response', 'question_type'], output_column='extract_answer') + + +gaokao_math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role='HUMAN', prompt=MATH_CN_PROMPT), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +gaokao_math_eval_cfg = dict( + evaluator=dict(type=GaoKaoMATHEvaluator, model_name='Qwen/Qwen2.5-72B-Instruct', url=['http://22.8.73.119:23333/v1', 'http://22.8.4.97:23333/v1', 'http://22.8.22.254:23333/v1', 'http://22.8.17.14:23333/v1'])) + +gaokao_math_datasets = [ + dict( + type=GaoKaoMATHDataset, + abbr='GaoKaoMATH', + path='./data/gaokao_math/test_2k.json', + reader_cfg=gaokao_math_reader_cfg,
+ infer_cfg=gaokao_math_infer_cfg, + eval_cfg=gaokao_math_eval_cfg) +] diff --git a/configs/datasets/gsm8k/gsm8k_model_postprocess_gen_a58960.py b/configs/datasets/gsm8k/gsm8k_model_postprocess_gen_a58960.py new file mode 100644 index 000000000..a95feb8d1 --- /dev/null +++ b/configs/datasets/gsm8k/gsm8k_model_postprocess_gen_a58960.py @@ -0,0 +1,52 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import GSM8KDataset, gsm8k_dataset_postprocess +from opencompass.datasets import MATHEvaluator, math_postprocess_v2 +from opencompass.utils.model_postprocessors import navie_model_postprocess +from opencompass.utils.postprocessors.naive import MATH_NAVIE_PROMPT_TEMPLATE + +gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer') + +gsm8k_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +# # You can write your own postprocess prompt like: +# GSM8K_NAVIE_PROMPT_TEMPLATE = """ +# There is a detailed explanation of the final answer you should extract: +# 1. ... +# 2. ... +# ... +# """ + +gsm8k_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), + pred_postprocessor=dict(type=math_postprocess_v2), + dataset_postprocessor=dict(type=gsm8k_dataset_postprocess), + model_postprocessor=dict( + type=navie_model_postprocess, + custom_instruction=MATH_NAVIE_PROMPT_TEMPLATE, + model_name='', + api_url='http://0.0.0.0:23333/v1,http://0.0.0.0:23334/v1') + ) + +gsm8k_datasets = [ + dict( + abbr='gsm8k', + type=GSM8KDataset, + path='opencompass/gsm8k', + reader_cfg=gsm8k_reader_cfg, + infer_cfg=gsm8k_infer_cfg, + eval_cfg=gsm8k_eval_cfg, + ) +] diff --git a/configs/datasets/math/math_0shot_llm_judge_gen_393424.py b/configs/datasets/math/math_0shot_llm_judge_gen_393424.py new file mode 100644 index 000000000..eb302c854 --- /dev/null +++ b/configs/datasets/math/math_0shot_llm_judge_gen_393424.py @@ -0,0 +1,78 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, GaoKaoMATHEvaluator +from opencompass.utils.model_postprocessors import naive_model_postprocess, xfinder_postprocess +from opencompass.utils.postprocessors.naive import MATH_NAVIE_PROMPT_TEMPLATE + +# ----------------------------- Eval Parameters ----------------------------- +## Postprocess function +post_func = 're' # 're', 'xfinder_model', 'naive_model' + +## Evaluate function +eval_func = 'naive_model' # 're', 'naive_model' + +## Model API URL +xfinder_url = 'http://0.0.0.0:23333/v1' # for 'xFinder-qwen1505' if post_func is 'xfinder_model' +naive_model_name = 'Qwen/Qwen2.5-72B-Instruct' # replace with your model name +naive_model_url = ['http://22.8.6.22:23333/v1', 'http://22.8.67.84:23333/v1', 'http://22.8.72.81:23333/v1', 'http://22.9.42.143:23333/v1'] # multiple APIs for acceleration + +# ----------------------------- Detailed Config ----------------------------- + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( +
prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024), +) + +if post_func == 're': + pred_postprocessor = dict(type=math_postprocess_v2) +elif post_func == 'xfinder_model': + pred_postprocessor = dict( + type=xfinder_postprocess, + question_type='math', + model_name='xFinder-qwen1505', + num_processes=128, + api_url=xfinder_url, + ) +elif post_func == 'naive_model': + pred_postprocessor = dict( + type=naive_model_postprocess, + custom_instruction=MATH_NAVIE_PROMPT_TEMPLATE, + model_name=naive_model_name, + num_processes=64, + api_url=naive_model_url, + ) + +if eval_func == 're': + evaluator = dict(type=MATHEvaluator, version='v2') +elif eval_func == 'naive_model': + evaluator = dict( + type=GaoKaoMATHEvaluator, + model_name=naive_model_name, + url=naive_model_url, + ) + +math_eval_cfg = dict( + evaluator=evaluator, pred_postprocessor=pred_postprocessor, +) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] diff --git a/configs/datasets/math/math_4shot_base_gen_43d5b6.py b/configs/datasets/math/math_4shot_base_gen_43d5b6.py new file mode 100644 index 000000000..1e8696798 --- /dev/null +++ b/configs/datasets/math/math_4shot_base_gen_43d5b6.py @@ -0,0 +1,30 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2 + +with read_base(): + from .math_4shot_example_from_google_research import prompt + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=prompt + '\n\nProblem:\n{problem}\nSolution:'), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=2048, stopping_criteria=['Problem', '问题:'])) + +# postprocess v2 +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), + pred_postprocessor=dict(type=math_postprocess_v2)) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/configs/datasets/mmlu/mmlu_model_postprocess_gen_4d595a.py b/configs/datasets/mmlu/mmlu_model_postprocess_gen_4d595a.py new file mode 100644 index 000000000..45cefb762 --- /dev/null +++ b/configs/datasets/mmlu/mmlu_model_postprocess_gen_4d595a.py @@ -0,0 +1,141 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator +from opencompass.datasets import MMLUDataset +from opencompass.utils.text_postprocessors import first_option_postprocess +from opencompass.utils.model_postprocessors import navie_model_postprocess +from opencompass.utils.postprocessors.naive import OPTION_NAVIE_PROMPT_TEMPLATE + + +# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader +# Please download the 
dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar + +mmlu_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +mmlu_all_sets = [ + 'college_biology', + 'college_chemistry', + 'college_computer_science', + 'college_mathematics', + 'college_physics', + 'electrical_engineering', + 'astronomy', + 'anatomy', + 'abstract_algebra', + 'machine_learning', + 'clinical_knowledge', + 'global_facts', + 'management', + 'nutrition', + 'marketing', + 'professional_accounting', + 'high_school_geography', + 'international_law', + 'moral_scenarios', + 'computer_security', + 'high_school_microeconomics', + 'professional_law', + 'medical_genetics', + 'professional_psychology', + 'jurisprudence', + 'world_religions', + 'philosophy', + 'virology', + 'high_school_chemistry', + 'public_relations', + 'high_school_macroeconomics', + 'human_sexuality', + 'elementary_mathematics', + 'high_school_physics', + 'high_school_computer_science', + 'high_school_european_history', + 'business_ethics', + 'moral_disputes', + 'high_school_statistics', + 'miscellaneous', + 'formal_logic', + 'high_school_government_and_politics', + 'prehistory', + 'security_studies', + 'high_school_biology', + 'logical_fallacies', + 'high_school_world_history', + 'professional_medicine', + 'high_school_mathematics', + 'college_medicine', + 'high_school_us_history', + 'sociology', + 'econometrics', + 'high_school_psychology', + 'human_aging', + 'us_foreign_policy', + 'conceptual_physics', +] + +mmlu_datasets = [] +for _name in mmlu_all_sets: + _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.' + mmlu_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt= + f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' + ), + dict(role='BOT', prompt='{target}\n') + ]), + ), + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict( + role='HUMAN', + prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' + ), + ], + ), + ice_token='', + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=GenInferencer), + ) + +# # You can write your own postprocess prompt like: +# MMLU_NAVIE_PROMPT_TEMPLATE = """ +# There is a detailed explanation of the final answer you should extract: +# 1. ... +# 2. ... +# ... 
+# """ + + mmlu_eval_cfg = dict( + evaluator=dict(type=AccwithDetailsEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'), + model_postprocessor=dict( + type=navie_model_postprocess, + custom_instruction=OPTION_NAVIE_PROMPT_TEMPLATE, + model_name='', + api_url='http://0.0.0.0:23333/v1,http://0.0.0.0:23334/v1') + ) + + + mmlu_datasets.append( + dict( + abbr=f'lukaemon_mmlu_{_name}', + type=MMLUDataset, + path='opencompass/mmlu', + name=_name, + reader_cfg=mmlu_reader_cfg, + infer_cfg=mmlu_infer_cfg, + eval_cfg=mmlu_eval_cfg, + )) + +del _name, _hint diff --git a/configs/datasets/flames/README.md b/configs/datasets/subjective/flames/README.md similarity index 100% rename from configs/datasets/flames/README.md rename to configs/datasets/subjective/flames/README.md diff --git a/configs/datasets/flames/flames_gen.py b/configs/datasets/subjective/flames/flames_gen.py similarity index 100% rename from configs/datasets/flames/flames_gen.py rename to configs/datasets/subjective/flames/flames_gen.py diff --git a/configs/datasets/flames/flames_gen_1a58bb.py b/configs/datasets/subjective/flames/flames_gen_1a58bb.py similarity index 96% rename from configs/datasets/flames/flames_gen_1a58bb.py rename to configs/datasets/subjective/flames/flames_gen_1a58bb.py index 1082e2174..64a10519c 100644 --- a/configs/datasets/flames/flames_gen_1a58bb.py +++ b/configs/datasets/subjective/flames/flames_gen_1a58bb.py @@ -58,5 +58,6 @@ name=_name, reader_cfg=subjective_reader_cfg, infer_cfg=subjective_infer_cfg, - eval_cfg=subjective_eval_cfg + eval_cfg=subjective_eval_cfg, + mode='singlescore', )) diff --git a/configs/datasets/subjective/followbench/followbench_llmeval.py b/configs/datasets/subjective/followbench/followbench_llmeval.py index 0733340ed..e601bda34 100644 --- a/configs/datasets/subjective/followbench/followbench_llmeval.py +++ b/configs/datasets/subjective/followbench/followbench_llmeval.py @@ -15,7 +15,7 @@ ] data_path ='data/subjective/followbench/converted_data' -followbench_llmeval_dataset = [] +followbench_llmeval_datasets = [] for _name in subjective_all_sets: subjective_infer_cfg = dict( @@ -48,7 +48,7 @@ pred_role='BOT', ) - followbench_llmeval_dataset.append( + followbench_llmeval_datasets.append( dict( abbr=f'{_name}', type=FollowBenchDataset, diff --git a/configs/datasets/wikibench/wikibench_few_shot_ppl_c23d79.py b/configs/datasets/wikibench/wikibench_few_shot_ppl_c23d79.py new file mode 100644 index 000000000..0669bd7b9 --- /dev/null +++ b/configs/datasets/wikibench/wikibench_few_shot_ppl_c23d79.py @@ -0,0 +1,73 @@ +import copy + +from opencompass.datasets import WikiBenchDataset +from opencompass.openicl.icl_evaluator import AccEvaluator, CircularEvaluator +from opencompass.openicl.icl_inferencer import PPLInferencer +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever + +single_choice_prompts = { + 'single_choice_cn': [ + dict(role='HUMAN', + prompt='问题: 白色念珠菌常被用作哪种生物的研究模式?\nA. 病毒\nB. 细菌\nC. 真菌\nD. 寄生虫'), + dict(role='BOT', prompt='回答: C'), + dict( + role='HUMAN', + prompt='问题: 星期五广场(荷兰语:Vrijdagmarkt;荷兰语发音: )是比利时根特老城的一个城市广场。 星期五广场下方有一个什么设施?\nA. 游乐场\nB. 地下停车场\nC. 公园\nD. 地下商场' # noqa: E501 + ), + dict(role='BOT', prompt='回答: B'), + dict( + role='HUMAN', + prompt='问题: 尔迪雷·巴斯杜克代表土耳其国家队出场的次数?\nA. 60次\nB. 35次\nC. 49次\nD. 20次' + ), + dict(role='BOT', prompt='回答: C'), + dict( + role='HUMAN', + prompt='问题: 陈酆被任命为漳州刺史是因为什么原因?\nA. 朝廷认为他有能力担任该职务\nB. 漳州人怀念陈元光、陈伯珙的政绩\nC. 他是陈伯珙的儿子\nD. 
他是陈元光的孙子' # noqa: E501 + ), + dict(role='BOT', prompt='回答: B'), + dict(role='HUMAN', + prompt='问题: 丹徒县在1928年改名为什么?\nA. 苏州市\nB. 润州县\nC. 镇江县\nD. 丹阳县'), + dict(role='BOT', prompt='回答: C'), + dict(role='HUMAN', prompt='问题: {question}'), + dict(role='BOT', prompt='回答: {answer}'), + ] +} + +wikibench_sets = { + 'wiki': ['single_choice_cn'], +} + +do_circular = True + +wikibench_datasets = [] + +for _split in list(wikibench_sets.keys()): + for _name in wikibench_sets[_split]: + template = {} + for answer in ['A', 'B', 'C', 'D']: + one_template_round = copy.deepcopy(single_choice_prompts[_name]) + one_template_round[-1]['prompt'] = one_template_round[-1][ + 'prompt'].format(answer=answer) + template[answer] = dict(round=one_template_round) + wikibench_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=template), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer), + ) + wikibench_eval_cfg = dict(evaluator=dict( + type=CircularEvaluator if do_circular else AccEvaluator), ) + wikibench_datasets.append( + dict( + type=WikiBenchDataset, + path=f'./data/WikiBench/{_name}.jsonl', + name='circular_' + _name if do_circular else _name, + abbr='wikibench-' + _split + '-' + _name + + 'circular' if do_circular else '', + reader_cfg=dict( + input_columns=['question'], + output_column='answer', + ), + infer_cfg=wikibench_infer_cfg, + eval_cfg=wikibench_eval_cfg, + )) diff --git a/configs/datasets/wikibench/wikibench_gen_0978ad.py b/configs/datasets/wikibench/wikibench_gen_0978ad.py new file mode 100644 index 000000000..871133f9e --- /dev/null +++ b/configs/datasets/wikibench/wikibench_gen_0978ad.py @@ -0,0 +1,56 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator +from opencompass.datasets import WikiBenchDataset +from opencompass.utils.text_postprocessors import first_option_postprocess + + +single_choice_prompts = { + 'single_choice_cn': '以下是一道单项选择题,请你根据你了解的知识一步步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。。\n下面是你要回答的题目:\n{question}\n让我们一步步推理:', +} + +wikibench_sets = { + 'wiki': ['single_choice_cn'], +} + +do_circular = True + +wikibench_datasets = [] + +for _split in list(wikibench_sets.keys()): + for _name in wikibench_sets[_split]: + wikibench_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict(role='HUMAN', prompt=single_choice_prompts[_name]), + dict(role='BOT', prompt='{answer}'), + ], + ), + ice_token='', + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + wikibench_eval_cfg = dict( + evaluator=dict(type=CircularEvaluator if do_circular else AccEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'), + ) + + wikibench_datasets.append( + dict( + type=WikiBenchDataset, + path=f'./data/WikiBench/{_name}.jsonl', + name='circular_' + _name if do_circular else _name, + abbr='wikibench-' + _split + '-' + _name + 'circular' if do_circular else '', + reader_cfg=dict( + input_columns=['question'], + output_column='answer', + ), + infer_cfg=wikibench_infer_cfg, + eval_cfg=wikibench_eval_cfg, + ) + ) diff --git a/configs/eval_corebench_2409_base_objective.py b/configs/eval_corebench_2409_base_objective.py new file mode 100644 index 000000000..d5d7a3879 --- /dev/null +++ b/configs/eval_corebench_2409_base_objective.py @@ -0,0 +1,188 
@@ +from mmengine.config import read_base +import os.path as osp +from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner +from opencompass.runners import LocalRunner +from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask + + +####################################################################### +# PART 0 Essential Configs # +####################################################################### +with read_base(): + # Datasets Part + ## Core Set + # ## Examination + from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import mmlu_datasets + from opencompass.configs.datasets.mmlu_pro.mmlu_pro_few_shot_gen_bfaf90 import \ + mmlu_pro_datasets + from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \ + cmmlu_datasets + # ## Reasoning + from opencompass.configs.datasets.bbh.bbh_gen_98fba6 import bbh_datasets + from opencompass.configs.datasets.hellaswag.hellaswag_10shot_ppl_59c85e import hellaswag_datasets + from opencompass.configs.datasets.drop.drop_gen_a2697c import drop_datasets + + # ## Math + from opencompass.configs.datasets.math.math_4shot_base_gen_43d5b6 import math_datasets + from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import gsm8k_datasets + from opencompass.configs.datasets.MathBench.mathbench_2024_few_shot_mixed_4a3fd4 import \ + mathbench_datasets + + # ## Scientific + from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_2c9cd6 import \ + gpqa_datasets + + # ## Coding + from opencompass.configs.datasets.humaneval.deprecated_humaneval_gen_d2537e import humaneval_datasets + from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_742f0c import sanitized_mbpp_datasets + # TODO: Add LiveCodeBench + + # ## Instruction Following + # from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets + + # Summarizer + from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups + from opencompass.configs.summarizers.groups.mmlu_pro import mmlu_pro_summary_groups + from opencompass.configs.summarizers.groups.cmmlu import cmmlu_summary_groups + from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups + from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \ + mathbench_2024_summary_groups + + # Model List + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b import models as lmdeploy_qwen2_5_1_5b_model + # from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model + # from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model + # from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import models as hf_minicpm_2b_sft_bf16_model + # from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as hf_yi_1_5_6b_chat_model + # from opencompass.configs.models.gemma.hf_gemma_2b_it import models as hf_gemma_2b_it_model + # from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as hf_yi_1_5_34b_chat_model + +####################################################################### +# PART 1 Datasets List # +####################################################################### +# datasets list for evaluation +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + + +####################################################################### +# PART 2 Datset Summarizer # +####################################################################### +# with read_base(): + +core_summary_groups = [ + { + 
'name': 'core_average', + 'subsets': [ + ['mmlu', 'accuracy'], + ['mmlu_pro', 'accuracy'], + ['cmmlu', 'accuracy'], + ['bbh', 'naive_average'], + ['hellaswag', 'accuracy'], + ['drop', 'accuracy'], + ['math', 'accuracy'], + ['gsm8k', 'accuracy'], + ['mathbench-t (average)', 'naive_average'], + ['GPQA_diamond', 'accuracy'], + ['openai_humaneval', 'humaneval_pass@1'], + ['IFEval', 'Prompt-level-strict-accuracy'], + ['sanitized_mbpp', 'score'], + ['mathbench-t (average)', 'naive_average'] + ], + }, +] + +summarizer = dict( + dataset_abbrs=[ + ['mmlu', 'accuracy'], + ['mmlu_pro', 'accuracy'], + ['cmmlu', 'accuracy'], + ['bbh', 'naive_average'], + ['hellaswag', 'accuracy'], + ['drop', 'accuracy'], + ['math', 'accuracy'], + ['gsm8k', 'accuracy'], + ['mathbench-t (average)', 'naive_average'], + ['GPQA_diamond', 'accuracy'], + ['openai_humaneval', 'humaneval_pass@1'], + ['IFEval', 'Prompt-level-strict-accuracy'], + ['sanitized_mbpp', 'score'], + 'mathbench-a (average)', + 'mathbench-t (average)' + '', + ['mmlu', 'accuracy'], + ['mmlu-stem', 'accuracy'], + ['mmlu-social-science', 'accuracy'], + ['mmlu-humanities', 'accuracy'], + ['mmlu-other', 'accuracy'], + + '', + ['mmlu_pro', 'accuracy'], + ['mmlu_pro_math','accuracy'], + ['mmlu_pro_physics', 'accuracy'], + ['mmlu_pro_chemistry', 'accuracy'], + ['mmlu_pro_law', 'accuracy'], + ['mmlu_pro_engineering', 'accuracy'], + ['mmlu_pro_other', 'accuracy'], + ['mmlu_pro_economics', 'accuracy'], + ['mmlu_pro_health', 'accuracy'], + ['mmlu_pro_psychology', 'accuracy'], + ['mmlu_pro_business', 'accuracy'], + ['mmlu_pro_biology', 'accuracy'], + ['mmlu_pro_philosophy', 'accuracy'], + ['mmlu_pro_computer_science','accuracy'], + ['mmlu_pro_history', 'accuracy'], + '', + ['cmmlu', 'accuracy'], + ['cmmlu-stem', 'accuracy'], + ['cmmlu-social-science', 'accuracy'], + ['cmmlu-humanities', 'accuracy'], + ['cmmlu-other', 'accuracy'], + ['cmmlu-china-specific', 'accuracy'], + + ], + summary_groups=sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], []), +) + + +####################################################################### +# PART 3 Models List # +####################################################################### + +models = sum([v for k, v in locals().items() if k.endswith('_model')], []) + + + +####################################################################### +# PART 4 Inference/Evaluation Configuaration # +####################################################################### + +# Local Runner +infer = dict( + partitioner=dict( + type=NumWorkerPartitioner, + num_worker=8 + ), + runner=dict( + type=LocalRunner, + max_num_workers=16, + retry=0, # Modify if needed + task=dict(type=OpenICLInferTask) + ), +) + +# eval with local runner +eval = dict( + partitioner=dict(type=NaivePartitioner, n=10), + runner=dict( + type=LocalRunner, + max_num_workers=16, + task=dict(type=OpenICLEvalTask)), +) + + +####################################################################### +# PART 5 Utils Configuaration # +####################################################################### +base_exp_dir = 'outputs/corebench_2409_objective/' +work_dir = osp.join(base_exp_dir, 'base_objective') diff --git a/configs/eval_corebench_2409_chat_objective.py b/configs/eval_corebench_2409_chat_objective.py new file mode 100644 index 000000000..0b6735062 --- /dev/null +++ b/configs/eval_corebench_2409_chat_objective.py @@ -0,0 +1,220 @@ +from mmengine.config import read_base +import os.path as osp +from opencompass.partitioners import NaivePartitioner, 
NumWorkerPartitioner +from opencompass.runners import LocalRunner +from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask + + +####################################################################### +# PART 0 Essential Configs # +####################################################################### +with read_base(): + # Datasets Part + ## Core Set + # ## Examination + from opencompass.configs.datasets.mmlu.mmlu_openai_simple_evals_gen_b618ea import mmlu_datasets + from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import mmlu_pro_datasets + from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import cmmlu_datasets + + # ## Reasoning + from opencompass.configs.datasets.bbh.bbh_gen_4a31fa import bbh_datasets + from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \ + hellaswag_datasets + from opencompass.configs.datasets.drop.drop_openai_simple_evals_gen_3857b0 import drop_datasets + + # ## Math + from opencompass.configs.datasets.math.math_0shot_gen_393424 import math_datasets + from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \ + gsm8k_datasets + from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import mathbench_datasets + + # ## Scientific + from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets + + # ## Coding + from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets + from opencompass.configs.datasets.mbpp.sanitized_mbpp_mdblock_gen_a447ff import sanitized_mbpp_datasets + # TODO: Add LiveCodeBench + + # ## Instruction Following + from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets + + # Summarizer + from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups + from opencompass.configs.summarizers.groups.mmlu_pro import mmlu_pro_summary_groups + from opencompass.configs.summarizers.groups.cmmlu import cmmlu_summary_groups + from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups + + + # Model List + # from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model + # from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model + # from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import models as hf_minicpm_2b_sft_bf16_model + # from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as hf_yi_1_5_6b_chat_model + # from opencompass.configs.models.gemma.hf_gemma_2b_it import models as hf_gemma_2b_it_model + # from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as hf_yi_1_5_34b_chat_model + +####################################################################### +# PART 1 Datasets List # +####################################################################### +# datasets list for evaluation +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + + +####################################################################### +# PART 2 Datset Summarizer # +####################################################################### +# with read_base(): + +core_summary_groups = [ + { + 'name': 'core_average', + 'subsets': [ + ['mmlu', 'accuracy'], + ['mmlu_pro', 'accuracy'], + ['cmmlu', 'accuracy'], + ['bbh', 'score'], + ['math', 'accuracy'], + ['openai_humaneval', 'humaneval_pass@1'], + ['GPQA_diamond', 'accuracy'], + ['IFEval', 'Prompt-level-strict-accuracy'], + ['drop', 
'accuracy'], + ['sanitized_mbpp', 'score'], + ['gsm8k', 'accuracy'], + ['hellaswag', 'accuracy'], + ['mathbench-t (average)', 'naive_average'] + ], + }, +] + +summarizer = dict( + dataset_abbrs=[ + ['core_average', 'naive_average'], + ['mmlu', 'accuracy'], + ['mmlu_pro', 'accuracy'], + ['cmmlu', 'accuracy'], + ['bbh', 'score'], + ['math', 'accuracy'], + ['openai_humaneval', 'humaneval_pass@1'], + ['GPQA_diamond', 'accuracy'], + ['IFEval', 'Prompt-level-strict-accuracy'], + ['drop', 'accuracy'], + ['sanitized_mbpp', 'score'], + ['gsm8k', 'accuracy'], + ['hellaswag', 'accuracy'], + 'mathbench-a (average)', + 'mathbench-t (average)' + '', + + ['mmlu', 'accuracy'], + ['mmlu-stem', 'accuracy'], + ['mmlu-social-science', 'accuracy'], + ['mmlu-humanities', 'accuracy'], + ['mmlu-other', 'accuracy'], + + '', + ['mmlu_pro', 'accuracy'], + ['mmlu_pro_math','accuracy'], + ['mmlu_pro_physics', 'accuracy'], + ['mmlu_pro_chemistry', 'accuracy'], + ['mmlu_pro_law', 'accuracy'], + ['mmlu_pro_engineering', 'accuracy'], + ['mmlu_pro_other', 'accuracy'], + ['mmlu_pro_economics', 'accuracy'], + ['mmlu_pro_health', 'accuracy'], + ['mmlu_pro_psychology', 'accuracy'], + ['mmlu_pro_business', 'accuracy'], + ['mmlu_pro_biology', 'accuracy'], + ['mmlu_pro_philosophy', 'accuracy'], + ['mmlu_pro_computer_science','accuracy'], + ['mmlu_pro_history', 'accuracy'], + '', + ['cmmlu', 'accuracy'], + ['cmmlu-stem', 'accuracy'], + ['cmmlu-social-science', 'accuracy'], + ['cmmlu-humanities', 'accuracy'], + ['cmmlu-other', 'accuracy'], + ['cmmlu-china-specific', 'accuracy'], + '', + ['bbh', 'extract_rate'], + ['math', 'extract_rate'], + # ['openai_humaneval', 'extract_rate'], + ['GPQA_diamond', 'extract_rate'], + # ['IFEval', 'extract_rate'], + '', + ['mmlu', 'extract_rate'], + ['mmlu-stem', 'extract_rate'], + ['mmlu-social-science', 'extract_rate'], + ['mmlu-humanities', 'extract_rate'], + ['mmlu-other', 'extract_rate'], + '', + ['mmlu_pro', 'extract_rate'], + ['mmlu_pro_math', 'extract_rate'], + ['mmlu_pro_physics', 'extract_rate'], + ['mmlu_pro_chemistry', 'extract_rate'], + ['mmlu_pro_law', 'extract_rate'], + ['mmlu_pro_engineering', 'extract_rate'], + ['mmlu_pro_other', 'extract_rate'], + ['mmlu_pro_economics', 'extract_rate'], + ['mmlu_pro_health', 'extract_rate'], + ['mmlu_pro_psychology', 'extract_rate'], + ['mmlu_pro_business', 'extract_rate'], + ['mmlu_pro_biology', 'extract_rate'], + ['mmlu_pro_philosophy', 'extract_rate'], + ['mmlu_pro_computer_science', 'extract_rate'], + ['mmlu_pro_history', 'extract_rate'], + '', + ['cmmlu', 'extract_rate'], + ['cmmlu-stem', 'extract_rate'], + ['cmmlu-social-science', 'extract_rate'], + ['cmmlu-humanities', 'extract_rate'], + ['cmmlu-other', 'extract_rate'], + ['cmmlu-china-specific', 'extract_rate'], + + ], + summary_groups=sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], []), +) + + +####################################################################### +# PART 3 Models List # +####################################################################### + +models = sum([v for k, v in locals().items() if k.endswith('_model')], []) + + + +####################################################################### +# PART 4 Inference/Evaluation Configuaration # +####################################################################### + +# Local Runner +infer = dict( + partitioner=dict( + type=NumWorkerPartitioner, + num_worker=8 + ), + runner=dict( + type=LocalRunner, + max_num_workers=16, + retry=0, # Modify if needed + task=dict(type=OpenICLInferTask) + ), +) + 
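+# A minimal launch sketch, assuming this file is saved as configs/eval_corebench_2409_chat_objective.py (run.py and --dump-eval-details are used as in the gaokao_math README above): +#   python run.py configs/eval_corebench_2409_chat_objective.py --dump-eval-details +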
+# eval with local runner +eval = dict( + partitioner=dict(type=NaivePartitioner, n=10), + runner=dict( + type=LocalRunner, + max_num_workers=16, + task=dict(type=OpenICLEvalTask)), +) + + +####################################################################### +# PART 5 Utils Configuaration # +####################################################################### +base_exp_dir = 'outputs/corebench_2409_objective/' +work_dir = osp.join(base_exp_dir, 'chat_objective') diff --git a/configs/eval_corebench_2409_longcontext.py b/configs/eval_corebench_2409_longcontext.py new file mode 100644 index 000000000..718044d2a --- /dev/null +++ b/configs/eval_corebench_2409_longcontext.py @@ -0,0 +1,138 @@ +import os.path as osp +from copy import deepcopy + +from mmengine.config import read_base +from opencompass.models import (HuggingFacewithChatTemplate, + TurboMindModelwithChatTemplate) +from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner +from opencompass.runners import DLCRunner, LocalRunner +from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask + + +####################################################################### +# PART 0 Essential Configs # +####################################################################### +with read_base(): + from opencompass.configs.datasets.longbench.longbench import \ + longbench_datasets + from opencompass.configs.datasets.needlebench.needlebench_8k.needlebench_8k import \ + needlebench_datasets as needlebench_8k_datasets + from opencompass.configs.datasets.needlebench.needlebench_32k.needlebench_32k import \ + needlebench_datasets as needlebench_32k_datasets + from opencompass.configs.datasets.needlebench.needlebench_128k.needlebench_128k import \ + needlebench_datasets as needlebench_128k_datasets + from opencompass.configs.datasets.ruler.ruler_8k_gen import \ + ruler_datasets as ruler_8k_datasets + from opencompass.configs.datasets.ruler.ruler_32k_gen import \ + ruler_datasets as ruler_32k_datasets + from opencompass.configs.datasets.ruler.ruler_128k_gen import \ + ruler_datasets as ruler_128k_datasets + # Summary Groups + from opencompass.configs.summarizers.groups.longbench import \ + longbench_summary_groups + from opencompass.configs.summarizers.groups.ruler import \ + ruler_summary_groups + from opencompass.configs.summarizers.needlebench import ( + needlebench_8k_summarizer, needlebench_32k_summarizer, + needlebench_128k_summarizer) + + # Instruct models + from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \ + models as lmdeploy_qwen2_7b_instruct_model + + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat_1m import \ + models as lmdeploy_internlm2_5_7b_1m_chat_model + from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \ + models as llama3_1_8b_instruct_model + + +####################################################################### +# PART 1 Datasets List # +####################################################################### +# datasets list for evaluation +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + + +####################################################################### +# PART 2 Datset Summarizer # +####################################################################### +needlebench_8k_summary_groups = needlebench_8k_summarizer['summary_groups'] +needlebench_32k_summary_groups = needlebench_32k_summarizer['summary_groups'] +needlebench_128k_summary_groups = 
needlebench_128k_summarizer['summary_groups'] + +# Instruct models summarizer +summarizer = dict( + dataset_abbrs=[ + ['ruler_8k', 'naive_average'], + ['ruler_32k', 'naive_average'], + ['ruler_128k', 'naive_average'], + ['NeedleBench-Overall-Score-8K', 'weighted_average'], + ['NeedleBench-Overall-Score-32K', 'weighted_average'], + ['NeedleBench-Overall-Score-128K', 'weighted_average'], + ['longbench', 'naive_average'], + ['longbench_zh', 'naive_average'], + ['longbench_en', 'naive_average'], + '', + 'longbench_single-document-qa', + 'longbench_multi-document-qa', + 'longbench_summarization', + 'longbench_few-shot-learning', + 'longbench_synthetic-tasks', + 'longbench_code-completion', + ], + summary_groups=sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], []), +) + + +####################################################################### +# PART 3 Models List # +####################################################################### + +lmdeploy_qwen2_7b_instruct_model[0]['max_seq_len'] = 1048576 +lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['session_len'] = 1048576 +lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['tp'] = 4 +lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['rope_scaling_factor'] = 4 +lmdeploy_qwen2_7b_instruct_model[0]['run_cfg']['num_gpus'] = 4 + +llama3_1_8b_instruct_model[0]['max_seq_len'] = 1048576 +llama3_1_8b_instruct_model[0]['engine_config']['session_len'] = 1048576 +llama3_1_8b_instruct_model[0]['engine_config']['tp'] = 4 +llama3_1_8b_instruct_model[0]['engine_config']['rope_scaling_factor'] = 4 +llama3_1_8b_instruct_model[0]['run_cfg']['num_gpus'] = 4 + +models = sum([v for k, v in locals().items() if k.endswith('_model')], []) + +####################################################################### +# PART 4 Inference/Evaluation Configuaration # +####################################################################### + +# Local Runner +infer = dict( + partitioner=dict( + type=NumWorkerPartitioner, + num_worker=8 + ), + runner=dict( + type=LocalRunner, + max_num_workers=16, + retry=0, # Modify if needed + task=dict(type=OpenICLInferTask) + ), +) + +# eval with local runner +eval = dict( + partitioner=dict(type=NaivePartitioner, n=10), + runner=dict( + type=LocalRunner, + max_num_workers=16, + task=dict(type=OpenICLEvalTask)), +) + + +####################################################################### +# PART 5 Utils Configuaration # +####################################################################### +base_exp_dir = 'outputs/corebench/' +work_dir = osp.join(base_exp_dir, 'long_context') diff --git a/configs/eval_corebench_2409_subjective.py b/configs/eval_corebench_2409_subjective.py new file mode 100644 index 000000000..c0623c804 --- /dev/null +++ b/configs/eval_corebench_2409_subjective.py @@ -0,0 +1,134 @@ +import os.path as osp +from copy import deepcopy + +from mmengine.config import read_base +from opencompass.models import (HuggingFacewithChatTemplate, + TurboMindModelwithChatTemplate) +from opencompass.models.openai_api import OpenAI, OpenAISDK +from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner +from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner +from opencompass.runners import DLCRunner, LocalRunner +from opencompass.summarizers import SubjectiveSummarizer +from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask +from opencompass.tasks.subjective_eval import SubjectiveEvalTask + + 
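+# Overview: PART 1 below collects the ArenaHard / AlignBench / MTBench subjective sets, PART 3 defines the candidate model, and PART 4 pairs them with a GPT-4o judge via SubjectiveEvalTask; set `key` (or swap in your own judge model) before running. +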
+####################################################################### +# PART 0 Essential Configs # +####################################################################### +with read_base(): + # Datasets Part + from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import \ + arenahard_datasets + from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm import \ + alignbench_datasets + from opencompass.configs.datasets.subjective.multiround.mtbench_single_judge_diff_temp import \ + mtbench_datasets + + # Summarizer + + # Model List + # from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model + # from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model + + +####################################################################### +# PART 1 Datasets List # +####################################################################### +# datasets list for evaluation + +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + + +####################################################################### +# PART 2 Datset Summarizer # +####################################################################### +summarizer = dict(type=SubjectiveSummarizer, function='subjective') + +####################################################################### +# PART 3 Models List # +####################################################################### + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='internlm2_5-7b-chat-turbomind', + path='internlm/internlm2_5-7b-chat', + engine_config=dict(session_len=16384, max_batch_size=16, tp=1), + gen_config=dict(top_k=40, temperature=1.0, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] + +models = sum([v for k, v in locals().items() if k.endswith('_model')], models) + + + +####################################################################### +# PART 4 Inference/Evaluation Configuaration # +####################################################################### + +# Local Runner +infer = dict( + partitioner=dict( + type=NumWorkerPartitioner, + num_worker=8 + ), + runner=dict( + type=LocalRunner, + max_num_workers=16, + retry=0, # Modify if needed + task=dict(type=OpenICLInferTask) + ), +) + +# JudgeLLM +api_meta_template = dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), +]) + + +judge_models = [ + dict( + type=OpenAISDK, + abbr='gpt-4o-2024-08-06', + path='gpt-4o-2024-08-06', + # openai_api_base= + # 'http://10.140.1.86:10001/v1', # Change to your own url if needed. 
+ key='YOUR_API_KEY', + retry=10, + meta_template=api_meta_template, + rpm_verbose=True, + query_per_second=1, + max_out_len=4096, + max_seq_len=16384, + batch_size=16, + temperature=0.01, + tokenizer_path='gpt-4o-2024-08-06' + ) +] + +# Evaluation with local runner +eval = dict( + partitioner=dict( + type=SubjectiveNaivePartitioner, + models=models, + judge_models=judge_models, + ), + runner=dict( + type=LocalRunner, + max_num_workers=16, + task=dict(type=SubjectiveEvalTask)), +) + + + +####################################################################### +# PART 5 Utils Configuaration # +####################################################################### +base_exp_dir = 'outputs/corebench/' +work_dir = osp.join(base_exp_dir, 'chat_subjective') diff --git a/configs/eval_dingo.py b/configs/eval_dingo.py new file mode 100644 index 000000000..3e0ecb86b --- /dev/null +++ b/configs/eval_dingo.py @@ -0,0 +1,7 @@ +from mmengine.config import read_base + +with read_base(): + from .models.hf_internlm.hf_internlm_7b import models + from .datasets.dingo.dingo_gen import datasets + +work_dir = './outputs/eval_dingo' diff --git a/configs/eval_internlm_chat_lmdeploy_pytorch.py b/configs/eval_internlm_chat_lmdeploy_pytorch.py deleted file mode 100644 index 4ea1f84c2..000000000 --- a/configs/eval_internlm_chat_lmdeploy_pytorch.py +++ /dev/null @@ -1,69 +0,0 @@ -from mmengine.config import read_base -from opencompass.models import LmdeployPytorchModel - - -with read_base(): - # choose a list of datasets - from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets - from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets - from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets - from opencompass.configs.datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets - from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets - from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets - from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets - from opencompass.configs.datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets - # and output the results in a choosen format - from opencompass.configs.summarizers.medium import summarizer - - -datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) - - -meta_template = dict( - round=[ - dict(role='HUMAN', begin='<|User|>:', end='\n'), - dict(role='BOT', begin='<|Bot|>:', end='\n', generate=True), - ], - eos_token_id=103028) - -# config for internlm-chat-7b -internlm_chat_7b = dict( - type=LmdeployPytorchModel, - abbr='internlm-chat-7b-pytorch', - path='internlm/internlm-chat-7b', - engine_config=dict(session_len=2048, - max_batch_size=16), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=100), - max_out_len=100, - max_seq_len=2048, - batch_size=16, - concurrency=16, - meta_template=meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), - end_str='', -) - -# config for internlm-chat-20b -internlm_chat_20b = dict( - type=LmdeployPytorchModel, - abbr='internlm-chat-20b-pytorch', - path='internlm/internlm-chat-20b', - engine_config=dict(session_len=2048, - max_batch_size=8), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=100), - max_out_len=100, - max_seq_len=2048, - batch_size=8, - concurrency=8, - meta_template=meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), - end_str='', - ) - -models = [internlm_chat_20b] 
diff --git a/configs/eval_internlm_chat_lmdeploy_tis.py b/configs/eval_internlm_chat_lmdeploy_tis.py deleted file mode 100644 index 8f5470d52..000000000 --- a/configs/eval_internlm_chat_lmdeploy_tis.py +++ /dev/null @@ -1,41 +0,0 @@ -from mmengine.config import read_base -from opencompass.models.lmdeploy_tis import LmdeployTisModel - -with read_base(): - # choose a list of datasets - from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets - from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets - from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets - from opencompass.configs.datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets - from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets - from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets - from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets - from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets - from opencompass.configs.datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets - # and output the results in a choosen format - from opencompass.configs.summarizers.medium import summarizer - -datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) - -meta_template = dict( - round=[ - dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'), - dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True), - ], - eos_token_id=92542 -) - -models = [ - dict( - type=LmdeployTisModel, - abbr='internlm-chat-20b-lmdeploy-tis', - path='internlm/internlm-chat-20b', - tis_addr='0.0.0.0:33337', - max_out_len=100, - max_seq_len=2048, - batch_size=8, - meta_template=meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), - end_str='<|im_end|>', - ) -] diff --git a/configs/eval_internlm_chat_turbomind_tis.py b/configs/eval_internlm_chat_turbomind_tis.py deleted file mode 100644 index 01f42000f..000000000 --- a/configs/eval_internlm_chat_turbomind_tis.py +++ /dev/null @@ -1,40 +0,0 @@ -from mmengine.config import read_base -from opencompass.models.turbomind_tis import TurboMindTisModel - -with read_base(): - # choose a list of datasets - from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets - from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets - from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets - from opencompass.configs.datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets - from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets - from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets - from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets - from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets - from opencompass.configs.datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets - # and output the results in a choosen format - from opencompass.configs.summarizers.medium import summarizer - -datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) - - -meta_template = dict( - round=[ - dict(role='HUMAN', begin='<|User|>:', end='\n'), - dict(role='BOT', begin='<|Bot|>:', end='\n', generate=True), - ], - eos_token_id=103028) - -models = [ - dict( - type=TurboMindTisModel, - abbr='internlm-chat-20b-turbomind', - path='internlm', 
- tis_addr='0.0.0.0:33337', - max_out_len=100, - max_seq_len=2048, - batch_size=8, - meta_template=meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), - ) -] diff --git a/configs/eval_internlm_turbomind_tis.py b/configs/eval_internlm_turbomind_tis.py deleted file mode 100644 index 98914fa47..000000000 --- a/configs/eval_internlm_turbomind_tis.py +++ /dev/null @@ -1,28 +0,0 @@ -from mmengine.config import read_base -from opencompass.models.turbomind_tis import TurboMindTisModel - -with read_base(): - # choose a list of datasets - from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets - from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets - from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets - from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets - from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets - from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets - # and output the results in a choosen format - from opencompass.configs.summarizers.medium import summarizer - -datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) - -models = [ - dict( - type=TurboMindTisModel, - abbr='internlm-chat-20b-turbomind', - path='internlm', - tis_addr='0.0.0.0:33337', - max_out_len=100, - max_seq_len=2048, - batch_size=8, - run_cfg=dict(num_gpus=1, num_procs=1), - ) -] diff --git a/configs/models/bailing_api/bailing-lite-0830.py b/configs/models/bailing_api/bailing-lite-0830.py new file mode 100644 index 000000000..88053ce98 --- /dev/null +++ b/configs/models/bailing_api/bailing-lite-0830.py @@ -0,0 +1,30 @@ +from opencompass.models import BailingAPI + +api_meta_template = dict( + round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=False), + ], + reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')], +) + +models = [ + dict( + path='Bailing-Lite-0830', + token='', # set your key here or in environment variable BAILING_API_KEY + url='https://bailingchat.alipay.com/chat/completions', + type=BailingAPI, + meta_template=api_meta_template, + query_per_second=1, + max_seq_len=4096, + batch_size=1, + generation_kwargs={ + 'temperature': 0.4, + 'top_p': 1.0, + 'top_k': -1, + 'n': 1, + 'logprobs': 1, + 'use_beam_search': False, + }, + ), +] diff --git a/configs/models/bailing_api/bailing-pro-0920.py b/configs/models/bailing_api/bailing-pro-0920.py new file mode 100644 index 000000000..db69b263e --- /dev/null +++ b/configs/models/bailing_api/bailing-pro-0920.py @@ -0,0 +1,30 @@ +from opencompass.models import BailingAPI + +api_meta_template = dict( + round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=False), + ], + reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')], +) + +models = [ + dict( + path='Bailing-Pro-0920', + token='', # set your key here or in environment variable BAILING_API_KEY + url='https://bailingchat.alipay.com/chat/completions', + type=BailingAPI, + meta_template=api_meta_template, + query_per_second=1, + max_seq_len=4096, + batch_size=1, + generation_kwargs={ + 'temperature': 0.4, + 'top_p': 1.0, + 'top_k': -1, + 'n': 1, + 'logprobs': 1, + 'use_beam_search': False, + }, + ), +] diff --git a/configs/models/chatglm/lmdeploy_glm4_9b_chat.py b/configs/models/chatglm/lmdeploy_glm4_9b_chat.py index 2f8218a62..c5cb8c4d5 100644 --- a/configs/models/chatglm/lmdeploy_glm4_9b_chat.py +++ 
b/configs/models/chatglm/lmdeploy_glm4_9b_chat.py @@ -6,9 +6,9 @@ abbr='glm-4-9b-chat-turbomind', path='THUDM/glm-4-9b-chat', engine_config=dict(max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048), max_seq_len=8192, - max_out_len=1024, + max_out_len=2048, batch_size=16, run_cfg=dict(num_gpus=1), ) diff --git a/configs/models/deepseek/lmdeploy_deepseek_67b_chat.py b/configs/models/deepseek/lmdeploy_deepseek_67b_chat.py index e369e6e12..67624eb89 100644 --- a/configs/models/deepseek/lmdeploy_deepseek_67b_chat.py +++ b/configs/models/deepseek/lmdeploy_deepseek_67b_chat.py @@ -7,8 +7,8 @@ path='deepseek-ai/deepseek-llm-67b-chat', engine_config=dict(max_batch_size=16, tp=4), gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9), - max_seq_len=7168, - max_out_len=1024, + max_seq_len=8192, + max_out_len=2048, batch_size=16, run_cfg=dict(num_gpus=4), ) diff --git a/configs/models/deepseek/lmdeploy_deepseek_7b_chat.py b/configs/models/deepseek/lmdeploy_deepseek_7b_chat.py index 26aa2afce..2c108cc13 100644 --- a/configs/models/deepseek/lmdeploy_deepseek_7b_chat.py +++ b/configs/models/deepseek/lmdeploy_deepseek_7b_chat.py @@ -7,8 +7,8 @@ path='deepseek-ai/deepseek-llm-7b-chat', engine_config=dict(max_batch_size=16, tp=1), gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9), - max_seq_len=7168, - max_out_len=1024, + max_seq_len=8192, + max_out_len=2048, batch_size=16, run_cfg=dict(num_gpus=1), ) diff --git a/configs/models/hf_internlm/lmdeploy_internlm2_5_1_8b_chat.py b/configs/models/hf_internlm/lmdeploy_internlm2_5_1_8b_chat.py index 5d5c257b1..cf4691f16 100644 --- a/configs/models/hf_internlm/lmdeploy_internlm2_5_1_8b_chat.py +++ b/configs/models/hf_internlm/lmdeploy_internlm2_5_1_8b_chat.py @@ -5,10 +5,10 @@ type=TurboMindModelwithChatTemplate, abbr='internlm2_5-1_8b-chat-turbomind', path='internlm/internlm2_5-1_8b-chat', - engine_config=dict(session_len=8192, max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048), - max_seq_len=8192, - max_out_len=2048, + engine_config=dict(session_len=16384, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=1), ) diff --git a/configs/models/hf_internlm/lmdeploy_internlm2_5_20b_chat.py b/configs/models/hf_internlm/lmdeploy_internlm2_5_20b_chat.py index f1bb1b081..7fb521618 100644 --- a/configs/models/hf_internlm/lmdeploy_internlm2_5_20b_chat.py +++ b/configs/models/hf_internlm/lmdeploy_internlm2_5_20b_chat.py @@ -5,10 +5,10 @@ type=TurboMindModelwithChatTemplate, abbr='internlm2_5-20b-chat-turbomind', path='internlm/internlm2_5-20b-chat', - engine_config=dict(session_len=8192, max_batch_size=16, tp=2), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048), - max_seq_len=8192, - max_out_len=2048, + engine_config=dict(session_len=16384, max_batch_size=16, tp=2), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=2), ) diff --git a/configs/models/hf_internlm/lmdeploy_internlm2_5_7b_chat.py b/configs/models/hf_internlm/lmdeploy_internlm2_5_7b_chat.py index 75fb93713..8dce26843 100644 --- a/configs/models/hf_internlm/lmdeploy_internlm2_5_7b_chat.py +++ b/configs/models/hf_internlm/lmdeploy_internlm2_5_7b_chat.py @@ -5,10 +5,10 @@ 
type=TurboMindModelwithChatTemplate, abbr='internlm2_5-7b-chat-turbomind', path='internlm/internlm2_5-7b-chat', - engine_config=dict(session_len=7168, max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + engine_config=dict(session_len=16384, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=1), ) diff --git a/configs/models/hf_internlm/lmdeploy_internlm2_chat_1_8b.py b/configs/models/hf_internlm/lmdeploy_internlm2_chat_1_8b.py index 9c358d5a6..f5df7926d 100644 --- a/configs/models/hf_internlm/lmdeploy_internlm2_chat_1_8b.py +++ b/configs/models/hf_internlm/lmdeploy_internlm2_chat_1_8b.py @@ -5,10 +5,10 @@ type=TurboMindModelwithChatTemplate, abbr='internlm2-chat-1.8b-turbomind', path='internlm/internlm2-chat-1_8b', - engine_config=dict(session_len=7168, max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + engine_config=dict(session_len=8192, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=8192, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=1), ) diff --git a/configs/models/hf_internlm/lmdeploy_internlm2_chat_20b.py b/configs/models/hf_internlm/lmdeploy_internlm2_chat_20b.py index 443715494..23f35636c 100644 --- a/configs/models/hf_internlm/lmdeploy_internlm2_chat_20b.py +++ b/configs/models/hf_internlm/lmdeploy_internlm2_chat_20b.py @@ -5,10 +5,10 @@ type=TurboMindModelwithChatTemplate, abbr='internlm2-chat-20b-turbomind', path='internlm/internlm2-chat-20b', - engine_config=dict(session_len=7168, max_batch_size=16, tp=2), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + engine_config=dict(session_len=8192, max_batch_size=16, tp=2), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=8192, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=2), ) diff --git a/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py b/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py index 82ad2e46a..38ea39d7d 100644 --- a/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py +++ b/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py @@ -1,15 +1,24 @@ from opencompass.models import TurboMindModelwithChatTemplate + models = [ dict( type=TurboMindModelwithChatTemplate, - abbr='internlm2-chat-7b-turbomind', + abbr=f'internlm2-chat-7b-lmdeploy', path='internlm/internlm2-chat-7b', - engine_config=dict(session_len=7168, max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, - batch_size=16, + # inference backend of LMDeploy. It can be either 'turbomind' or 'pytorch'. 
+ # If the model is not supported by 'turbomind', it will fallback to + # 'pytorch' + backend='turbomind', + # For the detailed engine config and generation config, please refer to + # https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/messages.py + engine_config=dict(tp=1), + gen_config=dict(do_sample=False), + max_seq_len=8192, + max_out_len=4096, + # the max number of prompts that LMDeploy receives + # in `generate` function + batch_size=5000, run_cfg=dict(num_gpus=1), ) ] diff --git a/configs/models/hf_internlm/lmdeploy_internlm_chat_20b.py b/configs/models/hf_internlm/lmdeploy_internlm_chat_20b.py index 8718a6cfc..e9af5578b 100644 --- a/configs/models/hf_internlm/lmdeploy_internlm_chat_20b.py +++ b/configs/models/hf_internlm/lmdeploy_internlm_chat_20b.py @@ -6,9 +6,9 @@ abbr='internlm-chat-20b-turbomind', path='internlm/internlm-chat-20b', engine_config=dict(session_len=4096, max_batch_size=16, tp=2), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048), max_seq_len=4096, - max_out_len=1024, + max_out_len=2048, batch_size=16, run_cfg=dict(num_gpus=2), ) diff --git a/configs/models/hf_internlm/lmdeploy_internlm_chat_7b.py b/configs/models/hf_internlm/lmdeploy_internlm_chat_7b.py index ea61313af..50656a5f8 100644 --- a/configs/models/hf_internlm/lmdeploy_internlm_chat_7b.py +++ b/configs/models/hf_internlm/lmdeploy_internlm_chat_7b.py @@ -6,9 +6,9 @@ abbr='internlm-chat-7b-turbomind', path='internlm/internlm-chat-7b', engine_config=dict(session_len=4096, max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048), max_seq_len=4096, - max_out_len=1024, + max_out_len=2048, batch_size=16, run_cfg=dict(num_gpus=1), ) diff --git a/configs/models/hf_llama/hf_llama3_1_70b_instruct.py b/configs/models/hf_llama/hf_llama3_1_70b_instruct.py index 4a17de935..c7527bb53 100644 --- a/configs/models/hf_llama/hf_llama3_1_70b_instruct.py +++ b/configs/models/hf_llama/hf_llama3_1_70b_instruct.py @@ -5,7 +5,7 @@ type=HuggingFacewithChatTemplate, abbr='llama-3_1-70b-instruct-hf', path='meta-llama/Meta-Llama-3.1-70B-Instruct', - max_out_len=1024, + max_out_len=4096, batch_size=8, run_cfg=dict(num_gpus=4), stop_words=['<|end_of_text|>', '<|eot_id|>'], diff --git a/configs/models/hf_llama/hf_llama3_1_8b.py b/configs/models/hf_llama/hf_llama3_1_8b.py new file mode 100644 index 000000000..a41e1ddfc --- /dev/null +++ b/configs/models/hf_llama/hf_llama3_1_8b.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFaceBaseModel + +models = [ + dict( + type=HuggingFaceBaseModel, + abbr='llama-3_1-8b-hf', + path='meta-llama/Meta-Llama-3.1-8B-Instruct', + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/hf_llama/lmdeploy_llama2_13b_chat.py b/configs/models/hf_llama/lmdeploy_llama2_13b_chat.py index cb42cb294..cacdec9a5 100644 --- a/configs/models/hf_llama/lmdeploy_llama2_13b_chat.py +++ b/configs/models/hf_llama/lmdeploy_llama2_13b_chat.py @@ -6,9 +6,9 @@ abbr='llama-2-13b-chat-turbomind', path='meta-llama/Llama-2-13b-chat-hf', engine_config=dict(max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048), max_seq_len=4096, - max_out_len=1024, + max_out_len=2048, batch_size=16, run_cfg=dict(num_gpus=1), ) diff --git 
a/configs/models/hf_llama/lmdeploy_llama2_70b_chat.py b/configs/models/hf_llama/lmdeploy_llama2_70b_chat.py index d6c69c6f9..b850106b3 100644 --- a/configs/models/hf_llama/lmdeploy_llama2_70b_chat.py +++ b/configs/models/hf_llama/lmdeploy_llama2_70b_chat.py @@ -6,9 +6,9 @@ abbr='llama-2-70b-chat-turbomind', path='meta-llama/Llama-2-70b-chat-hf', engine_config=dict(max_batch_size=16, tp=4), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048), max_seq_len=4096, - max_out_len=1024, + max_out_len=2048, batch_size=16, run_cfg=dict(num_gpus=4), ) diff --git a/configs/models/hf_llama/lmdeploy_llama2_7b_chat.py b/configs/models/hf_llama/lmdeploy_llama2_7b_chat.py index f520ce8b3..aa3452488 100644 --- a/configs/models/hf_llama/lmdeploy_llama2_7b_chat.py +++ b/configs/models/hf_llama/lmdeploy_llama2_7b_chat.py @@ -6,9 +6,9 @@ abbr='llama-2-7b-chat-turbomind', path='meta-llama/Llama-2-7b-chat-hf', engine_config=dict(max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048), max_seq_len=4096, - max_out_len=1024, + max_out_len=2048, batch_size=16, run_cfg=dict(num_gpus=1), ) diff --git a/configs/models/hf_llama/lmdeploy_llama3_1_70b_instruct.py b/configs/models/hf_llama/lmdeploy_llama3_1_70b_instruct.py index 23f9bc2a1..9674169f5 100644 --- a/configs/models/hf_llama/lmdeploy_llama3_1_70b_instruct.py +++ b/configs/models/hf_llama/lmdeploy_llama3_1_70b_instruct.py @@ -6,9 +6,9 @@ abbr='llama-3_1-70b-instruct-turbomind', path='meta-llama/Meta-Llama-3.1-70B-Instruct', engine_config=dict(max_batch_size=16, tp=4), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=4), stop_words=['<|end_of_text|>', '<|eot_id|>'], diff --git a/configs/models/hf_llama/lmdeploy_llama3_1_8b_instruct.py b/configs/models/hf_llama/lmdeploy_llama3_1_8b_instruct.py index 429dfec72..2754eb835 100644 --- a/configs/models/hf_llama/lmdeploy_llama3_1_8b_instruct.py +++ b/configs/models/hf_llama/lmdeploy_llama3_1_8b_instruct.py @@ -6,9 +6,9 @@ abbr='llama-3_1-8b-instruct-turbomind', path='meta-llama/Meta-Llama-3.1-8B-Instruct', engine_config=dict(max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=1), stop_words=['<|end_of_text|>', '<|eot_id|>'], diff --git a/configs/models/hf_llama/lmdeploy_llama3_70b_instruct.py b/configs/models/hf_llama/lmdeploy_llama3_70b_instruct.py index 333dc0153..12fc944c7 100644 --- a/configs/models/hf_llama/lmdeploy_llama3_70b_instruct.py +++ b/configs/models/hf_llama/lmdeploy_llama3_70b_instruct.py @@ -6,9 +6,9 @@ abbr='llama-3-70b-instruct-turbomind', path='meta-llama/Meta-Llama-3-70B-Instruct', engine_config=dict(max_batch_size=16, tp=4), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=4), 
stop_words=['<|end_of_text|>', '<|eot_id|>'], diff --git a/configs/models/hf_llama/lmdeploy_llama3_8b_instruct.py b/configs/models/hf_llama/lmdeploy_llama3_8b_instruct.py index cc5b3bd45..5a6431b7a 100644 --- a/configs/models/hf_llama/lmdeploy_llama3_8b_instruct.py +++ b/configs/models/hf_llama/lmdeploy_llama3_8b_instruct.py @@ -6,9 +6,9 @@ abbr='llama-3-8b-instruct-turbomind', path='meta-llama/Meta-Llama-3-8B-Instruct', engine_config=dict(max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=1), stop_words=['<|end_of_text|>', '<|eot_id|>'], diff --git a/configs/models/mistral/lmdeploy_mistral_7b_instruct_v0_3.py b/configs/models/mistral/lmdeploy_mistral_7b_instruct_v0_3.py new file mode 100644 index 000000000..4c867b602 --- /dev/null +++ b/configs/models/mistral/lmdeploy_mistral_7b_instruct_v0_3.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='mistral-7b-instruct-v0.3-turbomind', + path='mistralai/Mistral-7B-Instruct-v0.3', + engine_config=dict(session_len=32768, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=32768, + max_out_len=4096, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/mistral/lmdeploy_mixtral_large_instruct_2407.py b/configs/models/mistral/lmdeploy_mixtral_large_instruct_2407.py new file mode 100644 index 000000000..e79a1f73a --- /dev/null +++ b/configs/models/mistral/lmdeploy_mixtral_large_instruct_2407.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='mixtral-large-instruct-2407-turbomind', + path='mistralai/Mistral-Large-Instruct-2407', + engine_config=dict(session_len=32768, max_batch_size=16, tp=4), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=32768, + max_out_len=4096, + batch_size=16, + run_cfg=dict(num_gpus=4), + ) +] diff --git a/configs/models/openai/o1_mini_2024_09_12.py b/configs/models/openai/o1_mini_2024_09_12.py new file mode 100644 index 000000000..331ecf319 --- /dev/null +++ b/configs/models/openai/o1_mini_2024_09_12.py @@ -0,0 +1,20 @@ +from opencompass.models import OpenAISDK + +api_meta_template = dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), +], ) + +models = [ + dict( + abbr='o1-mini-2024-09-12', + type=OpenAISDK, + path='o1-mini-2024-09-12', + key= + 'ENV', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well + meta_template=api_meta_template, + query_per_second=1, + batch_size=1, + temperature=1, + max_completion_tokens=8192), # you can change it for large reasoning inference cost, according to: https://platform.openai.com/docs/guides/reasoning +] diff --git a/configs/models/openai/o1_preview_2024_09_12.py b/configs/models/openai/o1_preview_2024_09_12.py new file mode 100644 index 000000000..9dff10371 --- /dev/null +++ b/configs/models/openai/o1_preview_2024_09_12.py @@ -0,0 +1,20 @@ +from opencompass.models import OpenAISDK + +api_meta_template = dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), +], ) + 
+models = [ + dict( + abbr='o1-preview-2024-09-12', + type=OpenAISDK, + path='o1-preview-2024-09-12', + key= + 'ENV', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well + meta_template=api_meta_template, + query_per_second=1, + batch_size=1, + temperature=1, + max_completion_tokens=8192), # you can change it for large reasoning inference cost, according to: https://platform.openai.com/docs/guides/reasoning +] diff --git a/configs/models/qwen/lmdeploy_qwen1_5_110b_chat.py b/configs/models/qwen/lmdeploy_qwen1_5_110b_chat.py index 9b92b8140..bc123b405 100644 --- a/configs/models/qwen/lmdeploy_qwen1_5_110b_chat.py +++ b/configs/models/qwen/lmdeploy_qwen1_5_110b_chat.py @@ -5,10 +5,10 @@ type=TurboMindModelwithChatTemplate, abbr='qwen1.5-110b-chat-turbomind', path='Qwen/Qwen1.5-110B-Chat', - engine_config=dict(session_len=7168, max_batch_size=8, tp=4), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + engine_config=dict(session_len=16384, max_batch_size=8, tp=4), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=8, run_cfg=dict(num_gpus=4), stop_words=['<|im_end|>', '<|im_start|>'], diff --git a/configs/models/qwen/lmdeploy_qwen1_5_14b_chat.py b/configs/models/qwen/lmdeploy_qwen1_5_14b_chat.py index d2b85c2aa..5f0d54b96 100644 --- a/configs/models/qwen/lmdeploy_qwen1_5_14b_chat.py +++ b/configs/models/qwen/lmdeploy_qwen1_5_14b_chat.py @@ -5,10 +5,10 @@ type=TurboMindModelwithChatTemplate, abbr='qwen1.5-14b-chat-turbomind', path='Qwen/Qwen1.5-14B-Chat', - engine_config=dict(session_len=7168, max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + engine_config=dict(session_len=16384, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=1), stop_words=['<|im_end|>', '<|im_start|>'], diff --git a/configs/models/qwen/lmdeploy_qwen1_5_1_8b_chat.py b/configs/models/qwen/lmdeploy_qwen1_5_1_8b_chat.py index ff28ac0be..803ff3336 100644 --- a/configs/models/qwen/lmdeploy_qwen1_5_1_8b_chat.py +++ b/configs/models/qwen/lmdeploy_qwen1_5_1_8b_chat.py @@ -5,10 +5,10 @@ type=TurboMindModelwithChatTemplate, abbr='qwen1.5-1.8b-chat-turbomind', path='Qwen/Qwen1.5-1.8B-Chat', - engine_config=dict(session_len=7168, max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + engine_config=dict(session_len=16384, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=1), stop_words=['<|im_end|>', '<|im_start|>'], diff --git a/configs/models/qwen/lmdeploy_qwen1_5_32b_chat.py b/configs/models/qwen/lmdeploy_qwen1_5_32b_chat.py index 1196548a0..96fd1e43c 100644 --- a/configs/models/qwen/lmdeploy_qwen1_5_32b_chat.py +++ b/configs/models/qwen/lmdeploy_qwen1_5_32b_chat.py @@ -5,10 +5,10 @@ type=TurboMindModelwithChatTemplate, abbr='qwen1.5-32b-chat-turbomind', path='Qwen/Qwen1.5-32B-Chat', - engine_config=dict(session_len=7168, max_batch_size=16, tp=2), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + engine_config=dict(session_len=16384,
max_batch_size=16, tp=2), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=2), stop_words=['<|im_end|>', '<|im_start|>'], diff --git a/configs/models/qwen/lmdeploy_qwen1_5_4b_chat.py b/configs/models/qwen/lmdeploy_qwen1_5_4b_chat.py index bde14a295..f9fcc3fb9 100644 --- a/configs/models/qwen/lmdeploy_qwen1_5_4b_chat.py +++ b/configs/models/qwen/lmdeploy_qwen1_5_4b_chat.py @@ -5,10 +5,10 @@ type=TurboMindModelwithChatTemplate, abbr='qwen1.5-4b-chat-turbomind', path='Qwen/Qwen1.5-4B-Chat', - engine_config=dict(session_len=7168, max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + engine_config=dict(session_len=16384, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=1), stop_words=['<|im_end|>', '<|im_start|>'], diff --git a/configs/models/qwen/lmdeploy_qwen1_5_72b_chat.py b/configs/models/qwen/lmdeploy_qwen1_5_72b_chat.py index 38175eaf3..64a5f7cb6 100644 --- a/configs/models/qwen/lmdeploy_qwen1_5_72b_chat.py +++ b/configs/models/qwen/lmdeploy_qwen1_5_72b_chat.py @@ -5,10 +5,10 @@ type=TurboMindModelwithChatTemplate, abbr='qwen1.5-72b-chat-turbomind', path='Qwen/Qwen1.5-72B-Chat', - engine_config=dict(session_len=7168, max_batch_size=16, tp=4), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + engine_config=dict(session_len=16384, max_batch_size=16, tp=4), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=4), stop_words=['<|im_end|>', '<|im_start|>'], diff --git a/configs/models/qwen/lmdeploy_qwen1_5_7b_chat.py b/configs/models/qwen/lmdeploy_qwen1_5_7b_chat.py index ca733c0b2..1ab393036 100644 --- a/configs/models/qwen/lmdeploy_qwen1_5_7b_chat.py +++ b/configs/models/qwen/lmdeploy_qwen1_5_7b_chat.py @@ -5,10 +5,10 @@ type=TurboMindModelwithChatTemplate, abbr='qwen1.5-7b-chat-turbomind', path='Qwen/Qwen1.5-7B-Chat', - engine_config=dict(session_len=7168, max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + engine_config=dict(session_len=16384, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=1), stop_words=['<|im_end|>', '<|im_start|>'], diff --git a/configs/models/qwen/lmdeploy_qwen2_1_5b_instruct.py b/configs/models/qwen/lmdeploy_qwen2_1_5b_instruct.py index 502de1876..f050ca382 100644 --- a/configs/models/qwen/lmdeploy_qwen2_1_5b_instruct.py +++ b/configs/models/qwen/lmdeploy_qwen2_1_5b_instruct.py @@ -5,10 +5,10 @@ type=TurboMindModelwithChatTemplate, abbr='qwen2-1.5b-instruct-turbomind', path='Qwen/Qwen2-1.5B-Instruct', - engine_config=dict(session_len=7168, max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + engine_config=dict(session_len=16384, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=1), ) diff --git 
a/configs/models/qwen/lmdeploy_qwen2_72b_instruct.py b/configs/models/qwen/lmdeploy_qwen2_72b_instruct.py index 69ecb7981..c29482b5b 100644 --- a/configs/models/qwen/lmdeploy_qwen2_72b_instruct.py +++ b/configs/models/qwen/lmdeploy_qwen2_72b_instruct.py @@ -5,10 +5,10 @@ type=TurboMindModelwithChatTemplate, abbr='qwen2-72b-instruct-turbomind', path='Qwen/Qwen2-72B-Instruct', - engine_config=dict(session_len=7168, max_batch_size=16, tp=4), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + engine_config=dict(session_len=16384, max_batch_size=16, tp=4), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=4), ) diff --git a/configs/models/qwen/lmdeploy_qwen2_7b_instruct.py b/configs/models/qwen/lmdeploy_qwen2_7b_instruct.py index 4dff85e06..05fa25c5e 100644 --- a/configs/models/qwen/lmdeploy_qwen2_7b_instruct.py +++ b/configs/models/qwen/lmdeploy_qwen2_7b_instruct.py @@ -5,10 +5,10 @@ type=TurboMindModelwithChatTemplate, abbr='qwen2-7b-instruct-turbomind', path='Qwen/Qwen2-7B-Instruct', - engine_config=dict(session_len=7168, max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + engine_config=dict(session_len=16384, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=1), ) diff --git a/configs/models/qwen2_5/hf_qwen2_5_0_5b_instruct.py b/configs/models/qwen2_5/hf_qwen2_5_0_5b_instruct.py new file mode 100644 index 000000000..35289bb10 --- /dev/null +++ b/configs/models/qwen2_5/hf_qwen2_5_0_5b_instruct.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='qwen2.5-0.5b-instruct-hf', + path='Qwen/Qwen2.5-0.5B-Instruct', + max_out_len=4096, + batch_size=8, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen2_5/hf_qwen2_5_14b_instruct.py b/configs/models/qwen2_5/hf_qwen2_5_14b_instruct.py new file mode 100644 index 000000000..af5a8816a --- /dev/null +++ b/configs/models/qwen2_5/hf_qwen2_5_14b_instruct.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='qwen2.5-14b-instruct-hf', + path='Qwen/Qwen2.5-14B-Instruct', + max_out_len=4096, + batch_size=8, + run_cfg=dict(num_gpus=2), + ) +] diff --git a/configs/models/qwen2_5/hf_qwen2_5_1_5b_instruct.py b/configs/models/qwen2_5/hf_qwen2_5_1_5b_instruct.py new file mode 100644 index 000000000..52da52895 --- /dev/null +++ b/configs/models/qwen2_5/hf_qwen2_5_1_5b_instruct.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='qwen2.5-1.5b-instruct-hf', + path='Qwen/Qwen2.5-1.5B-Instruct', + max_out_len=4096, + batch_size=8, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen2_5/hf_qwen2_5_32b_instruct.py b/configs/models/qwen2_5/hf_qwen2_5_32b_instruct.py new file mode 100644 index 000000000..f2051f810 --- /dev/null +++ b/configs/models/qwen2_5/hf_qwen2_5_32b_instruct.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='qwen2.5-32b-instruct-hf', +
path='Qwen/Qwen2.5-32B-Instruct', + max_out_len=4096, + batch_size=8, + run_cfg=dict(num_gpus=2), + ) +] diff --git a/configs/models/qwen2_5/hf_qwen2_5_3b_instruct.py b/configs/models/qwen2_5/hf_qwen2_5_3b_instruct.py new file mode 100644 index 000000000..88a101994 --- /dev/null +++ b/configs/models/qwen2_5/hf_qwen2_5_3b_instruct.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='qwen2.5-3b-instruct-hf', + path='Qwen/Qwen2.5-3B-Instruct', + max_out_len=4096, + batch_size=8, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen2_5/hf_qwen2_5_72b_instruct.py b/configs/models/qwen2_5/hf_qwen2_5_72b_instruct.py new file mode 100644 index 000000000..c7fcf53ca --- /dev/null +++ b/configs/models/qwen2_5/hf_qwen2_5_72b_instruct.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='qwen2.5-72b-instruct-hf', + path='Qwen/Qwen2.5-72B-Instruct', + max_out_len=4096, + batch_size=8, + run_cfg=dict(num_gpus=4), + ) +] diff --git a/configs/models/qwen2_5/hf_qwen2_5_7b_instruct.py b/configs/models/qwen2_5/hf_qwen2_5_7b_instruct.py new file mode 100644 index 000000000..a9895be67 --- /dev/null +++ b/configs/models/qwen2_5/hf_qwen2_5_7b_instruct.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='qwen2.5-7b-instruct-hf', + path='Qwen/Qwen2.5-7B-Instruct', + max_out_len=4096, + batch_size=8, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen2_5/lmdeploy_qwen2_5_0_5b_instruct.py b/configs/models/qwen2_5/lmdeploy_qwen2_5_0_5b_instruct.py new file mode 100644 index 000000000..145549630 --- /dev/null +++ b/configs/models/qwen2_5/lmdeploy_qwen2_5_0_5b_instruct.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='qwen2.5-0.5b-instruct-turbomind', + path='Qwen/Qwen2.5-0.5B-Instruct', + engine_config=dict(session_len=16384, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen2_5/lmdeploy_qwen2_5_14b_instruct.py b/configs/models/qwen2_5/lmdeploy_qwen2_5_14b_instruct.py new file mode 100644 index 000000000..364690288 --- /dev/null +++ b/configs/models/qwen2_5/lmdeploy_qwen2_5_14b_instruct.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='qwen2.5-14b-instruct-turbomind', + path='Qwen/Qwen2.5-14B-Instruct', + engine_config=dict(session_len=16384, max_batch_size=16, tp=2), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, + batch_size=16, + run_cfg=dict(num_gpus=2), + ) +] diff --git a/configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b.py b/configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b.py new file mode 100644 index 000000000..a2661c9fd --- /dev/null +++ b/configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='qwen2.5-1.5b-turbomind', + path='Qwen/Qwen2.5-1.5B', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9,
max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b_instruct.py b/configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b_instruct.py new file mode 100644 index 000000000..9b2f6c5fd --- /dev/null +++ b/configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b_instruct.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='qwen2.5-1.5b-instruct-turbomind', + path='Qwen/Qwen2.5-1.5B-Instruct', + engine_config=dict(session_len=16384, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen2_5/lmdeploy_qwen2_5_32b_instruct.py b/configs/models/qwen2_5/lmdeploy_qwen2_5_32b_instruct.py new file mode 100644 index 000000000..81e0cb0f1 --- /dev/null +++ b/configs/models/qwen2_5/lmdeploy_qwen2_5_32b_instruct.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='qwen2.5-32b-instruct-turbomind', + path='Qwen/Qwen2.5-32B-Instruct', + engine_config=dict(session_len=16384, max_batch_size=16, tp=2), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, + batch_size=16, + run_cfg=dict(num_gpus=2), + ) +] diff --git a/configs/models/qwen2_5/lmdeploy_qwen2_5_3b_instruct.py b/configs/models/qwen2_5/lmdeploy_qwen2_5_3b_instruct.py new file mode 100644 index 000000000..bab4b32ea --- /dev/null +++ b/configs/models/qwen2_5/lmdeploy_qwen2_5_3b_instruct.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='qwen2.5-3b-instruct-turbomind', + path='Qwen/Qwen2.5-3B-Instruct', + engine_config=dict(session_len=16384, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen2_5/lmdeploy_qwen2_5_72b_instruct.py b/configs/models/qwen2_5/lmdeploy_qwen2_5_72b_instruct.py new file mode 100644 index 000000000..8b61daa8a --- /dev/null +++ b/configs/models/qwen2_5/lmdeploy_qwen2_5_72b_instruct.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='qwen2.5-72b-instruct-turbomind', + path='Qwen/Qwen2.5-72B-Instruct', + engine_config=dict(session_len=16384, max_batch_size=16, tp=4), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, + batch_size=16, + run_cfg=dict(num_gpus=4), + ) +] diff --git a/configs/models/qwen2_5/lmdeploy_qwen2_5_7b.py b/configs/models/qwen2_5/lmdeploy_qwen2_5_7b.py new file mode 100644 index 000000000..b2d7aa0c5 --- /dev/null +++ b/configs/models/qwen2_5/lmdeploy_qwen2_5_7b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='qwen2.5-7b-turbomind', + path='Qwen/Qwen2.5-7B', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + 
) +] diff --git a/configs/models/qwen2_5/lmdeploy_qwen2_5_7b_instruct.py b/configs/models/qwen2_5/lmdeploy_qwen2_5_7b_instruct.py new file mode 100644 index 000000000..54732521f --- /dev/null +++ b/configs/models/qwen2_5/lmdeploy_qwen2_5_7b_instruct.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='qwen2.5-7b-instruct-turbomind', + path='Qwen/Qwen2.5-7B-Instruct', + engine_config=dict(session_len=16384, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen2_5/vllm_qwen2_5_0_5b_instruct.py b/configs/models/qwen2_5/vllm_qwen2_5_0_5b_instruct.py new file mode 100644 index 000000000..effd12018 --- /dev/null +++ b/configs/models/qwen2_5/vllm_qwen2_5_0_5b_instruct.py @@ -0,0 +1,14 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='qwen2.5-0.5b-instruct-vllm', + path='Qwen/Qwen2.5-0.5B-Instruct', + model_kwargs=dict(tensor_parallel_size=1, gpu_memory_utilization=0.5), + max_out_len=4096, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen2_5/vllm_qwen2_5_14b_instruct.py b/configs/models/qwen2_5/vllm_qwen2_5_14b_instruct.py new file mode 100644 index 000000000..438279f15 --- /dev/null +++ b/configs/models/qwen2_5/vllm_qwen2_5_14b_instruct.py @@ -0,0 +1,14 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='qwen2.5-14b-instruct-vllm', + path='Qwen/Qwen2.5-14B-Instruct', + model_kwargs=dict(tensor_parallel_size=2), + max_out_len=4096, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=2), + ) +] diff --git a/configs/models/qwen2_5/vllm_qwen2_5_1_5b_instruct.py b/configs/models/qwen2_5/vllm_qwen2_5_1_5b_instruct.py new file mode 100644 index 000000000..16b7f809b --- /dev/null +++ b/configs/models/qwen2_5/vllm_qwen2_5_1_5b_instruct.py @@ -0,0 +1,14 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='qwen2.5-1.5b-instruct-vllm', + path='Qwen/Qwen2.5-1.5B-Instruct', + model_kwargs=dict(tensor_parallel_size=1, gpu_memory_utilization=0.5), + max_out_len=4096, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen2_5/vllm_qwen2_5_32b_instruct.py b/configs/models/qwen2_5/vllm_qwen2_5_32b_instruct.py new file mode 100644 index 000000000..58d518459 --- /dev/null +++ b/configs/models/qwen2_5/vllm_qwen2_5_32b_instruct.py @@ -0,0 +1,14 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='qwen2.5-32b-instruct-vllm', + path='Qwen/Qwen2.5-32B-Instruct', + model_kwargs=dict(tensor_parallel_size=2), + max_out_len=4096, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=2), + ) +] diff --git a/configs/models/qwen2_5/vllm_qwen2_5_3b_instruct.py b/configs/models/qwen2_5/vllm_qwen2_5_3b_instruct.py new file mode 100644 index 000000000..e24c7c1f8 --- /dev/null +++ b/configs/models/qwen2_5/vllm_qwen2_5_3b_instruct.py @@ -0,0 +1,14 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='qwen2.5-3b-instruct-vllm', + 
path='Qwen/Qwen2.5-3B-Instruct', + model_kwargs=dict(tensor_parallel_size=1, gpu_memory_utilization=0.5), + max_out_len=4096, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen2_5/vllm_qwen2_5_72b_instruct.py b/configs/models/qwen2_5/vllm_qwen2_5_72b_instruct.py new file mode 100644 index 000000000..c1a557901 --- /dev/null +++ b/configs/models/qwen2_5/vllm_qwen2_5_72b_instruct.py @@ -0,0 +1,14 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='qwen2_5-72b-instruct-vllm', + path='Qwen/Qwen2.5-72B-Instruct', + model_kwargs=dict(tensor_parallel_size=4), + max_out_len=4096, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=4), + ) +] diff --git a/configs/models/qwen2_5/vllm_qwen2_5_7b_instruct.py b/configs/models/qwen2_5/vllm_qwen2_5_7b_instruct.py new file mode 100644 index 000000000..25c38d0f8 --- /dev/null +++ b/configs/models/qwen2_5/vllm_qwen2_5_7b_instruct.py @@ -0,0 +1,14 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='qwen2.5-7b-instruct-vllm', + path='Qwen/Qwen2.5-7B-Instruct', + model_kwargs=dict(tensor_parallel_size=1), + max_out_len=4096, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/yi/lmdeploy_yi_1_5_34b_chat.py b/configs/models/yi/lmdeploy_yi_1_5_34b_chat.py new file mode 100644 index 000000000..d296a1008 --- /dev/null +++ b/configs/models/yi/lmdeploy_yi_1_5_34b_chat.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='yi-1.5-34b-chat-turbomind', + path='01-ai/Yi-1.5-34B-Chat', + engine_config=dict(session_len=4096, max_batch_size=16, tp=2), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048), + max_seq_len=4096, + max_out_len=2048, + batch_size=16, + run_cfg=dict(num_gpus=2), + ) +] diff --git a/configs/models/yi/lmdeploy_yi_1_5_6b_chat.py b/configs/models/yi/lmdeploy_yi_1_5_6b_chat.py new file mode 100644 index 000000000..eeaf8ea25 --- /dev/null +++ b/configs/models/yi/lmdeploy_yi_1_5_6b_chat.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='yi-1.5-6b-chat-turbomind', + path='01-ai/Yi-1.5-6B-Chat', + engine_config=dict(session_len=4096, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048), + max_seq_len=4096, + max_out_len=2048, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/yi/lmdeploy_yi_1_5_9b_chat.py b/configs/models/yi/lmdeploy_yi_1_5_9b_chat.py new file mode 100644 index 000000000..4e33ba232 --- /dev/null +++ b/configs/models/yi/lmdeploy_yi_1_5_9b_chat.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='yi-1.5-9b-chat-turbomind', + path='01-ai/Yi-1.5-9B-Chat', + engine_config=dict(session_len=4096, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048), + max_seq_len=4096, + max_out_len=2048, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/yi/lmdeploy_yi_34b_chat.py b/configs/models/yi/lmdeploy_yi_34b_chat.py new file mode 100644 index 000000000..5ed603a6d --- /dev/null +++ 
b/configs/models/yi/lmdeploy_yi_34b_chat.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='yi-34b-chat-turbomind', + path='01-ai/Yi-34B-Chat', + engine_config=dict(session_len=4096, max_batch_size=16, tp=2), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048), + max_seq_len=4096, + max_out_len=2048, + batch_size=16, + run_cfg=dict(num_gpus=2), + ) +] diff --git a/configs/models/yi/lmdeploy_yi_6b_chat.py b/configs/models/yi/lmdeploy_yi_6b_chat.py new file mode 100644 index 000000000..5c75bfa50 --- /dev/null +++ b/configs/models/yi/lmdeploy_yi_6b_chat.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='yi-6b-chat-turbomind', + path='01-ai/Yi-6B-Chat', + engine_config=dict(session_len=4096, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048), + max_seq_len=4096, + max_out_len=2048, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/summarizers/groups/cibench.py b/configs/summarizers/groups/cibench.py index f08bcba51..8c6650c26 100644 --- a/configs/summarizers/groups/cibench.py +++ b/configs/summarizers/groups/cibench.py @@ -3,7 +3,7 @@ _cibench_generation = ['cibench_generation/' + i for i in _cibench_generation_modules] cibench_summary_groups = [] _cibench_generation_weight = { - 'matplotlib': [223, 50, 1, 156], + 'matplotlib': [175, 2, 1, 156], 'pandas': [200, 45, 45, 38], 'pytorch': [69, 0, 8, 11], 'seaborn': [130, 0, 2, 106], diff --git a/docs/en/advanced_guides/evaluation_lmdeploy.md b/docs/en/advanced_guides/evaluation_lmdeploy.md new file mode 100644 index 000000000..bfacd4881 --- /dev/null +++ b/docs/en/advanced_guides/evaluation_lmdeploy.md @@ -0,0 +1,88 @@ +# Evaluation with LMDeploy + +We now support evaluation of models accelerated by the [LMDeploy](https://github.com/InternLM/lmdeploy). LMDeploy is a toolkit designed for compressing, deploying, and serving LLM. It has a remarkable inference performance. We now illustrate how to evaluate a model with the support of LMDeploy in OpenCompass. + +## Setup + +### Install OpenCompass + +Please follow the [instructions](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) to install the OpenCompass and prepare the evaluation datasets. + +### Install LMDeploy + +Install lmdeploy via pip (python 3.8+) + +```shell +pip install lmdeploy +``` + +The default prebuilt package is compiled on CUDA 12. However, if CUDA 11+ is required, you can install lmdeploy by: + +```shell +export LMDEPLOY_VERSION=0.6.0 +export PYTHON_VERSION=310 +pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 +``` + +## Evaluation + +When evaluating a model, it is necessary to prepare an evaluation configuration that specifies information such as the evaluation dataset, the model, and inference parameters. 
+ +Taking [internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b) as an example, the evaluation config is as follows: + +```python +# configure the dataset +from mmengine.config import read_base + + +with read_base(): + # choose a list of datasets + from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets + from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets + from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets + from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \ + gsm8k_datasets + # and output the results in a chosen format + from .summarizers.medium import summarizer + +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + +# configure lmdeploy +from opencompass.models import TurboMindModelwithChatTemplate + + + +# configure the model +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr=f'internlm2-chat-7b-lmdeploy', + # model path, which can be the address of a model repository on the Hugging Face Hub or a local path + path='internlm/internlm2-chat-7b', + # inference backend of LMDeploy. It can be either 'turbomind' or 'pytorch'. + # If the model is not supported by 'turbomind', it will fallback to + # 'pytorch' + backend='turbomind', + # For the detailed engine config and generation config, please refer to + # https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/messages.py + engine_config=dict(tp=1), + gen_config=dict(do_sample=False), + # the max size of the context window + max_seq_len=7168, + # the max number of new tokens + max_out_len=1024, + # the max number of prompts that LMDeploy receives + # in `generate` function + batch_size=5000, + run_cfg=dict(num_gpus=1), + ) +] +``` + +Place the aforementioned configuration in a file, such as "configs/eval_internlm2_lmdeploy.py". Then, in the home folder of OpenCompass, start evaluation by the following command: + +```shell +python run.py configs/eval_internlm2_lmdeploy.py -w outputs +``` + +You are expected to get the evaluation results after the inference and evaluation. diff --git a/docs/en/advanced_guides/evaluation_turbomind.md b/docs/en/advanced_guides/evaluation_turbomind.md deleted file mode 100644 index c1299f0b3..000000000 --- a/docs/en/advanced_guides/evaluation_turbomind.md +++ /dev/null @@ -1,78 +0,0 @@ -# Evaluation with LMDeploy - -We now support evaluation of models accelerated by the [LMDeploy](https://github.com/InternLM/lmdeploy). LMDeploy is a toolkit designed for compressing, deploying, and serving LLM. **TurboMind** is an efficient inference engine proposed by LMDeploy. OpenCompass is compatible with TurboMind. We now illustrate how to evaluate a model with the support of TurboMind in OpenCompass. - -## Setup - -### Install OpenCompass - -Please follow the [instructions](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) to install the OpenCompass and prepare the evaluation datasets. - -### Install LMDeploy - -Install lmdeploy via pip (python 3.8+) - -```shell -pip install lmdeploy -``` - -## Evaluation - -OpenCompass integrates turbomind's python API for evaluation. - -We take the InternLM-20B as example. 
Firstly, we prepare the evaluation config `configs/eval_internlm_turbomind.py`: - -```python -from mmengine.config import read_base -from opencompass.models.turbomind import TurboMindModel - - -with read_base(): - # choose a list of datasets - from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets - from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets - from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets - from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets - from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets - from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets - # and output the results in a chosen format - from .summarizers.medium import summarizer - -datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) - -# config for internlm-20b model -internlm_20b = dict( - type=TurboMindModel, - abbr='internlm-20b-turbomind', - path="internlm/internlm-20b", # this path should be same as in huggingface - engine_config=dict(session_len=2048, - max_batch_size=8, - rope_scaling_factor=1.0), - gen_config=dict(top_k=1, top_p=0.8, - temperature=1.0, - max_new_tokens=100), - max_out_len=100, - max_seq_len=2048, - batch_size=8, - concurrency=8, - run_cfg=dict(num_gpus=1, num_procs=1), - end_str='' - ) - -models = [internlm_20b] -``` - -Then, in the home folder of OpenCompass, start evaluation by the following command: - -```shell -python run.py configs/eval_internlm_turbomind.py -w outputs/turbomind/internlm-20b -``` - -You are expected to get the evaluation results after the inference and evaluation. - -**Note**: - -- If you want to pass more arguments for `engine_config`和`gen_config` in the evaluation config file, please refer to [TurbomindEngineConfig](https://lmdeploy.readthedocs.io/en/latest/inference/pipeline.html#turbomindengineconfig) - and [GenerationConfig](https://lmdeploy.readthedocs.io/en/latest/inference/pipeline.html#generationconfig) -- If you evaluate the InternLM Chat model, please use configuration file `eval_internlm_chat_turbomind.py` -- If you evaluate the InternLM 7B model, please modify `eval_internlm_turbomind.py` or `eval_internlm_chat_turbomind.py` by changing to the setting `models = [internlm_7b]` in the last line. diff --git a/docs/en/index.rst b/docs/en/index.rst index 2f04aaee4..fdad9c9e5 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -61,7 +61,7 @@ We always welcome *PRs* and *Issues* for the betterment of OpenCompass. advanced_guides/new_dataset.md advanced_guides/custom_dataset.md advanced_guides/new_model.md - advanced_guides/evaluation_turbomind.md + advanced_guides/evaluation_lmdeploy.md advanced_guides/evaluation_lightllm.md advanced_guides/accelerator_intro.md advanced_guides/code_eval.md diff --git a/docs/en/notes/news.md b/docs/en/notes/news.md index b848f6bfd..b4a56fa59 100644 --- a/docs/en/notes/news.md +++ b/docs/en/notes/news.md @@ -1,5 +1,13 @@ # News +- **\[2024.05.08\]** We supported the evaluation of 4 MoE models: [Mixtral-8x22B-v0.1](configs/models/mixtral/hf_mixtral_8x22b_v0_1.py), [Mixtral-8x22B-Instruct-v0.1](configs/models/mixtral/hf_mixtral_8x22b_instruct_v0_1.py), [Qwen1.5-MoE-A2.7B](configs/models/qwen/hf_qwen1_5_moe_a2_7b.py), [Qwen1.5-MoE-A2.7B-Chat](configs/models/qwen/hf_qwen1_5_moe_a2_7b_chat.py). Try them out now! 
+- **\[2024.04.30\]** We supported evaluating a model's compression efficiency by calculating its Bits per Character (BPC) metric on an [external corpora](configs/datasets/llm_compression/README.md) ([official paper](https://github.com/hkust-nlp/llm-compression-intelligence)). Check out the [llm-compression](configs/eval_llm_compression.py) evaluation config now! 🔥🔥🔥 +- **\[2024.04.29\]** We report the performance of several famous LLMs on the common benchmarks, welcome to [documentation](https://opencompass.readthedocs.io/en/latest/user_guides/corebench.html) for more information! 🔥🔥🔥. +- **\[2024.04.26\]** We deprecated the multi-madality evaluating function from OpenCompass, related implement has moved to [VLMEvalKit](https://github.com/open-compass/VLMEvalKit), welcome to use! 🔥🔥🔥. +- **\[2024.04.26\]** We supported the evaluation of [ArenaHard](configs/eval_subjective_arena_hard.py) welcome to try!🔥🔥🔥. +- **\[2024.04.22\]** We supported the evaluation of [LLaMA3](configs/models/hf_llama/hf_llama3_8b.py) 和 [LLaMA3-Instruct](configs/models/hf_llama/hf_llama3_8b_instruct.py), welcome to try! 🔥🔥🔥 +- **\[2024.02.29\]** We supported the MT-Bench, AlpacalEval and AlignBench, more information can be found [here](https://opencompass.readthedocs.io/en/latest/advanced_guides/subjective_evaluation.html) +- **\[2024.01.30\]** We release OpenCompass 2.0. Click [CompassKit](https://github.com/open-compass), [CompassHub](https://hub.opencompass.org.cn/home), and [CompassRank](https://rank.opencompass.org.cn/home) for more information ! - **\[2024.01.17\]** We supported the evaluation of [InternLM2](https://github.com/open-compass/opencompass/blob/main/configs/eval_internlm2_keyset.py) and [InternLM2-Chat](https://github.com/open-compass/opencompass/blob/main/configs/eval_internlm2_chat_keyset.py), InternLM2 showed extremely strong performance in these tests, welcome to try! - **\[2024.01.17\]** We supported the needle in a haystack test with multiple needles, more information can be found [here](https://opencompass.readthedocs.io/en/latest/advanced_guides/needleinahaystack_eval.html#id8). - **\[2023.12.28\]** We have enabled seamless evaluation of all models developed using [LLaMA2-Accessory](https://github.com/Alpha-VLLM/LLaMA2-Accessory), a powerful toolkit for comprehensive LLM development. @@ -24,7 +32,7 @@ - **\[2023.08.18\]** [Dataset card](https://opencompass.org.cn/dataset-detail/MMLU) is now online. Welcome new evaluation benchmark OpenCompass ! - **\[2023.08.11\]** [Model comparison](https://opencompass.org.cn/model-compare/GPT-4,ChatGPT,LLaMA-2-70B,LLaMA-65B) is now online. We hope this feature offers deeper insights! - **\[2023.08.11\]** We have supported [LEval](https://github.com/OpenLMLab/LEval). -- **\[2023.08.10\]** OpenCompass is compatible with [LMDeploy](https://github.com/InternLM/lmdeploy). Now you can follow this [instruction](https://opencompass.readthedocs.io/en/latest/advanced_guides/evaluation_turbomind.html#) to evaluate the accelerated models provide by the **Turbomind**. +- **\[2023.08.10\]** OpenCompass is compatible with [LMDeploy](https://github.com/InternLM/lmdeploy). Now you can follow this [instruction](https://opencompass.readthedocs.io/en/latest/advanced_guides/evaluation_lmdeploy.html#) to evaluate the accelerated models provide by the **Turbomind**. - **\[2023.08.10\]** We have supported [Qwen-7B](https://github.com/QwenLM/Qwen-7B) and [XVERSE-13B](https://github.com/xverse-ai/XVERSE-13B) ! 
Go to our [leaderboard](https://opencompass.org.cn/leaderboard-llm) for more results! More models are welcome to join OpenCompass. - **\[2023.08.09\]** Several new datasets(**CMMLU, TydiQA, SQuAD2.0, DROP**) are updated on our [leaderboard](https://opencompass.org.cn/leaderboard-llm)! More datasets are welcomed to join OpenCompass. - **\[2023.08.07\]** We have added a [script](tools/eval_mmbench.py) for users to evaluate the inference results of [MMBench](https://opencompass.org.cn/MMBench)-dev. diff --git a/docs/zh_cn/advanced_guides/evaluation_lmdeploy.md b/docs/zh_cn/advanced_guides/evaluation_lmdeploy.md new file mode 100644 index 000000000..14bcbc6bd --- /dev/null +++ b/docs/zh_cn/advanced_guides/evaluation_lmdeploy.md @@ -0,0 +1,86 @@ +# 使用 LMDeploy 加速评测 + +我们支持在评测大语言模型时,使用 [LMDeploy](https://github.com/InternLM/lmdeploy) 作为推理加速引擎。LMDeploy 是涵盖了 LLM 和 VLM 任务的全套轻量化、部署和服务解决方案,拥有卓越的推理性能。本教程将介绍如何使用 LMDeploy 加速对模型的评测。 + +## 环境配置 + +### 安装 OpenCompass + +请根据 OpenCompass [安装指南](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) 来安装算法库和准备数据集。 + +### 安装 LMDeploy + +使用 pip 安装 LMDeploy (python 3.8+): + +```shell +pip install lmdeploy +``` + +LMDeploy 预编译包默认基于 CUDA 12 编译。如果需要在 CUDA 11+ 下安装 LMDeploy,请执行以下命令: + +```shell +export LMDEPLOY_VERSION=0.6.0 +export PYTHON_VERSION=310 +pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 +``` + +## 评测 + +在评测一个模型时,需要准备一份评测配置,指明评测集、模型和推理参数等信息。 + +以 [internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b) 模型为例,相关的配置信息如下: + +```python +# configure the dataset +from mmengine.config import read_base + + +with read_base(): + # choose a list of datasets + from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets + from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets + from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets + from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \ + gsm8k_datasets + # and output the results in a chosen format + from .summarizers.medium import summarizer + +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + +# configure lmdeploy +from opencompass.models import TurboMindModelwithChatTemplate + + + +# configure the model +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr=f'internlm2-chat-7b-lmdeploy', + # model path, which can be the address of a model repository on the Hugging Face Hub or a local path + path='internlm/internlm2-chat-7b', + # inference backend of LMDeploy. It can be either 'turbomind' or 'pytorch'. 
+ # If the model is not supported by 'turbomind', it will fallback to + # 'pytorch' + backend='turbomind', + # For the detailed engine config and generation config, please refer to + # https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/messages.py + engine_config=dict(tp=1), + gen_config=dict(do_sample=False), + # the max size of the context window + max_seq_len=7168, + # the max number of new tokens + max_out_len=1024, + # the max number of prompts that LMDeploy receives + # in `generate` function + batch_size=5000, + run_cfg=dict(num_gpus=1), + ) +] +``` + +把上述配置放在文件中,比如 "configs/eval_internlm2_lmdeploy.py"。然后,在 OpenCompass 的项目目录下,执行如下命令可得到评测结果: + +```shell +python run.py configs/eval_internlm2_lmdeploy.py -w outputs +``` diff --git a/docs/zh_cn/advanced_guides/evaluation_turbomind.md b/docs/zh_cn/advanced_guides/evaluation_turbomind.md deleted file mode 100644 index a7c37b758..000000000 --- a/docs/zh_cn/advanced_guides/evaluation_turbomind.md +++ /dev/null @@ -1,75 +0,0 @@ -# 评测 LMDeploy 模型 - -我们支持评测使用 [LMDeploy](https://github.com/InternLM/lmdeploy) 加速过的大语言模型。LMDeploy 由 MMDeploy 和 MMRazor 团队联合开发,是涵盖了 LLM 任务的全套轻量化、部署和服务解决方案。 **TurboMind** 是 LMDeploy 推出的高效推理引擎。OpenCompass 对 TurboMind 进行了适配,本教程将介绍如何使用 OpenCompass 来对 TurboMind 加速后的模型进行评测。 - -## 环境配置 - -### 安装 OpenCompass - -请根据 OpenCompass [安装指南](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) 来安装算法库和准备数据集。 - -### 安装 LMDeploy - -使用 pip 安装 LMDeploy (python 3.8+): - -```shell -pip install lmdeploy -``` - -## 评测 - -OpenCompass 支持分别通过 turbomind python API 评测数据集。 - -下文以 InternLM-20B 模型为例,介绍如何评测。首先我们准备好测试配置文件`configs/eval_internlm_turbomind.py`: - -```python -from mmengine.config import read_base -from opencompass.models.turbomind import TurboMindModel - - -with read_base(): - # choose a list of datasets - from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets - from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets - from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets - from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets - from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets - from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets - # and output the results in a chosen format - from .summarizers.medium import summarizer - -datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) - -# config for internlm-20b model -internlm_20b = dict( - type=TurboMindModel, - abbr='internlm-20b-turbomind', - path="internlm/internlm-20b", # 注意路径与huggingface保持一致 - engine_config=dict(session_len=2048, - max_batch_size=8, - rope_scaling_factor=1.0), - gen_config=dict(top_k=1, top_p=0.8, - temperature=1.0, - max_new_tokens=100), - max_out_len=100, - max_seq_len=2048, - batch_size=8, - concurrency=8, - run_cfg=dict(num_gpus=1, num_procs=1), - end_str='' - ) - -models = [internlm_20b] -``` - -然后,在 OpenCompass 的项目目录下,执行如下命令可得到评测结果: - -```shell -python run.py configs/eval_internlm_turbomind.py -w outputs/turbomind/internlm-20b -``` - -**注:** - -- 如果想在测评配置文件中`engine_config`和`gen_config`字段传递更多参数,请参考[TurbomindEngineConfig](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/pipeline.html#turbomindengineconfig) 和 [GenerationConfig](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/pipeline.html#generationconfig) -- 如果评测 InternLM Chat 模型,请使用配置文件 `eval_internlm_chat_turbomind.py` -- 如果评测 InternLM 7B 模型,请修改 `eval_internlm_turbomind.py` 或者 `eval_internlm_chat_turbomind.py`。将`models`字段配置为`models = [internlm_7b]` 。 diff --git a/docs/zh_cn/index.rst 
b/docs/zh_cn/index.rst index 44f22c1a5..37a3bc0cd 100644 --- a/docs/zh_cn/index.rst +++ b/docs/zh_cn/index.rst @@ -61,7 +61,7 @@ OpenCompass 上手路线 advanced_guides/new_dataset.md advanced_guides/custom_dataset.md advanced_guides/new_model.md - advanced_guides/evaluation_turbomind.md + advanced_guides/evaluation_lmdeploy.md advanced_guides/evaluation_lightllm.md advanced_guides/accelerator_intro.md advanced_guides/code_eval.md diff --git a/docs/zh_cn/notes/news.md b/docs/zh_cn/notes/news.md index 776f2f505..9f1d15c6c 100644 --- a/docs/zh_cn/notes/news.md +++ b/docs/zh_cn/notes/news.md @@ -1,5 +1,13 @@ # 新闻 +- **\[2024.05.08\]** 我们支持了以下四个MoE模型的评测配置文件: [Mixtral-8x22B-v0.1](configs/models/mixtral/hf_mixtral_8x22b_v0_1.py), [Mixtral-8x22B-Instruct-v0.1](configs/models/mixtral/hf_mixtral_8x22b_instruct_v0_1.py), [Qwen1.5-MoE-A2.7B](configs/models/qwen/hf_qwen1_5_moe_a2_7b.py), [Qwen1.5-MoE-A2.7B-Chat](configs/models/qwen/hf_qwen1_5_moe_a2_7b_chat.py) 。欢迎试用! +- **\[2024.04.30\]** 我们支持了计算模型在给定[数据集](configs/datasets/llm_compression/README.md)上的压缩率(Bits per Character)的评测方法([官方文献](https://github.com/hkust-nlp/llm-compression-intelligence))。欢迎试用[llm-compression](configs/eval_llm_compression.py)评测集! 🔥🔥🔥 +- **\[2024.04.26\]** 我们报告了典型LLM在常用基准测试上的表现,欢迎访问[文档](https://opencompass.readthedocs.io/zh-cn/latest/user_guides/corebench.html)以获取更多信息!🔥🔥🔥. +- **\[2024.04.26\]** 我们废弃了 OpenCompass 进行多模态大模型评测的功能,相关功能转移至 [VLMEvalKit](https://github.com/open-compass/VLMEvalKit),推荐使用!🔥🔥🔥. +- **\[2024.04.26\]** 我们支持了 [ArenaHard评测](configs/eval_subjective_arena_hard.py) 欢迎试用!🔥🔥🔥. +- **\[2024.04.22\]** 我们支持了 [LLaMA3](configs/models/hf_llama/hf_llama3_8b.py) 和 [LLaMA3-Instruct](configs/models/hf_llama/hf_llama3_8b_instruct.py) 的评测,欢迎试用!🔥🔥🔥. +- **\[2024.02.29\]** 我们支持了MT-Bench、AlpacalEval和AlignBench,更多信息可以在[这里](https://opencompass.readthedocs.io/en/latest/advanced_guides/subjective_evaluation.html)找到。 +- **\[2024.01.30\]** 我们发布了OpenCompass 2.0。更多信息,请访问[CompassKit](https://github.com/open-compass)、[CompassHub](https://hub.opencompass.org.cn/home)和[CompassRank](https://rank.opencompass.org.cn/home)。 - **\[2024.01.17\]** 我们支持了 [InternLM2](https://github.com/open-compass/opencompass/blob/main/configs/eval_internlm2_chat_keyset.py) 和 [InternLM2-Chat](https://github.com/open-compass/opencompass/blob/main/configs/eval_internlm2_chat_keyset.py) 的相关评测,InternLM2 在这些测试中表现出非常强劲的性能,欢迎试用!. - **\[2024.01.17\]** 我们支持了多根针版本的大海捞针测试,更多信息见[这里](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/needleinahaystack_eval.html#id8). - **\[2023.12.28\]** 我们支持了对使用[LLaMA2-Accessory](https://github.com/Alpha-VLLM/LLaMA2-Accessory)(一款强大的LLM开发工具箱)开发的所有模型的无缝评估! @@ -24,7 +32,7 @@ - **\[2023.08.18\]** [数据集页面](https://opencompass.org.cn/dataset-detail/MMLU) 现已在OpenCompass官网上线,欢迎更多社区评测数据集加入OpenCompass ! - **\[2023.08.11\]** 官网榜单上新增了[模型对比](https://opencompass.org.cn/model-compare/GPT-4,ChatGPT,LLaMA-2-70B,LLaMA-65B)功能,希望该功能可以协助提供更多发现! - **\[2023.08.11\]** 新增了 [LEval](https://github.com/OpenLMLab/LEval) 评测支持. -- **\[2023.08.10\]** OpenCompass 现已适配 [LMDeploy](https://github.com/InternLM/lmdeploy). 请参考 [评测指南](https://opencompass.readthedocs.io/zh_CN/latest/advanced_guides/evaluation_turbomind.html) 对 **Turbomind** 加速后的模型进行评估. +- **\[2023.08.10\]** OpenCompass 现已适配 [LMDeploy](https://github.com/InternLM/lmdeploy). 请参考 [评测指南](https://opencompass.readthedocs.io/zh_CN/latest/advanced_guides/evaluation_lmdeploy.html) 对 **Turbomind** 加速后的模型进行评估. 
- **\[2023.08.10\]** [Qwen-7B](https://github.com/QwenLM/Qwen-7B) 和 [XVERSE-13B](https://github.com/xverse-ai/XVERSE-13B)的评测结果已更新在 OpenCompass [大语言模型评测榜单](https://opencompass.org.cn/leaderboard-llm)! - **\[2023.08.09\]** 更新更多评测数据集(**CMMLU, TydiQA, SQuAD2.0, DROP**) ,请登录 [大语言模型评测榜单](https://opencompass.org.cn/leaderboard-llm) 查看更多结果! 欢迎添加你的评测数据集到OpenCompass. - **\[2023.08.07\]** 新增了 [MMBench 评测脚本](tools/eval_mmbench.py) 以支持用户自行获取 [MMBench](https://opencompass.org.cn/MMBench)-dev 的测试结果. diff --git a/opencompass/__init__.py b/opencompass/__init__.py index d1daced0e..80eb7f98f 100644 --- a/opencompass/__init__.py +++ b/opencompass/__init__.py @@ -1 +1 @@ -__version__ = '0.3.2.post1' +__version__ = '0.3.3' diff --git a/opencompass/configs/datasets/MathBench/mathbench_2024_few_shot_mixed_4a3fd4.py b/opencompass/configs/datasets/MathBench/mathbench_2024_few_shot_mixed_4a3fd4.py new file mode 100644 index 000000000..e7f2859e9 --- /dev/null +++ b/opencompass/configs/datasets/MathBench/mathbench_2024_few_shot_mixed_4a3fd4.py @@ -0,0 +1,81 @@ +from mmengine.config import read_base +from copy import deepcopy +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer +from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator +from opencompass.datasets import MathBenchDataset, mathbench_postprocess +from opencompass.utils.text_postprocessors import first_option_postprocess + +with read_base(): + from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets + +# Max for this dataset is 4 +num_shot = 4 +# Generate reasoning path or not, only for single choice +with_reasoning = True +# Use circular evaluation or not +with_circular_eval = True +# Use PPL mode in single choice test or not +use_ppl_single_choice = True + +assert 0 <= num_shot <= 4 +if num_shot == 0: + prompts = zero_shot_prompts +else: + prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()} + +mathbench_datasets = [] +for _split in mathbench_sets: + for _name in mathbench_sets[_split]: + if 'single_choice' in _name: + if with_reasoning and not use_ppl_single_choice: + template_round = prompts[_name + '_with_reasoning'] + else: + template_round = prompts[_name] + else: + template_round = prompts[_name] + + if 'single_choice' in _name: + pred_postprocessor = dict(type=first_option_postprocess, options='ABCD') + else: + pred_postprocessor = dict(type=mathbench_postprocess, name=_name) + + if 'single_choice' in _name and with_circular_eval: + evaluator = dict(type=CircularEvaluator) + else: + evaluator = dict(type=AccEvaluator) + + # assemble the final config + mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer') + if use_ppl_single_choice and 'single_choice' in _name: + template = {} + for answer in ['A', 'B', 'C', 'D']: + one_template_round = deepcopy(template_round) + one_template_round[-1]['prompt'] = one_template_round[-1]['prompt'].format(answer=answer) + template[answer] = dict(round=one_template_round) + mathbench_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=template), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer), + ) + else: + mathbench_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=2048, 
stopping_criteria=['Question:']), + ) + mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor) + + mathbench_datasets.append( + dict( + abbr='mathbench-' + _split + '-' + _name, + type=MathBenchDataset, + path=f'data/mathbench_v1/{_split}', + name=_name, + with_circular=with_circular_eval, + reader_cfg=mathbench_reader_cfg, + infer_cfg=mathbench_infer_cfg, + eval_cfg=mathbench_eval_cfg, + ) + ) diff --git a/opencompass/configs/datasets/MathBench/mathbench_2024_gen_50a320.py b/opencompass/configs/datasets/MathBench/mathbench_2024_gen_50a320.py new file mode 100644 index 000000000..2d20f4506 --- /dev/null +++ b/opencompass/configs/datasets/MathBench/mathbench_2024_gen_50a320.py @@ -0,0 +1,81 @@ +from mmengine.config import read_base +from copy import deepcopy +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer +from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator +from opencompass.datasets import MathBenchDataset, math_postprocess_v2 +from opencompass.utils.text_postprocessors import first_option_postprocess + +with read_base(): + from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets + +# Max for this dataset is 4 +num_shot = 0 +# Generate reasoning path or not, only for single choice +with_reasoning = True +# Use circular evaluation or not +with_circular_eval = True +# Use PPL mode in single choice test or not +use_ppl_single_choice = False + +assert 0 <= num_shot <= 4 +if num_shot == 0: + prompts = zero_shot_prompts +else: + prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()} + +mathbench_datasets = [] +for _split in mathbench_sets: + for _name in mathbench_sets[_split]: + if 'single_choice' in _name: + if with_reasoning: + template_round = prompts[_name + '_with_reasoning'] + else: + template_round = prompts[_name] + else: + template_round = prompts[_name] + + if 'single_choice' in _name: + pred_postprocessor = dict(type=first_option_postprocess, options='ABCD') + else: + pred_postprocessor = dict(type=math_postprocess_v2) + + if 'single_choice' in _name and with_circular_eval: + evaluator = dict(type=CircularEvaluator) + else: + evaluator = dict(type=AccEvaluator) + + # assemble the final config + mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer') + if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning: + template = {} + for answer in ['A', 'B', 'C', 'D']: + one_template_round = deepcopy(template_round) + one_template_round['round'][-1]['prompt'] = one_template_round['round'][-1]['prompt'].format(answer=answer) + template[answer] = dict(round=one_template_round) + mathbench_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=template), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer), + ) + else: + mathbench_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=2048), + ) + mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor) + + mathbench_datasets.append( + dict( + abbr='mathbench-' + _split + '-' + _name, + type=MathBenchDataset, + path=f'data/mathbench_v1/{_split}', + name=_name, + with_circular=with_circular_eval, + reader_cfg=mathbench_reader_cfg, + 
infer_cfg=mathbench_infer_cfg, + eval_cfg=mathbench_eval_cfg, + ) + ) diff --git a/opencompass/configs/datasets/MathBench/mathbench_prompt.py b/opencompass/configs/datasets/MathBench/mathbench_prompt.py index 069528ee4..8052dab62 100644 --- a/opencompass/configs/datasets/MathBench/mathbench_prompt.py +++ b/opencompass/configs/datasets/MathBench/mathbench_prompt.py @@ -11,6 +11,12 @@ 'single_choice_en': [ dict(role='HUMAN', prompt='Question: Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nThe answer is:'), ], + 'cloze_en': [ + dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'), + ], + 'cloze_cn': [ + dict(role='HUMAN', prompt='{question}\n请一步一步推理,并在最后用\\boxed{}给出你的答案。'), + ] } few_shot_prompts = { diff --git a/opencompass/configs/datasets/dingo/dingo_gen.py b/opencompass/configs/datasets/dingo/dingo_gen.py new file mode 100644 index 000000000..c36f6cdcc --- /dev/null +++ b/opencompass/configs/datasets/dingo/dingo_gen.py @@ -0,0 +1,34 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import DingoDataset, DingoEvaluator + + +dingo_paths = [ + './data/dingo/en_192.csv', + './data/dingo/zh_170.csv', +] + +dingo_datasets = [] +for path in dingo_paths: + dingo_reader_cfg = dict(input_columns='input', output_column=None) + dingo_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[dict(role='HUMAN', prompt='{input}')])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + dingo_eval_cfg = dict(evaluator=dict(type=DingoEvaluator), pred_role='BOT') + + dingo_datasets.append( + dict( + abbr='dingo_' + path.split('/')[-1].split('.csv')[0], + type=DingoDataset, + path=path, + reader_cfg=dingo_reader_cfg, + infer_cfg=dingo_infer_cfg, + eval_cfg=dingo_eval_cfg, + )) + +datasets = dingo_datasets diff --git a/opencompass/configs/datasets/gaokao_math/README.md b/opencompass/configs/datasets/gaokao_math/README.md new file mode 100644 index 000000000..08253add1 --- /dev/null +++ b/opencompass/configs/datasets/gaokao_math/README.md @@ -0,0 +1,108 @@ +# GaoKao MATH Answer Evaluation Dataset +A dataset for testing the performance of the model in the GaoKao MATH Answer Extraction task. +Now support the following format of GAOKAO math questions: +1. '单选题':Single choice question +2. '多选题':Multiple choice question +3. '填空题':Fill in the blank question, can be multiple blanks +4. '解答题':Answer question, can be multiple answers + +Sample data: +```json +[ + { + "id": "3b270bc4-570a-4d77-b122-a2fc372f7d6a", + "question": "过椭圆${x^2\\over {16}} +{ y^2 \\over {4}}=1$ %内一点$M(2,1)$ %引一条弦,使该弦被点$M$ %平分,则这条弦所在直线的方程为( ).\nA. $x+2y-4=0$ %\nB. $x-2y-4=0$ %\nC. $x+2y+4=0$ %\nD. 
$x-2y+4=0$ %\n\n", + "response": "本题主要考查直线与圆锥曲线.设所求直线与椭圆的一个交点为$A(x,y)$ %,由于中点$M(2,1)$ %,所以另一个交点$B$ %为$(4-x,2-y)$ %.因为$A$ %,$B$ %两点都在椭圆上,所以$x^2+4y^2=16$ %,$(4-x)^2+4(2-y)^2=16$ %,两式相减,整理可得$x+2y-4=0$ %.由于过$A$ %,$B$ %两点的直线只有一条,所以这条弦所在直线的方程为$x+2y-4=0$ %.故本题正确答案为A.\n答案是:A", + "extract_answer": "A", + "question_type": "单选题" + }, + { + "id": "d60e42d7-30ee-44f9-a94d-aff6a8127750", + "question": "若函数$f(x)$ 具有下列性质:1.定义域为$(-1,1)$ ;2.对于任意的$x,y\\in(-1,1)$ ,都有$f(x)+f(y)=f\\left({\\dfrac{x+y}{1+xy}}\\right)$ ;3.当$-1< x< 0$ 时,$f(x)>0$ ,则称函数$f(x)$ 为$δ$ 的函数$.$ 若函数$f(x)$ 为$δ$ 的函数,则以下结论正确的是$(\\quad)$\nA. $\nB. x)$ 为奇函数\nC. $\nD. x)$ 为偶函数\nE. $\nF. x)$ 为单调递减函数\nG. $\nH. x)$ 为单调递增函数\n\n", + "response": "函数$f(x)$ 为$δ$ 的函数,令$x=y=0$ ,则$f(0)+f(0)=f(0)$ ,即$f(0)=0$ ,令$y=-x$ ,则$f(x)+f(-x)=f\\left(\\dfrac{x-x}{1-{x}^{2}}\\right)=f(0)=0$ ,则$f(-x)=-f(x)$ ,即函数$f(x)$ 是奇函数,设$-1< x< y< 1$ ,则$f(x)-f(y)=f(x)+f(-y)=f\\left(\\dfrac{x-y}{1-xy}\\right)$ ,$∵-1< x< y< 1$ ,$∴-1< \\dfrac{x-y}{1-xy}< 0$ ,则$f\\left(\\dfrac{x-y}{1-xy}\\right)>0$ ,即$f(x)-f(y)>0$ ,则$f(x)>f(y)$ ,即$f(x)$ 在$(-1,1)$ 上是减函数.故选$AC.$ 本题考查函数的奇偶性和单调性的判断,注意运用定义法,考查运算能力和推理能力,属于中档题.可令$x=y=0$ ,求得$f(0)=0$ ,再令$y=-x$ 可得$f(-x)=-f(x)$ ,可得$f(x)$ 的奇偶性;再令$-1< x< y< 1$ ,运用单调性的定义,结合其偶性的定义可得其单调性.\n答案是:A; C", + "extract_answer": "A, C", + "question_type": "多选题" + }, + { + "id": "31b3f702-e60c-4a20-9a40-73bd72b92d1e", + "question": "请完成以下题目(1)曲线$$y=-5\\text{e}^{x}+3$$在点$$(0,-2)$$处的切线方程为___.(2)若曲线$$f(x)=x \\sin x+1$$在$$x=\\dfrac{ \\pi }{2}$$处的切线与直线$$ax+2y+1=0$$相互垂直,则实数$$a=$$___.\n\n", + "response": "(1)由$$y=-5\\text{e}^{x}+3$$,得$$y'=-5\\text{e}^{x}$$,所以切线的斜率$$k=y'|_{x=0}=-5$$,所以切线方程为$$y+2=-5(x-0)$$,即$$5x+y+2=0$$.(2)因为$$f'(x)= \\sin x+x \\cos x$$,所以$$f'\\left(\\dfrac{ \\pi }{2}\\right)= \\sin \\dfrac{ \\pi }{2}+\\dfrac{ \\pi }{2}\\cdot \\cos \\dfrac{ \\pi }{2}=1$$.又直线$$ax+2y+1=0$$的斜率为$$-\\dfrac{a}{2}$$,所以根据题意得$$1\\times \\left(-\\dfrac{a}{2}\\right)=-1$$,解得$$a=2$$.\n答案是:(1)$$5x+y+2=0$$ (2)$$2$$", + "extract_answer": "['(1)$$5x+y+2=0$$ (2)$$2$$']", + "question_type": "填空题" + }, + { + "id": "16878941-1772-4290-bc61-00b193d5cf70", + "question": "已知函数$f\\left( x \\right)=\\left| 2x-1 \\right|$.(1)若不等式$f\\left( x+\\frac{1}{2} \\right)\\ge 2m+1\\left( m > 0 \\right)$的解集为$\\left( -\\infty ,-2 \\right]\\bigcup \\left[ 2,+\\infty \\right)$,求实数$m$的值;(2)若不等式$f\\left( x \\right)\\le {{2}^{y}}+\\frac{a}{{{2}^{y}}}+\\left| 2x+3 \\right|$对任意的实数$x,y\\in R$恒成立,求实数$a$的最小值.\n\n", + "response": "(1)直接写出不等式,解含有绝对值的函数不等式即可;(2)这是恒成立求参的问题,根据绝对值三角不等式得到左侧函数的最值,再结合均值不等式得最值.(1)由条件得$\\left| 2x \\right|\\le 2m+1$得$-m-\\frac{1}{2}\\le x\\le m+\\frac{1}{2}$,所以$m=\\frac{3}{2}$.(2)原不等式等价于$\\left| 2x-1 \\right|-\\left| 2x+3 \\right|\\le {{2}^{y}}+\\frac{a}{{{2}^{y}}}$,而$\\left| 2x-1 \\right|-\\left| 2x+3 \\right|\\le \\left| \\left( 2x-1 \\right)-\\left( 2x+3 \\right) \\right|=4$,所以${{2}^{y}}+\\frac{a}{{{2}^{y}}}\\ge 4$,则$a\\ge {{\\left[ {{2}^{y}}\\left( 4-{{2}^{y}} \\right) \\right]}_{\\text{max}}}=4$,当且仅当$y=1$时取得.\n答案是:(1) $m=\\frac{3}{2}$;(2) 最小值为$a=4$.", + "extract_answer": [ + "(1) $m=\\frac{3}{2}$;(2) 最小值为$a=4$." + ], + "question_type": "解答题" + } +] +``` +## How to use + +### 1. Prepare the dataset +```bash +cd opencompass +cp -rf /cpfs01/shared/public/liuhongwei/data/gaokao_math_dataset/gaokao_math ./data +``` +📢:If you want to evaluate your own gaokao math data, replace the `test_v2.jsonl` with your own data, but follow the format above. + +### 2. 
Set the evaluation model + +Open `opencompass.datasets.gaokao_math.gaokao_math_gen_9b076f` and set the model name and API URL for evaluation; multiple URLs are supported for acceleration. + +```python +... + +gaokao_math_eval_cfg = dict( + evaluator=dict(type=GaoKaoMATHEvaluator, model_name='EVALUATE_MODEL_NAME', url=['http://0.0.0.0:23333/v1', 'http://...'])) + +... + +``` +We recommend the `Qwen2.5-72B-Instruct` model for evaluation. + + +### 3. Set the extractor model + +```python +from mmengine.config import read_base +from opencompass.models import HuggingFacewithChatTemplate + + +with read_base(): + from opencompass.datasets.gaokao_math.gaokao_math_gen_9b076f import gaokao_math_datasets + + +trained_qwen2_1_5b_model = [ # trained extractor model + dict( + type=HuggingFacewithChatTemplate, + abbr='gaokao_math_extractor_1_5b_v02', + path='/cpfs01/shared/public/liuhongwei/models/gaokao_math_trained/gaokao_math_extractor_1_5b_v02', + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=1), + ) +] + +datasets = sum([v for k, v in locals().items() if k.endswith("_datasets")], []) +models = sum([v for k, v in locals().items() if k.endswith("_model")], []) + +... +``` + +### 4. Run the evaluation + +```bash +python run.py eval.py --dump-eval-details # evaluate and dump the evaluation details to the `results` folder +``` + + +### 5. Evaluation results + +| Evaluator / Extractor | Qwen2.5-72B-Instruct | gaokao_math_extractor_1.5b_v0.2 | +|-----------------------|-----------------------|----------------------------------| +| Qwen2.5-72B-Instruct (ACC) | 95.85 | 95.2 | diff --git a/opencompass/configs/datasets/gaokao_math/gaokao_math_gen_f5fd28.py b/opencompass/configs/datasets/gaokao_math/gaokao_math_gen_f5fd28.py new file mode 100644 index 000000000..80ae4264f --- /dev/null +++ b/opencompass/configs/datasets/gaokao_math/gaokao_math_gen_f5fd28.py @@ -0,0 +1,48 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import GaoKaoMATHDataset, GaoKaoMATHEvaluator + + +MATH_CN_PROMPT=""" +你是一个数学阅卷专家,任务是从给定的回答句子中提取精确的关键答案。你必须只提供提取的关键答案,不包括任何额外的文字。 +— +我将为你提供一个问题、回答句子和问题类型。回答句子是对所提供问题的回应。利用提供的信息,你必须准确而精确地确定并从回答句子中提取预期的关键答案。请不要对问题发表主观看法。 + +对于单选题,答案应该是选项字母,例如 "A"; +对于多选题,答案应该是一个选项字母的列表,例如 ["A"] 或 ["A", "B", "C"]; +对于填空题,答案应该是一个填入空白处的答案列表,列表的数量应该与问题中的空白数量相同,例如 ["$$\\frac{{1}}{{2}}$$"] 或 ["$$\\frac{{1}}{{2}}$$", "2"]。 +对于问答题,类似填空题,为每个小问抽出相应答案,例如 ["$$\\frac{{1}}{{2}}$$"] 或 ["$$\\frac{{1}}{{2}}$$", "2"]。 + +如果回答句子提供了多个不同的答案,请仔细判断后面提供的答案是否是对前面答案的修正或修改。如果是这样,提取这个修正或修改后的答案作为最终答案。相反,如果回答句子在多个答案之间波动而没有明确的最终答案,你应该输出 [No valid answer]。 +— +问题类型: {question_type} +原始问题: {question} +回答: {response} +提取的关键答案: +""" + +gaokao_math_reader_cfg = dict(input_columns=['question', 'response', 'question_type'], output_column='extract_answer') + + +gaokao_math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role='HUMAN', prompt=MATH_CN_PROMPT), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +gaokao_math_eval_cfg = dict( + evaluator=dict(type=GaoKaoMATHEvaluator, model_name='Qwen/Qwen2.5-72B-Instruct', url=['http://22.8.73.119:23333/v1', 'http://22.8.4.97:23333/v1', 'http://22.8.22.254:23333/v1', 'http://22.8.17.14:23333/v1'])) + +gaokao_math_datasets = [ + dict( + type=GaoKaoMATHDataset, + abbr='GaoKaoMATH', + path='./data/gaokao_math/test_2k.json',
reader_cfg=gaokao_math_reader_cfg, + infer_cfg=gaokao_math_infer_cfg, + eval_cfg=gaokao_math_eval_cfg) +] diff --git a/opencompass/configs/datasets/gpqa/gpqa_few_shot_ppl_2c9cd6.py b/opencompass/configs/datasets/gpqa/gpqa_few_shot_ppl_2c9cd6.py new file mode 100644 index 000000000..3c06d7a40 --- /dev/null +++ b/opencompass/configs/datasets/gpqa/gpqa_few_shot_ppl_2c9cd6.py @@ -0,0 +1,49 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import PPLInferencer +from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator +from opencompass.datasets import GPQADataset, GPQAEvaluator +from opencompass.utils import first_option_postprocess + +gpqa_reader_cfg = dict( + input_columns=['question', 'A', 'B', 'C', 'D'], + output_column='answer') + +hint = f'对下面的单项选择题,请直接给出正确答案的选项。' +question_and_options = 'Question: {question}\n(A){A}\n(B){B}\n(C){C}\n(D){D}\n' +gpqa_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template={ + opt: f'{question_and_options}\nAnswer: {opt}' for opt in ['A', 'B', 'C', 'D']}, + ), + prompt_template=dict( + type=PromptTemplate, + template={ + opt: f'{hint}\n{question_and_options}\nAnswer: {opt}' for opt in ['A', 'B', 'C', 'D'] + }, + ice_token='' + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=PPLInferencer)) + +gpqa_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator)) + +gpqa_datasets = [] +gpqa_subsets = { + # 'extended': 'gpqa_extended.csv', + # 'main': 'gpqa_main.csv', + 'diamond': 'gpqa_diamond.csv' +} + +for split in list(gpqa_subsets.keys()): + gpqa_datasets.append( + dict( + abbr='GPQA_' + split, + type=GPQADataset, + path='./data/gpqa/', + name=gpqa_subsets[split], + reader_cfg=gpqa_reader_cfg, + infer_cfg=gpqa_infer_cfg, + eval_cfg=gpqa_eval_cfg) + ) diff --git a/opencompass/configs/datasets/gsm8k/gsm8k_model_postprocess_gen_a58960.py b/opencompass/configs/datasets/gsm8k/gsm8k_model_postprocess_gen_a58960.py new file mode 100644 index 000000000..a95feb8d1 --- /dev/null +++ b/opencompass/configs/datasets/gsm8k/gsm8k_model_postprocess_gen_a58960.py @@ -0,0 +1,52 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import GSM8KDataset, gsm8k_dataset_postprocess +from opencompass.datasets import MATHEvaluator, math_postprocess_v2 +from opencompass.utils.model_postprocessors import navie_model_postprocess +from opencompass.utils.postprocessors.naive import MATH_NAVIE_PROMPT_TEMPLATE + +gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer') + +gsm8k_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +# # You can write your own postprocess prompt like: +# GSM8K_NAVIE_PROMPT_TEMPLATE = """ +# There is a detailed explanation of the final answer you should extract: +# 1. ... +# 2. ... +# ... 
+# """ + +gsm8k_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), + pred_postprocessor=dict(type=math_postprocess_v2), + dataset_postprocessor=dict(type=gsm8k_dataset_postprocess), + model_postprocessor=dict( + type=navie_model_postprocess, + custom_instruction=MATH_NAVIE_PROMPT_TEMPLATE, + model_name='', + api_url='http://0.0.0.0:23333/v1,http://0.0.0.0:23334/v1') + ) + +gsm8k_datasets = [ + dict( + abbr='gsm8k', + type=GSM8KDataset, + path='opencompass/gsm8k', + reader_cfg=gsm8k_reader_cfg, + infer_cfg=gsm8k_infer_cfg, + eval_cfg=gsm8k_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/math/math_0shot_llm_judge_gen_393424.py b/opencompass/configs/datasets/math/math_0shot_llm_judge_gen_393424.py new file mode 100644 index 000000000..eb302c854 --- /dev/null +++ b/opencompass/configs/datasets/math/math_0shot_llm_judge_gen_393424.py @@ -0,0 +1,78 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, GaoKaoMATHEvaluator +from opencompass.utils.model_postprocessors import naive_model_postprocess, xfinder_postprocess +from opencompass.utils.postprocessors.naive import MATH_NAVIE_PROMPT_TEMPLATE + +# ----------------------------- Eval Parameters ----------------------------- +## Postprocess function +post_func = 're' # 're', 'xfinder_model', 'naive_model' + +## Evaluate function +eval_func = 'naive_model' # 're', 'naive_model' + +## Model API URL +xfinder_url = 'http://0.0.0.0:23333/v1' # for 'xFinder-qwen1505' if post_func is 'xfinder_model' +naive_model_name = 'Qwen/Qwen2.5-72B-Instruct' # replace with your model name +naive_model_url = ['http://22.8.6.22:23333/v1', 'http://22.8.67.84:23333/v1', 'http://22.8.72.81:23333/v1', 'http://22.9.42.143:23333/v1'] # Multiple APIs for acceleration + +# ----------------------------- Detailed Config ----------------------------- + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024), +) + +if post_func == 're': + pred_postprocessor = dict(type=math_postprocess_v2) +elif post_func == 'xfinder_model': + pred_postprocessor = dict( + type=xfinder_postprocess, + question_type='math', + model_name='xFinder-qwen1505', + num_processes=128, + api_url=xfinder_url, + ) +elif post_func == 'naive_model': + pred_postprocessor = dict( + type=naive_model_postprocess, + custom_instruction=MATH_NAVIE_PROMPT_TEMPLATE, + model_name=naive_model_name, + num_processes=64, + api_url=naive_model_url, + ) + +if eval_func == 're': + evaluator = dict(type=MATHEvaluator, version='v2') +elif eval_func == 'naive_model': + evaluator = dict( + type=GaoKaoMATHEvaluator, + model_name=naive_model_name, + url=naive_model_url, + ) + +math_eval_cfg = dict( + evaluator=evaluator, pred_postprocessor=pred_postprocessor, +) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/math/math_4shot_base_gen_43d5b6.py
b/opencompass/configs/datasets/math/math_4shot_base_gen_43d5b6.py new file mode 100644 index 000000000..1e8696798 --- /dev/null +++ b/opencompass/configs/datasets/math/math_4shot_base_gen_43d5b6.py @@ -0,0 +1,30 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2 + +with read_base(): + from .math_4shot_example_from_google_research import prompt + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=prompt + '\n\nProblem:\n{problem}\nSolution:'), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=2048, stopping_criteria=['Problem', '问题:'])) + +# postprocess v2 +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), + pred_postprocessor=dict(type=math_postprocess_v2)) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/opencompass/configs/datasets/mmlu/mmlu_model_postprocess_gen_4d595a.py b/opencompass/configs/datasets/mmlu/mmlu_model_postprocess_gen_4d595a.py new file mode 100644 index 000000000..45cefb762 --- /dev/null +++ b/opencompass/configs/datasets/mmlu/mmlu_model_postprocess_gen_4d595a.py @@ -0,0 +1,141 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator +from opencompass.datasets import MMLUDataset +from opencompass.utils.text_postprocessors import first_option_postprocess +from opencompass.utils.model_postprocessors import navie_model_postprocess +from opencompass.utils.postprocessors.naive import OPTION_NAVIE_PROMPT_TEMPLATE + + +# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader +# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar + +mmlu_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +mmlu_all_sets = [ + 'college_biology', + 'college_chemistry', + 'college_computer_science', + 'college_mathematics', + 'college_physics', + 'electrical_engineering', + 'astronomy', + 'anatomy', + 'abstract_algebra', + 'machine_learning', + 'clinical_knowledge', + 'global_facts', + 'management', + 'nutrition', + 'marketing', + 'professional_accounting', + 'high_school_geography', + 'international_law', + 'moral_scenarios', + 'computer_security', + 'high_school_microeconomics', + 'professional_law', + 'medical_genetics', + 'professional_psychology', + 'jurisprudence', + 'world_religions', + 'philosophy', + 'virology', + 'high_school_chemistry', + 'public_relations', + 'high_school_macroeconomics', + 'human_sexuality', + 'elementary_mathematics', + 'high_school_physics', + 'high_school_computer_science', + 'high_school_european_history', + 'business_ethics', + 'moral_disputes', + 'high_school_statistics', + 'miscellaneous', + 'formal_logic', + 'high_school_government_and_politics', + 'prehistory', + 'security_studies', + 'high_school_biology', + 'logical_fallacies', + 'high_school_world_history', 
+ 'professional_medicine', + 'high_school_mathematics', + 'college_medicine', + 'high_school_us_history', + 'sociology', + 'econometrics', + 'high_school_psychology', + 'human_aging', + 'us_foreign_policy', + 'conceptual_physics', +] + +mmlu_datasets = [] +for _name in mmlu_all_sets: + _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.' + mmlu_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt= + f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' + ), + dict(role='BOT', prompt='{target}\n') + ]), + ), + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict( + role='HUMAN', + prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' + ), + ], + ), + ice_token='', + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=GenInferencer), + ) + +# # You can write your own postprocess prompt like: +# MMLU_NAVIE_PROMPT_TEMPLATE = """ +# There is a detailed explanation of the final answer you should extract: +# 1. ... +# 2. ... +# ... +# """ + + mmlu_eval_cfg = dict( + evaluator=dict(type=AccwithDetailsEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'), + model_postprocessor=dict( + type=navie_model_postprocess, + custom_instruction=OPTION_NAVIE_PROMPT_TEMPLATE, + model_name='', + api_url='http://0.0.0.0:23333/v1,http://0.0.0.0:23334/v1') + ) + + + mmlu_datasets.append( + dict( + abbr=f'lukaemon_mmlu_{_name}', + type=MMLUDataset, + path='opencompass/mmlu', + name=_name, + reader_cfg=mmlu_reader_cfg, + infer_cfg=mmlu_infer_cfg, + eval_cfg=mmlu_eval_cfg, + )) + +del _name, _hint diff --git a/opencompass/configs/datasets/mmlu_pro/mmlu_pro_few_shot_gen_bfaf90.py b/opencompass/configs/datasets/mmlu_pro/mmlu_pro_few_shot_gen_bfaf90.py new file mode 100644 index 000000000..dc12fd1d9 --- /dev/null +++ b/opencompass/configs/datasets/mmlu_pro/mmlu_pro_few_shot_gen_bfaf90.py @@ -0,0 +1,47 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MMLUProDataset, MMLUProBaseEvaluator + +with read_base(): + from .mmlu_pro_categories import categories + +mmlu_pro_datasets = [] + +for category in categories: + hint = f'Answer the following multiple choice question about {category}, and give your answer option directly.' 
+ question_and_options = 'Question:\n{question}\nOptions:\n{options_str}' + mmlu_pro_reader_cfg = dict( + input_columns=['question', 'cot_content', 'options_str'], + output_column='answer_string', + train_split='validation', + test_split='test', + ) + mmlu_pro_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=f'{question_and_options}\nAnswer: {{answer}}'), + prompt_template=dict( + type=PromptTemplate, + template=f'{hint}\n{question_and_options}\nAnswer: ', + ice_token='' + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=GenInferencer, max_out_len=100) + ) + + mmlu_pro_eval_cfg = dict( + evaluator=dict(type=MMLUProBaseEvaluator) + ) + + mmlu_pro_datasets.append( + dict( + abbr=f'mmlu_pro_{category.replace(" ", "_")}', + type=MMLUProDataset, + path='opencompass/mmlu_pro', + category=category, + reader_cfg=mmlu_pro_reader_cfg, + infer_cfg=mmlu_pro_infer_cfg, + eval_cfg=mmlu_pro_eval_cfg, + )) diff --git a/opencompass/configs/datasets/flames/README.md b/opencompass/configs/datasets/subjective/flames/README.md similarity index 100% rename from opencompass/configs/datasets/flames/README.md rename to opencompass/configs/datasets/subjective/flames/README.md diff --git a/opencompass/configs/datasets/flames/flames_gen.py b/opencompass/configs/datasets/subjective/flames/flames_gen.py similarity index 100% rename from opencompass/configs/datasets/flames/flames_gen.py rename to opencompass/configs/datasets/subjective/flames/flames_gen.py diff --git a/opencompass/configs/datasets/flames/flames_gen_1a58bb.py b/opencompass/configs/datasets/subjective/flames/flames_gen_1a58bb.py similarity index 96% rename from opencompass/configs/datasets/flames/flames_gen_1a58bb.py rename to opencompass/configs/datasets/subjective/flames/flames_gen_1a58bb.py index 1082e2174..64a10519c 100644 --- a/opencompass/configs/datasets/flames/flames_gen_1a58bb.py +++ b/opencompass/configs/datasets/subjective/flames/flames_gen_1a58bb.py @@ -58,5 +58,6 @@ name=_name, reader_cfg=subjective_reader_cfg, infer_cfg=subjective_infer_cfg, - eval_cfg=subjective_eval_cfg + eval_cfg=subjective_eval_cfg, + mode='singlescore', )) diff --git a/opencompass/configs/datasets/subjective/followbench/followbench_llmeval.py b/opencompass/configs/datasets/subjective/followbench/followbench_llmeval.py index 0733340ed..e601bda34 100644 --- a/opencompass/configs/datasets/subjective/followbench/followbench_llmeval.py +++ b/opencompass/configs/datasets/subjective/followbench/followbench_llmeval.py @@ -15,7 +15,7 @@ ] data_path ='data/subjective/followbench/converted_data' -followbench_llmeval_dataset = [] +followbench_llmeval_datasets = [] for _name in subjective_all_sets: subjective_infer_cfg = dict( @@ -48,7 +48,7 @@ pred_role='BOT', ) - followbench_llmeval_dataset.append( + followbench_llmeval_datasets.append( dict( abbr=f'{_name}', type=FollowBenchDataset, diff --git a/opencompass/configs/datasets/wikibench/wikibench_few_shot_ppl_c23d79.py b/opencompass/configs/datasets/wikibench/wikibench_few_shot_ppl_c23d79.py new file mode 100644 index 000000000..0669bd7b9 --- /dev/null +++ b/opencompass/configs/datasets/wikibench/wikibench_few_shot_ppl_c23d79.py @@ -0,0 +1,73 @@ +import copy + +from opencompass.datasets import WikiBenchDataset +from opencompass.openicl.icl_evaluator import AccEvaluator, CircularEvaluator +from opencompass.openicl.icl_inferencer import PPLInferencer +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import 
ZeroRetriever + +single_choice_prompts = { + 'single_choice_cn': [ + dict(role='HUMAN', + prompt='问题: 白色念珠菌常被用作哪种生物的研究模式?\nA. 病毒\nB. 细菌\nC. 真菌\nD. 寄生虫'), + dict(role='BOT', prompt='回答: C'), + dict( + role='HUMAN', + prompt='问题: 星期五广场(荷兰语:Vrijdagmarkt;荷兰语发音: )是比利时根特老城的一个城市广场。 星期五广场下方有一个什么设施?\nA. 游乐场\nB. 地下停车场\nC. 公园\nD. 地下商场' # noqa: E501 + ), + dict(role='BOT', prompt='回答: B'), + dict( + role='HUMAN', + prompt='问题: 尔迪雷·巴斯杜克代表土耳其国家队出场的次数?\nA. 60次\nB. 35次\nC. 49次\nD. 20次' + ), + dict(role='BOT', prompt='回答: C'), + dict( + role='HUMAN', + prompt='问题: 陈酆被任命为漳州刺史是因为什么原因?\nA. 朝廷认为他有能力担任该职务\nB. 漳州人怀念陈元光、陈伯珙的政绩\nC. 他是陈伯珙的儿子\nD. 他是陈元光的孙子' # noqa: E501 + ), + dict(role='BOT', prompt='回答: B'), + dict(role='HUMAN', + prompt='问题: 丹徒县在1928年改名为什么?\nA. 苏州市\nB. 润州县\nC. 镇江县\nD. 丹阳县'), + dict(role='BOT', prompt='回答: C'), + dict(role='HUMAN', prompt='问题: {question}'), + dict(role='BOT', prompt='回答: {answer}'), + ] +} + +wikibench_sets = { + 'wiki': ['single_choice_cn'], +} + +do_circular = True + +wikibench_datasets = [] + +for _split in list(wikibench_sets.keys()): + for _name in wikibench_sets[_split]: + template = {} + for answer in ['A', 'B', 'C', 'D']: + one_template_round = copy.deepcopy(single_choice_prompts[_name]) + one_template_round[-1]['prompt'] = one_template_round[-1][ + 'prompt'].format(answer=answer) + template[answer] = dict(round=one_template_round) + wikibench_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=template), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer), + ) + wikibench_eval_cfg = dict(evaluator=dict( + type=CircularEvaluator if do_circular else AccEvaluator), ) + wikibench_datasets.append( + dict( + type=WikiBenchDataset, + path=f'./data/WikiBench/{_name}.jsonl', + name='circular_' + _name if do_circular else _name, + abbr='wikibench-' + _split + '-' + _name + + 'circular' if do_circular else '', + reader_cfg=dict( + input_columns=['question'], + output_column='answer', + ), + infer_cfg=wikibench_infer_cfg, + eval_cfg=wikibench_eval_cfg, + )) diff --git a/opencompass/configs/datasets/wikibench/wikibench_gen_0978ad.py b/opencompass/configs/datasets/wikibench/wikibench_gen_0978ad.py new file mode 100644 index 000000000..871133f9e --- /dev/null +++ b/opencompass/configs/datasets/wikibench/wikibench_gen_0978ad.py @@ -0,0 +1,56 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator +from opencompass.datasets import WikiBenchDataset +from opencompass.utils.text_postprocessors import first_option_postprocess + + +single_choice_prompts = { + 'single_choice_cn': '以下是一道单项选择题,请你根据你了解的知识一步步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。。\n下面是你要回答的题目:\n{question}\n让我们一步步推理:', +} + +wikibench_sets = { + 'wiki': ['single_choice_cn'], +} + +do_circular = True + +wikibench_datasets = [] + +for _split in list(wikibench_sets.keys()): + for _name in wikibench_sets[_split]: + wikibench_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict(role='HUMAN', prompt=single_choice_prompts[_name]), + dict(role='BOT', prompt='{answer}'), + ], + ), + ice_token='', + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + wikibench_eval_cfg = dict( + evaluator=dict(type=CircularEvaluator if do_circular else AccEvaluator), + 
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'), + ) + + wikibench_datasets.append( + dict( + type=WikiBenchDataset, + path=f'./data/WikiBench/{_name}.jsonl', + name='circular_' + _name if do_circular else _name, + abbr='wikibench-' + _split + '-' + _name + 'circular' if do_circular else '', + reader_cfg=dict( + input_columns=['question'], + output_column='answer', + ), + infer_cfg=wikibench_infer_cfg, + eval_cfg=wikibench_eval_cfg, + ) + ) diff --git a/opencompass/configs/models/bailing_api/bailing-lite-0830.py b/opencompass/configs/models/bailing_api/bailing-lite-0830.py new file mode 100644 index 000000000..88053ce98 --- /dev/null +++ b/opencompass/configs/models/bailing_api/bailing-lite-0830.py @@ -0,0 +1,30 @@ +from opencompass.models import BailingAPI + +api_meta_template = dict( + round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=False), + ], + reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')], +) + +models = [ + dict( + path='Bailing-Lite-0830', + token='', # set your key here or in environment variable BAILING_API_KEY + url='https://bailingchat.alipay.com/chat/completions', + type=BailingAPI, + meta_template=api_meta_template, + query_per_second=1, + max_seq_len=4096, + batch_size=1, + generation_kwargs={ + 'temperature': 0.4, + 'top_p': 1.0, + 'top_k': -1, + 'n': 1, + 'logprobs': 1, + 'use_beam_search': False, + }, + ), +] diff --git a/opencompass/configs/models/bailing_api/bailing-pro-0920.py b/opencompass/configs/models/bailing_api/bailing-pro-0920.py new file mode 100644 index 000000000..db69b263e --- /dev/null +++ b/opencompass/configs/models/bailing_api/bailing-pro-0920.py @@ -0,0 +1,30 @@ +from opencompass.models import BailingAPI + +api_meta_template = dict( + round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=False), + ], + reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')], +) + +models = [ + dict( + path='Bailing-Pro-0920', + token='', # set your key here or in environment variable BAILING_API_KEY + url='https://bailingchat.alipay.com/chat/completions', + type=BailingAPI, + meta_template=api_meta_template, + query_per_second=1, + max_seq_len=4096, + batch_size=1, + generation_kwargs={ + 'temperature': 0.4, + 'top_p': 1.0, + 'top_k': -1, + 'n': 1, + 'logprobs': 1, + 'use_beam_search': False, + }, + ), +] diff --git a/opencompass/configs/models/chatglm/lmdeploy_glm4_9b_chat.py b/opencompass/configs/models/chatglm/lmdeploy_glm4_9b_chat.py index 2f8218a62..c5cb8c4d5 100644 --- a/opencompass/configs/models/chatglm/lmdeploy_glm4_9b_chat.py +++ b/opencompass/configs/models/chatglm/lmdeploy_glm4_9b_chat.py @@ -6,9 +6,9 @@ abbr='glm-4-9b-chat-turbomind', path='THUDM/glm-4-9b-chat', engine_config=dict(max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048), max_seq_len=8192, - max_out_len=1024, + max_out_len=2048, batch_size=16, run_cfg=dict(num_gpus=1), ) diff --git a/opencompass/configs/models/deepseek/lmdeploy_deepseek_67b_chat.py b/opencompass/configs/models/deepseek/lmdeploy_deepseek_67b_chat.py index e369e6e12..67624eb89 100644 --- a/opencompass/configs/models/deepseek/lmdeploy_deepseek_67b_chat.py +++ b/opencompass/configs/models/deepseek/lmdeploy_deepseek_67b_chat.py @@ -7,8 +7,8 @@ path='deepseek-ai/deepseek-llm-67b-chat', engine_config=dict(max_batch_size=16, tp=4), gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9), - 
max_seq_len=7168, - max_out_len=1024, + max_seq_len=8192, + max_out_len=2048, batch_size=16, run_cfg=dict(num_gpus=4), ) diff --git a/opencompass/configs/models/deepseek/lmdeploy_deepseek_7b_chat.py b/opencompass/configs/models/deepseek/lmdeploy_deepseek_7b_chat.py index 26aa2afce..2c108cc13 100644 --- a/opencompass/configs/models/deepseek/lmdeploy_deepseek_7b_chat.py +++ b/opencompass/configs/models/deepseek/lmdeploy_deepseek_7b_chat.py @@ -7,8 +7,8 @@ path='deepseek-ai/deepseek-llm-7b-chat', engine_config=dict(max_batch_size=16, tp=1), gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9), - max_seq_len=7168, - max_out_len=1024, + max_seq_len=8192, + max_out_len=2048, batch_size=16, run_cfg=dict(num_gpus=1), ) diff --git a/opencompass/configs/models/hf_internlm/lmdeploy_internlm2_5_1_8b_chat.py b/opencompass/configs/models/hf_internlm/lmdeploy_internlm2_5_1_8b_chat.py index 5d5c257b1..cf4691f16 100644 --- a/opencompass/configs/models/hf_internlm/lmdeploy_internlm2_5_1_8b_chat.py +++ b/opencompass/configs/models/hf_internlm/lmdeploy_internlm2_5_1_8b_chat.py @@ -5,10 +5,10 @@ type=TurboMindModelwithChatTemplate, abbr='internlm2_5-1_8b-chat-turbomind', path='internlm/internlm2_5-1_8b-chat', - engine_config=dict(session_len=8192, max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048), - max_seq_len=8192, - max_out_len=2048, + engine_config=dict(session_len=16384, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=1), ) diff --git a/opencompass/configs/models/hf_internlm/lmdeploy_internlm2_5_20b_chat.py b/opencompass/configs/models/hf_internlm/lmdeploy_internlm2_5_20b_chat.py index f1bb1b081..7fb521618 100644 --- a/opencompass/configs/models/hf_internlm/lmdeploy_internlm2_5_20b_chat.py +++ b/opencompass/configs/models/hf_internlm/lmdeploy_internlm2_5_20b_chat.py @@ -5,10 +5,10 @@ type=TurboMindModelwithChatTemplate, abbr='internlm2_5-20b-chat-turbomind', path='internlm/internlm2_5-20b-chat', - engine_config=dict(session_len=8192, max_batch_size=16, tp=2), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048), - max_seq_len=8192, - max_out_len=2048, + engine_config=dict(session_len=16384, max_batch_size=16, tp=2), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=2), ) diff --git a/opencompass/configs/models/hf_internlm/lmdeploy_internlm2_5_7b_chat.py b/opencompass/configs/models/hf_internlm/lmdeploy_internlm2_5_7b_chat.py index 75fb93713..8dce26843 100644 --- a/opencompass/configs/models/hf_internlm/lmdeploy_internlm2_5_7b_chat.py +++ b/opencompass/configs/models/hf_internlm/lmdeploy_internlm2_5_7b_chat.py @@ -5,10 +5,10 @@ type=TurboMindModelwithChatTemplate, abbr='internlm2_5-7b-chat-turbomind', path='internlm/internlm2_5-7b-chat', - engine_config=dict(session_len=7168, max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + engine_config=dict(session_len=16384, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=1), ) diff --git a/opencompass/configs/models/hf_internlm/lmdeploy_internlm2_chat_1_8b.py b/opencompass/configs/models/hf_internlm/lmdeploy_internlm2_chat_1_8b.py 
index 9c358d5a6..f5df7926d 100644 --- a/opencompass/configs/models/hf_internlm/lmdeploy_internlm2_chat_1_8b.py +++ b/opencompass/configs/models/hf_internlm/lmdeploy_internlm2_chat_1_8b.py @@ -5,10 +5,10 @@ type=TurboMindModelwithChatTemplate, abbr='internlm2-chat-1.8b-turbomind', path='internlm/internlm2-chat-1_8b', - engine_config=dict(session_len=7168, max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + engine_config=dict(session_len=8192, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=8192, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=1), ) diff --git a/opencompass/configs/models/hf_internlm/lmdeploy_internlm2_chat_20b.py b/opencompass/configs/models/hf_internlm/lmdeploy_internlm2_chat_20b.py index 443715494..23f35636c 100644 --- a/opencompass/configs/models/hf_internlm/lmdeploy_internlm2_chat_20b.py +++ b/opencompass/configs/models/hf_internlm/lmdeploy_internlm2_chat_20b.py @@ -5,10 +5,10 @@ type=TurboMindModelwithChatTemplate, abbr='internlm2-chat-20b-turbomind', path='internlm/internlm2-chat-20b', - engine_config=dict(session_len=7168, max_batch_size=16, tp=2), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + engine_config=dict(session_len=8192, max_batch_size=16, tp=2), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=8192, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=2), ) diff --git a/opencompass/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py b/opencompass/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py index 82ad2e46a..38ea39d7d 100644 --- a/opencompass/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py +++ b/opencompass/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py @@ -1,15 +1,24 @@ from opencompass.models import TurboMindModelwithChatTemplate + models = [ dict( type=TurboMindModelwithChatTemplate, - abbr='internlm2-chat-7b-turbomind', + abbr=f'internlm2-chat-7b-lmdeploy', path='internlm/internlm2-chat-7b', - engine_config=dict(session_len=7168, max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, - batch_size=16, + # inference backend of LMDeploy. It can be either 'turbomind' or 'pytorch'. 
+ # If the model is not supported by 'turbomind', it will fallback to + # 'pytorch' + backend='turbomind', + # For the detailed engine config and generation config, please refer to + # https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/messages.py + engine_config=dict(tp=1), + gen_config=dict(do_sample=False), + max_seq_len=8192, + max_out_len=4096, + # the max number of prompts that LMDeploy receives + # in `generate` function + batch_size=5000, run_cfg=dict(num_gpus=1), ) ] diff --git a/opencompass/configs/models/hf_internlm/lmdeploy_internlm_chat_20b.py b/opencompass/configs/models/hf_internlm/lmdeploy_internlm_chat_20b.py index 8718a6cfc..e9af5578b 100644 --- a/opencompass/configs/models/hf_internlm/lmdeploy_internlm_chat_20b.py +++ b/opencompass/configs/models/hf_internlm/lmdeploy_internlm_chat_20b.py @@ -6,9 +6,9 @@ abbr='internlm-chat-20b-turbomind', path='internlm/internlm-chat-20b', engine_config=dict(session_len=4096, max_batch_size=16, tp=2), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048), max_seq_len=4096, - max_out_len=1024, + max_out_len=2048, batch_size=16, run_cfg=dict(num_gpus=2), ) diff --git a/opencompass/configs/models/hf_internlm/lmdeploy_internlm_chat_7b.py b/opencompass/configs/models/hf_internlm/lmdeploy_internlm_chat_7b.py index ea61313af..50656a5f8 100644 --- a/opencompass/configs/models/hf_internlm/lmdeploy_internlm_chat_7b.py +++ b/opencompass/configs/models/hf_internlm/lmdeploy_internlm_chat_7b.py @@ -6,9 +6,9 @@ abbr='internlm-chat-7b-turbomind', path='internlm/internlm-chat-7b', engine_config=dict(session_len=4096, max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048), max_seq_len=4096, - max_out_len=1024, + max_out_len=2048, batch_size=16, run_cfg=dict(num_gpus=1), ) diff --git a/opencompass/configs/models/hf_llama/hf_llama3_1_70b_instruct.py b/opencompass/configs/models/hf_llama/hf_llama3_1_70b_instruct.py index 4a17de935..c7527bb53 100644 --- a/opencompass/configs/models/hf_llama/hf_llama3_1_70b_instruct.py +++ b/opencompass/configs/models/hf_llama/hf_llama3_1_70b_instruct.py @@ -5,7 +5,7 @@ type=HuggingFacewithChatTemplate, abbr='llama-3_1-70b-instruct-hf', path='meta-llama/Meta-Llama-3.1-70B-Instruct', - max_out_len=1024, + max_out_len=4096, batch_size=8, run_cfg=dict(num_gpus=4), stop_words=['<|end_of_text|>', '<|eot_id|>'], diff --git a/opencompass/configs/models/hf_llama/hf_llama3_1_8b.py b/opencompass/configs/models/hf_llama/hf_llama3_1_8b.py new file mode 100644 index 000000000..a41e1ddfc --- /dev/null +++ b/opencompass/configs/models/hf_llama/hf_llama3_1_8b.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFaceBaseModel + +models = [ + dict( + type=HuggingFaceBaseModel, + abbr='llama-3_1-8b-hf', + path='meta-llama/Meta-Llama-3.1-8B-Instruct', + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/opencompass/configs/models/hf_llama/lmdeploy_llama2_13b_chat.py b/opencompass/configs/models/hf_llama/lmdeploy_llama2_13b_chat.py index cb42cb294..cacdec9a5 100644 --- a/opencompass/configs/models/hf_llama/lmdeploy_llama2_13b_chat.py +++ b/opencompass/configs/models/hf_llama/lmdeploy_llama2_13b_chat.py @@ -6,9 +6,9 @@ abbr='llama-2-13b-chat-turbomind', path='meta-llama/Llama-2-13b-chat-hf', engine_config=dict(max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, 
top_p=0.9, max_new_tokens=1024), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048), max_seq_len=4096, - max_out_len=1024, + max_out_len=2048, batch_size=16, run_cfg=dict(num_gpus=1), ) diff --git a/opencompass/configs/models/hf_llama/lmdeploy_llama2_70b_chat.py b/opencompass/configs/models/hf_llama/lmdeploy_llama2_70b_chat.py index d6c69c6f9..b850106b3 100644 --- a/opencompass/configs/models/hf_llama/lmdeploy_llama2_70b_chat.py +++ b/opencompass/configs/models/hf_llama/lmdeploy_llama2_70b_chat.py @@ -6,9 +6,9 @@ abbr='llama-2-70b-chat-turbomind', path='meta-llama/Llama-2-70b-chat-hf', engine_config=dict(max_batch_size=16, tp=4), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048), max_seq_len=4096, - max_out_len=1024, + max_out_len=2048, batch_size=16, run_cfg=dict(num_gpus=4), ) diff --git a/opencompass/configs/models/hf_llama/lmdeploy_llama2_7b_chat.py b/opencompass/configs/models/hf_llama/lmdeploy_llama2_7b_chat.py index f520ce8b3..aa3452488 100644 --- a/opencompass/configs/models/hf_llama/lmdeploy_llama2_7b_chat.py +++ b/opencompass/configs/models/hf_llama/lmdeploy_llama2_7b_chat.py @@ -6,9 +6,9 @@ abbr='llama-2-7b-chat-turbomind', path='meta-llama/Llama-2-7b-chat-hf', engine_config=dict(max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048), max_seq_len=4096, - max_out_len=1024, + max_out_len=2048, batch_size=16, run_cfg=dict(num_gpus=1), ) diff --git a/opencompass/configs/models/hf_llama/lmdeploy_llama3_1_70b_instruct.py b/opencompass/configs/models/hf_llama/lmdeploy_llama3_1_70b_instruct.py index 23f9bc2a1..9674169f5 100644 --- a/opencompass/configs/models/hf_llama/lmdeploy_llama3_1_70b_instruct.py +++ b/opencompass/configs/models/hf_llama/lmdeploy_llama3_1_70b_instruct.py @@ -6,9 +6,9 @@ abbr='llama-3_1-70b-instruct-turbomind', path='meta-llama/Meta-Llama-3.1-70B-Instruct', engine_config=dict(max_batch_size=16, tp=4), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=4), stop_words=['<|end_of_text|>', '<|eot_id|>'], diff --git a/opencompass/configs/models/hf_llama/lmdeploy_llama3_1_8b_instruct.py b/opencompass/configs/models/hf_llama/lmdeploy_llama3_1_8b_instruct.py index 429dfec72..2754eb835 100644 --- a/opencompass/configs/models/hf_llama/lmdeploy_llama3_1_8b_instruct.py +++ b/opencompass/configs/models/hf_llama/lmdeploy_llama3_1_8b_instruct.py @@ -6,9 +6,9 @@ abbr='llama-3_1-8b-instruct-turbomind', path='meta-llama/Meta-Llama-3.1-8B-Instruct', engine_config=dict(max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=1), stop_words=['<|end_of_text|>', '<|eot_id|>'], diff --git a/opencompass/configs/models/hf_llama/lmdeploy_llama3_70b_instruct.py b/opencompass/configs/models/hf_llama/lmdeploy_llama3_70b_instruct.py index 333dc0153..12fc944c7 100644 --- a/opencompass/configs/models/hf_llama/lmdeploy_llama3_70b_instruct.py +++ 
b/opencompass/configs/models/hf_llama/lmdeploy_llama3_70b_instruct.py @@ -6,9 +6,9 @@ abbr='llama-3-70b-instruct-turbomind', path='meta-llama/Meta-Llama-3-70B-Instruct', engine_config=dict(max_batch_size=16, tp=4), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=4), stop_words=['<|end_of_text|>', '<|eot_id|>'], diff --git a/opencompass/configs/models/hf_llama/lmdeploy_llama3_8b_instruct.py b/opencompass/configs/models/hf_llama/lmdeploy_llama3_8b_instruct.py index cc5b3bd45..5a6431b7a 100644 --- a/opencompass/configs/models/hf_llama/lmdeploy_llama3_8b_instruct.py +++ b/opencompass/configs/models/hf_llama/lmdeploy_llama3_8b_instruct.py @@ -6,9 +6,9 @@ abbr='llama-3-8b-instruct-turbomind', path='meta-llama/Meta-Llama-3-8B-Instruct', engine_config=dict(max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=1), stop_words=['<|end_of_text|>', '<|eot_id|>'], diff --git a/opencompass/configs/models/mistral/lmdeploy_mistral_7b_instruct_v0_3.py b/opencompass/configs/models/mistral/lmdeploy_mistral_7b_instruct_v0_3.py new file mode 100644 index 000000000..4c867b602 --- /dev/null +++ b/opencompass/configs/models/mistral/lmdeploy_mistral_7b_instruct_v0_3.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='mistral-7b-instruct-v0.3-turbomind', + path='mistralai/Mistral-7B-Instruct-v0.3', + engine_config=dict(session_len=32768, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=32768, + max_out_len=4096, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/opencompass/configs/models/mistral/lmdeploy_mixtral_large_instruct_2407.py b/opencompass/configs/models/mistral/lmdeploy_mixtral_large_instruct_2407.py new file mode 100644 index 000000000..e79a1f73a --- /dev/null +++ b/opencompass/configs/models/mistral/lmdeploy_mixtral_large_instruct_2407.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='mixtral-large-instruct-2407-turbomind', + path='mistralai/Mistral-Large-Instruct-2407', + engine_config=dict(session_len=32768, max_batch_size=16, tp=4), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=32768, + max_out_len=4096, + batch_size=16, + run_cfg=dict(num_gpus=4), + ) +] diff --git a/opencompass/configs/models/openai/o1_mini_2024_09_12.py b/opencompass/configs/models/openai/o1_mini_2024_09_12.py new file mode 100644 index 000000000..331ecf319 --- /dev/null +++ b/opencompass/configs/models/openai/o1_mini_2024_09_12.py @@ -0,0 +1,20 @@ +from opencompass.models import OpenAISDK + +api_meta_template = dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), +], ) + +models = [ + dict( + abbr='o1-mini-2024-09-12', + type=OpenAISDK, + path='o1-mini-2024-09-12', + key= + 'ENV', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well + 
meta_template=api_meta_template, + query_per_second=1, + batch_size=1, + temperature=1, + max_completion_tokens=8192), # you can increase it for reasoning-heavy tasks; see: https://platform.openai.com/docs/guides/reasoning +] diff --git a/opencompass/configs/models/openai/o1_preview_2024_09_12.py b/opencompass/configs/models/openai/o1_preview_2024_09_12.py new file mode 100644 index 000000000..9dff10371 --- /dev/null +++ b/opencompass/configs/models/openai/o1_preview_2024_09_12.py @@ -0,0 +1,20 @@ +from opencompass.models import OpenAISDK + +api_meta_template = dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), +], ) + +models = [ + dict( + abbr='o1-preview-2024-09-12', + type=OpenAISDK, + path='o1-preview-2024-09-12', + key= + 'ENV', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well + meta_template=api_meta_template, + query_per_second=1, + batch_size=1, + temperature=1, + max_completion_tokens=8192), # you can increase it for reasoning-heavy tasks; see: https://platform.openai.com/docs/guides/reasoning +] diff --git a/opencompass/configs/models/qwen/lmdeploy_qwen1_5_110b_chat.py b/opencompass/configs/models/qwen/lmdeploy_qwen1_5_110b_chat.py index 9b92b8140..bc123b405 100644 --- a/opencompass/configs/models/qwen/lmdeploy_qwen1_5_110b_chat.py +++ b/opencompass/configs/models/qwen/lmdeploy_qwen1_5_110b_chat.py @@ -5,10 +5,10 @@ type=TurboMindModelwithChatTemplate, abbr='qwen1.5-110b-chat-turbomind', path='Qwen/Qwen1.5-110B-Chat', - engine_config=dict(session_len=7168, max_batch_size=8, tp=4), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + engine_config=dict(session_len=16384, max_batch_size=8, tp=4), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=8, run_cfg=dict(num_gpus=4), stop_words=['<|im_end|>', '<|im_start|>'], diff --git a/opencompass/configs/models/qwen/lmdeploy_qwen1_5_14b_chat.py b/opencompass/configs/models/qwen/lmdeploy_qwen1_5_14b_chat.py index d2b85c2aa..5f0d54b96 100644 --- a/opencompass/configs/models/qwen/lmdeploy_qwen1_5_14b_chat.py +++ b/opencompass/configs/models/qwen/lmdeploy_qwen1_5_14b_chat.py @@ -5,10 +5,10 @@ type=TurboMindModelwithChatTemplate, abbr='qwen1.5-14b-chat-turbomind', path='Qwen/Qwen1.5-14B-Chat', - engine_config=dict(session_len=7168, max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + engine_config=dict(session_len=16384, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=1), stop_words=['<|im_end|>', '<|im_start|>'], diff --git a/opencompass/configs/models/qwen/lmdeploy_qwen1_5_1_8b_chat.py b/opencompass/configs/models/qwen/lmdeploy_qwen1_5_1_8b_chat.py index ff28ac0be..803ff3336 100644 --- a/opencompass/configs/models/qwen/lmdeploy_qwen1_5_1_8b_chat.py +++ b/opencompass/configs/models/qwen/lmdeploy_qwen1_5_1_8b_chat.py @@ -5,10 +5,10 @@ type=TurboMindModelwithChatTemplate, abbr='qwen1.5-1.8b-chat-turbomind', path='Qwen/Qwen1.5-1.8B-Chat', - engine_config=dict(session_len=7168, max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, +
engine_config=dict(session_len=16384, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=1), stop_words=['<|im_end|>', '<|im_start|>'], diff --git a/opencompass/configs/models/qwen/lmdeploy_qwen1_5_32b_chat.py b/opencompass/configs/models/qwen/lmdeploy_qwen1_5_32b_chat.py index 1196548a0..96fd1e43c 100644 --- a/opencompass/configs/models/qwen/lmdeploy_qwen1_5_32b_chat.py +++ b/opencompass/configs/models/qwen/lmdeploy_qwen1_5_32b_chat.py @@ -5,10 +5,10 @@ type=TurboMindModelwithChatTemplate, abbr='qwen1.5-32b-chat-turbomind', path='Qwen/Qwen1.5-32B-Chat', - engine_config=dict(session_len=7168, max_batch_size=16, tp=2), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + engine_config=dict(session_len=16384, max_batch_size=16, tp=2), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=2), stop_words=['<|im_end|>', '<|im_start|>'], diff --git a/opencompass/configs/models/qwen/lmdeploy_qwen1_5_4b_chat.py b/opencompass/configs/models/qwen/lmdeploy_qwen1_5_4b_chat.py index bde14a295..f9fcc3fb9 100644 --- a/opencompass/configs/models/qwen/lmdeploy_qwen1_5_4b_chat.py +++ b/opencompass/configs/models/qwen/lmdeploy_qwen1_5_4b_chat.py @@ -5,10 +5,10 @@ type=TurboMindModelwithChatTemplate, abbr='qwen1.5-4b-chat-turbomind', path='Qwen/Qwen1.5-4B-Chat', - engine_config=dict(session_len=7168, max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + engine_config=dict(session_len=16384, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=1), stop_words=['<|im_end|>', '<|im_start|>'], diff --git a/opencompass/configs/models/qwen/lmdeploy_qwen1_5_72b_chat.py b/opencompass/configs/models/qwen/lmdeploy_qwen1_5_72b_chat.py index 38175eaf3..64a5f7cb6 100644 --- a/opencompass/configs/models/qwen/lmdeploy_qwen1_5_72b_chat.py +++ b/opencompass/configs/models/qwen/lmdeploy_qwen1_5_72b_chat.py @@ -5,10 +5,10 @@ type=TurboMindModelwithChatTemplate, abbr='qwen1.5-72b-chat-turbomind', path='Qwen/Qwen1.5-72B-Chat', - engine_config=dict(session_len=7168, max_batch_size=16, tp=4), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + engine_config=dict(session_len=16384, max_batch_size=16, tp=4), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=4), stop_words=['<|im_end|>', '<|im_start|>'], diff --git a/opencompass/configs/models/qwen/lmdeploy_qwen1_5_7b_chat.py b/opencompass/configs/models/qwen/lmdeploy_qwen1_5_7b_chat.py index ca733c0b2..1ab393036 100644 --- a/opencompass/configs/models/qwen/lmdeploy_qwen1_5_7b_chat.py +++ b/opencompass/configs/models/qwen/lmdeploy_qwen1_5_7b_chat.py @@ -5,10 +5,10 @@ type=TurboMindModelwithChatTemplate, abbr='qwen1.5-7b-chat-turbomind', path='Qwen/Qwen1.5-7B-Chat', - engine_config=dict(session_len=7168, max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + engine_config=dict(session_len=16384, max_batch_size=16, 
tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=1), stop_words=['<|im_end|>', '<|im_start|>'], diff --git a/opencompass/configs/models/qwen/lmdeploy_qwen2_1_5b_instruct.py b/opencompass/configs/models/qwen/lmdeploy_qwen2_1_5b_instruct.py index 502de1876..f050ca382 100644 --- a/opencompass/configs/models/qwen/lmdeploy_qwen2_1_5b_instruct.py +++ b/opencompass/configs/models/qwen/lmdeploy_qwen2_1_5b_instruct.py @@ -5,10 +5,10 @@ type=TurboMindModelwithChatTemplate, abbr='qwen2-1.5b-instruct-turbomind', path='Qwen/Qwen2-1.5B-Instruct', - engine_config=dict(session_len=7168, max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + engine_config=dict(session_len=16384, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=1), ) diff --git a/opencompass/configs/models/qwen/lmdeploy_qwen2_72b_instruct.py b/opencompass/configs/models/qwen/lmdeploy_qwen2_72b_instruct.py index 69ecb7981..c29482b5b 100644 --- a/opencompass/configs/models/qwen/lmdeploy_qwen2_72b_instruct.py +++ b/opencompass/configs/models/qwen/lmdeploy_qwen2_72b_instruct.py @@ -5,10 +5,10 @@ type=TurboMindModelwithChatTemplate, abbr='qwen2-72b-instruct-turbomind', path='Qwen/Qwen2-72B-Instruct', - engine_config=dict(session_len=7168, max_batch_size=16, tp=4), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + engine_config=dict(session_len=16384, max_batch_size=16, tp=4), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=4), ) diff --git a/opencompass/configs/models/qwen/lmdeploy_qwen2_7b_instruct.py b/opencompass/configs/models/qwen/lmdeploy_qwen2_7b_instruct.py index 4dff85e06..05fa25c5e 100644 --- a/opencompass/configs/models/qwen/lmdeploy_qwen2_7b_instruct.py +++ b/opencompass/configs/models/qwen/lmdeploy_qwen2_7b_instruct.py @@ -5,10 +5,10 @@ type=TurboMindModelwithChatTemplate, abbr='qwen2-7b-instruct-turbomind', path='Qwen/Qwen2-7B-Instruct', - engine_config=dict(session_len=7168, max_batch_size=16, tp=1), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), - max_seq_len=7168, - max_out_len=1024, + engine_config=dict(session_len=16384, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, batch_size=16, run_cfg=dict(num_gpus=1), ) diff --git a/opencompass/configs/models/qwen2_5/hf_qwen2_5_0_5b_instruct.py b/opencompass/configs/models/qwen2_5/hf_qwen2_5_0_5b_instruct.py new file mode 100644 index 000000000..35289bb10 --- /dev/null +++ b/opencompass/configs/models/qwen2_5/hf_qwen2_5_0_5b_instruct.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='qwen2.5-0.5b-instruct-hf', + path='Qwen/Qwen2.5-0.5B-Instruct', + max_out_len=4096, + batch_size=8, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/opencompass/configs/models/qwen2_5/hf_qwen2_5_14b_instruct.py b/opencompass/configs/models/qwen2_5/hf_qwen2_5_14b_instruct.py new file mode 100644 index 000000000..af5a8816a --- /dev/null +++ 
b/opencompass/configs/models/qwen2_5/hf_qwen2_5_14b_instruct.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='qwen2.5-14b-instruct-hf', + path='Qwen/Qwen2.5-14B-Instruct', + max_out_len=4096, + batch_size=8, + run_cfg=dict(num_gpus=2), + ) +] diff --git a/opencompass/configs/models/qwen2_5/hf_qwen2_5_1_5b_instruct.py b/opencompass/configs/models/qwen2_5/hf_qwen2_5_1_5b_instruct.py new file mode 100644 index 000000000..52da52895 --- /dev/null +++ b/opencompass/configs/models/qwen2_5/hf_qwen2_5_1_5b_instruct.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='qwen2.5-1.5b-instruct-hf', + path='Qwen/Qwen2.5-1.5B-Instruct', + max_out_len=4096, + batch_size=8, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/opencompass/configs/models/qwen2_5/hf_qwen2_5_32b_instruct.py b/opencompass/configs/models/qwen2_5/hf_qwen2_5_32b_instruct.py new file mode 100644 index 000000000..f2051f810 --- /dev/null +++ b/opencompass/configs/models/qwen2_5/hf_qwen2_5_32b_instruct.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='qwen2.5-32b-instruct-hf', + path='Qwen/Qwen2.5-32B-Instruct', + max_out_len=4096, + batch_size=8, + run_cfg=dict(num_gpus=2), + ) +] diff --git a/opencompass/configs/models/qwen2_5/hf_qwen2_5_3b_instruct.py b/opencompass/configs/models/qwen2_5/hf_qwen2_5_3b_instruct.py new file mode 100644 index 000000000..88a101994 --- /dev/null +++ b/opencompass/configs/models/qwen2_5/hf_qwen2_5_3b_instruct.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='qwen2.5-3b-instruct-hf', + path='Qwen/Qwen2.5-3B-Instruct', + max_out_len=4096, + batch_size=8, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/opencompass/configs/models/qwen2_5/hf_qwen2_5_72b_instruct.py b/opencompass/configs/models/qwen2_5/hf_qwen2_5_72b_instruct.py new file mode 100644 index 000000000..c7fcf53ca --- /dev/null +++ b/opencompass/configs/models/qwen2_5/hf_qwen2_5_72b_instruct.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='qwen2.5-72b-instruct-hf', + path='Qwen/Qwen2.5-72B-Instruct', + max_out_len=4096, + batch_size=8, + run_cfg=dict(num_gpus=4), + ) +] diff --git a/opencompass/configs/models/qwen2_5/hf_qwen2_5_7b_instruct.py b/opencompass/configs/models/qwen2_5/hf_qwen2_5_7b_instruct.py new file mode 100644 index 000000000..a9895be67 --- /dev/null +++ b/opencompass/configs/models/qwen2_5/hf_qwen2_5_7b_instruct.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='qwen2.5-7b-instruct-hf', + path='Qwen/Qwen2.5-7B-Instruct', + max_out_len=4096, + batch_size=8, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_0_5b_instruct.py b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_0_5b_instruct.py new file mode 100644 index 000000000..145549630 --- /dev/null +++ b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_0_5b_instruct.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='qwen2.5-0.5b-instruct-turbomind', +
path='Qwen/Qwen2.5-0.5B-Instruct', + engine_config=dict(session_len=16384, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_14b_instruct.py b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_14b_instruct.py new file mode 100644 index 000000000..364690288 --- /dev/null +++ b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_14b_instruct.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='qwen2.5-14b-instruct-turbomind', + path='Qwen/Qwen2.5-14B-Instruct', + engine_config=dict(session_len=16384, max_batch_size=16, tp=2), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, + batch_size=16, + run_cfg=dict(num_gpus=2), + ) +] diff --git a/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b.py b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b.py new file mode 100644 index 000000000..a2661c9fd --- /dev/null +++ b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='qwen2.5-1.5b-turbomind', + path='Qwen/Qwen2.5-1.5B', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b_instruct.py b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b_instruct.py new file mode 100644 index 000000000..9b2f6c5fd --- /dev/null +++ b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_1_5b_instruct.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='qwen2.5-1.5b-instruct-turbomind', + path='Qwen/Qwen2.5-1.5B-Instruct', + engine_config=dict(session_len=16384, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_32b_instruct.py b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_32b_instruct.py new file mode 100644 index 000000000..81e0cb0f1 --- /dev/null +++ b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_32b_instruct.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='qwen2.5-32b-instruct-turbomind', + path='Qwen/Qwen2.5-32B-Instruct', + engine_config=dict(session_len=16384, max_batch_size=16, tp=2), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, + batch_size=16, + run_cfg=dict(num_gpus=2), + ) +] diff --git a/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_3b_instruct.py b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_3b_instruct.py new file mode 100644 index 000000000..bab4b32ea --- /dev/null +++ b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_3b_instruct.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + 
dict( + type=TurboMindModelwithChatTemplate, + abbr='qwen2.5-3b-instruct-turbomind', + path='Qwen/Qwen2.5-3B-Instruct', + engine_config=dict(session_len=16384, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_72b_instruct.py b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_72b_instruct.py new file mode 100644 index 000000000..8b61daa8a --- /dev/null +++ b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_72b_instruct.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='qwen2.5-72b-instruct-turbomind', + path='Qwen/Qwen2.5-72B-Instruct', + engine_config=dict(session_len=16384, max_batch_size=16, tp=4), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, + batch_size=16, + run_cfg=dict(num_gpus=4), + ) +] diff --git a/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_7b.py b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_7b.py new file mode 100644 index 000000000..b2d7aa0c5 --- /dev/null +++ b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_7b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='qwen2.5-7b-turbomind', + path='Qwen/Qwen2.5-7B', + engine_config=dict(session_len=7168, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=7168, + max_out_len=1024, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_7b_instruct.py b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_7b_instruct.py new file mode 100644 index 000000000..54732521f --- /dev/null +++ b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_7b_instruct.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='qwen2.5-7b-instruct-turbomind', + path='Qwen/Qwen2.5-7B-Instruct', + engine_config=dict(session_len=16384, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096), + max_seq_len=16384, + max_out_len=4096, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/opencompass/configs/models/qwen2_5/vllm_qwen2_5_0_5b_instruct.py b/opencompass/configs/models/qwen2_5/vllm_qwen2_5_0_5b_instruct.py new file mode 100644 index 000000000..effd12018 --- /dev/null +++ b/opencompass/configs/models/qwen2_5/vllm_qwen2_5_0_5b_instruct.py @@ -0,0 +1,14 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='qwen2.5-0.5b-instruct-vllm', + path='Qwen/Qwen2.5-0.5B-Instruct', + model_kwargs=dict(tensor_parallel_size=1, gpu_memory_utilization=0.5), + max_out_len=4096, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/opencompass/configs/models/qwen2_5/vllm_qwen2_5_14b_instruct.py b/opencompass/configs/models/qwen2_5/vllm_qwen2_5_14b_instruct.py new file mode 100644 index 000000000..438279f15 --- /dev/null +++ b/opencompass/configs/models/qwen2_5/vllm_qwen2_5_14b_instruct.py @@ -0,0 +1,14 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + 
abbr='qwen2.5-14b-instruct-vllm', + path='Qwen/Qwen2.5-14B-Instruct', + model_kwargs=dict(tensor_parallel_size=2), + max_out_len=4096, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=2), + ) +] diff --git a/opencompass/configs/models/qwen2_5/vllm_qwen2_5_1_5b_instruct.py b/opencompass/configs/models/qwen2_5/vllm_qwen2_5_1_5b_instruct.py new file mode 100644 index 000000000..16b7f809b --- /dev/null +++ b/opencompass/configs/models/qwen2_5/vllm_qwen2_5_1_5b_instruct.py @@ -0,0 +1,14 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='qwen2.5-1.5b-instruct-vllm', + path='Qwen/Qwen2.5-1.5B-Instruct', + model_kwargs=dict(tensor_parallel_size=1, gpu_memory_utilization=0.5), + max_out_len=4096, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/opencompass/configs/models/qwen2_5/vllm_qwen2_5_32b_instruct.py b/opencompass/configs/models/qwen2_5/vllm_qwen2_5_32b_instruct.py new file mode 100644 index 000000000..58d518459 --- /dev/null +++ b/opencompass/configs/models/qwen2_5/vllm_qwen2_5_32b_instruct.py @@ -0,0 +1,14 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='qwen2.5-32b-instruct-vllm', + path='Qwen/Qwen2.5-32B-Instruct', + model_kwargs=dict(tensor_parallel_size=2), + max_out_len=4096, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=2), + ) +] diff --git a/opencompass/configs/models/qwen2_5/vllm_qwen2_5_3b_instruct.py b/opencompass/configs/models/qwen2_5/vllm_qwen2_5_3b_instruct.py new file mode 100644 index 000000000..e24c7c1f8 --- /dev/null +++ b/opencompass/configs/models/qwen2_5/vllm_qwen2_5_3b_instruct.py @@ -0,0 +1,14 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='qwen2.5-3b-instruct-vllm', + path='Qwen/Qwen2.5-3B-Instruct', + model_kwargs=dict(tensor_parallel_size=1, gpu_memory_utilization=0.5), + max_out_len=4096, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/opencompass/configs/models/qwen2_5/vllm_qwen2_5_72b_instruct.py b/opencompass/configs/models/qwen2_5/vllm_qwen2_5_72b_instruct.py new file mode 100644 index 000000000..c1a557901 --- /dev/null +++ b/opencompass/configs/models/qwen2_5/vllm_qwen2_5_72b_instruct.py @@ -0,0 +1,14 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='qwen2_5-72b-instruct-vllm', + path='Qwen/Qwen2.5-72B-Instruct', + model_kwargs=dict(tensor_parallel_size=4), + max_out_len=4096, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=4), + ) +] diff --git a/opencompass/configs/models/qwen2_5/vllm_qwen2_5_7b_instruct.py b/opencompass/configs/models/qwen2_5/vllm_qwen2_5_7b_instruct.py new file mode 100644 index 000000000..25c38d0f8 --- /dev/null +++ b/opencompass/configs/models/qwen2_5/vllm_qwen2_5_7b_instruct.py @@ -0,0 +1,14 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='qwen2.5-7b-instruct-vllm', + path='Qwen/Qwen2.5-7B-Instruct', + model_kwargs=dict(tensor_parallel_size=1), + max_out_len=4096, + batch_size=16, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/opencompass/configs/models/yi/lmdeploy_yi_1_5_34b_chat.py b/opencompass/configs/models/yi/lmdeploy_yi_1_5_34b_chat.py new file mode 
100644 index 000000000..d296a1008 --- /dev/null +++ b/opencompass/configs/models/yi/lmdeploy_yi_1_5_34b_chat.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='yi-1.5-34b-chat-turbomind', + path='01-ai/Yi-1.5-34B-Chat', + engine_config=dict(session_len=4096, max_batch_size=16, tp=2), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048), + max_seq_len=4096, + max_out_len=2048, + batch_size=16, + run_cfg=dict(num_gpus=2), + ) +] diff --git a/opencompass/configs/models/yi/lmdeploy_yi_1_5_6b_chat.py b/opencompass/configs/models/yi/lmdeploy_yi_1_5_6b_chat.py new file mode 100644 index 000000000..eeaf8ea25 --- /dev/null +++ b/opencompass/configs/models/yi/lmdeploy_yi_1_5_6b_chat.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='yi-1.5-6b-chat-turbomind', + path='01-ai/Yi-1.5-6B-Chat', + engine_config=dict(session_len=4096, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048), + max_seq_len=4096, + max_out_len=2048, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/opencompass/configs/models/yi/lmdeploy_yi_1_5_9b_chat.py b/opencompass/configs/models/yi/lmdeploy_yi_1_5_9b_chat.py new file mode 100644 index 000000000..4e33ba232 --- /dev/null +++ b/opencompass/configs/models/yi/lmdeploy_yi_1_5_9b_chat.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='yi-1.5-9b-chat-turbomind', + path='01-ai/Yi-1.5-9B-Chat', + engine_config=dict(session_len=4096, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048), + max_seq_len=4096, + max_out_len=2048, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/opencompass/configs/models/yi/lmdeploy_yi_34b_chat.py b/opencompass/configs/models/yi/lmdeploy_yi_34b_chat.py new file mode 100644 index 000000000..5ed603a6d --- /dev/null +++ b/opencompass/configs/models/yi/lmdeploy_yi_34b_chat.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='yi-34b-chat-turbomind', + path='01-ai/Yi-34B-Chat', + engine_config=dict(session_len=4096, max_batch_size=16, tp=2), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048), + max_seq_len=4096, + max_out_len=2048, + batch_size=16, + run_cfg=dict(num_gpus=2), + ) +] diff --git a/opencompass/configs/models/yi/lmdeploy_yi_6b_chat.py b/opencompass/configs/models/yi/lmdeploy_yi_6b_chat.py new file mode 100644 index 000000000..5c75bfa50 --- /dev/null +++ b/opencompass/configs/models/yi/lmdeploy_yi_6b_chat.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='yi-6b-chat-turbomind', + path='01-ai/Yi-6B-Chat', + engine_config=dict(session_len=4096, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048), + max_seq_len=4096, + max_out_len=2048, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/opencompass/configs/summarizers/groups/cibench.py b/opencompass/configs/summarizers/groups/cibench.py index f08bcba51..8c6650c26 100644 --- a/opencompass/configs/summarizers/groups/cibench.py +++ 
b/opencompass/configs/summarizers/groups/cibench.py @@ -3,7 +3,7 @@ _cibench_generation = ['cibench_generation/' + i for i in _cibench_generation_modules] cibench_summary_groups = [] _cibench_generation_weight = { - 'matplotlib': [223, 50, 1, 156], + 'matplotlib': [175, 2, 1, 156], 'pandas': [200, 45, 45, 38], 'pytorch': [69, 0, 8, 11], 'seaborn': [130, 0, 2, 106], diff --git a/opencompass/datasets/GaokaoBench.py b/opencompass/datasets/GaokaoBench.py index 383845356..d3cd31a00 100644 --- a/opencompass/datasets/GaokaoBench.py +++ b/opencompass/datasets/GaokaoBench.py @@ -16,7 +16,7 @@ class GaokaoBenchDataset(BaseDataset): @staticmethod def load(path: str, name: str): - data = get_data_path(path, local_mode=True) + path = get_data_path(path, local_mode=True) if environ.get('DATASET_SOURCE') == 'ModelScope': from modelscope import MsDataset return MsDataset.load(path, subset_name=name, split='test') diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index a1f201efd..dbeaca4a5 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -33,6 +33,7 @@ from .csl import * # noqa: F401, F403 from .custom import * # noqa: F401, F403 from .cvalues import * # noqa: F401, F403 +from .dingo import * # noqa: F401, F403 from .drcd import * # noqa: F401, F403 from .drop import * # noqa: F401, F403 from .drop_simple_eval import * # noqa: F401, F403 @@ -40,9 +41,9 @@ from .ds1000_interpreter import * # noqa: F401, F403 from .eprstmt import * # noqa: F401, F403 from .FinanceIQ import * # noqa: F401, F403 -from .flames import * # noqa: F401, F403 from .flores import * # noqa: F401, F403 from .game24 import * # noqa: F401, F403 +from .gaokao_math import * # noqa: F401, F403 from .GaokaoBench import * # noqa: F401, F403 from .govrepcrs import * # noqa: F401, F403 from .gpqa import * # noqa: F401, F403 diff --git a/opencompass/datasets/dingo.py b/opencompass/datasets/dingo.py new file mode 100644 index 000000000..753d78ddb --- /dev/null +++ b/opencompass/datasets/dingo.py @@ -0,0 +1,84 @@ +# flake8: noqa +# yapf: disable +import csv +import json +import os +import time +from typing import List + +from datasets import Dataset + +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class DingoDataset(BaseDataset): + + @staticmethod + def load(path: str): + raw_data = [] + with open(path, encoding='utf-8') as f: + reader = csv.reader(f, delimiter=';') + for row in reader: + if len(row) < 1: + row = [''] + raw_data.append({'input': row[0]}) + return Dataset.from_list(raw_data) + + +@LOAD_DATASET.register_module() +class DingoLongDataset(BaseDataset): + + @staticmethod + def load(path: str): + raw_data = [] + with open(path, 'r', encoding='utf-8') as f: + for line in f: + raw_data.append({'input': json.loads(line).get('input')}) + return Dataset.from_list(raw_data) + + +@ICL_EVALUATORS.register_module() +class DingoEvaluator(BaseEvaluator): + + def score(self, origin_prompt: List, predictions: List) -> dict: + try: + # from dingo.model.model import Model + from dingo.exec import Executor + from dingo.io import InputArgs + except Exception: + raise ModuleNotFoundError( + '=========== ' + 'dingo registration failed. Please try: pip install dingo-python.'
+ ' ===========') + + current_time = time.strftime('%Y%m%d_%H%M%S', time.localtime()) + file_data = [{'prompt': pmt, 'prediction': prd} + for pmt, prd in zip(origin_prompt, predictions)] + file_name = 'dingo_file_' + current_time + '.jsonl' + with open(file_name, 'a', encoding='utf-8') as f: + for d in file_data: + json.dump(d, f, ensure_ascii=False) + f.write('\n') + + input_data = { + 'eval_models': ['llm_base'], + 'input_path': file_name, + 'output_path': './outputs/dingo/', + 'dataset': 'local', + 'datasource': 'local', + 'data_format': 'jsonl', + 'column_prompt': ['prompt'], + 'column_content': ['prediction'], + } + # Model.apply_config(input_data["custom_config_path"]) + input_args = InputArgs(**input_data) + executor = Executor.exec_map['local'](input_args) + result = executor.execute() + summary = result[0].to_dict() + + os.remove(file_name) + return summary diff --git a/opencompass/datasets/gaokao_math.py b/opencompass/datasets/gaokao_math.py new file mode 100644 index 000000000..87840b71d --- /dev/null +++ b/opencompass/datasets/gaokao_math.py @@ -0,0 +1,144 @@ +import concurrent.futures +import json +import re + +from datasets import Dataset + +from opencompass.models import OpenAISDK +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET, MODELS + +from .base import BaseDataset + +# from opencompass.utils import get_data_path + + +EVAL_PROMPT = """ +请你作为一个数学高考阅卷专家,判断下面的答案是否与标准答案一致,即考生是否回答正确。下面是一些评判标准: +1. 有些答案可能包含多项内容,可能有单选题,多选题,填空题等,只要答案与标准答案一致即可, 对于多选题和多个空的填空题,需要考生对应的选项或空都回答正确才算正确。 +2. 有些答案可能通过不同的方式表达,比如有些答案可能是一个数学表达式,有些答案可能是一个文字描述,只要表达的意思一致即可。且有些公式通过不同的方式表达,但等价,也是正确的。 +3. 你不需要重新计算问题答案,因为标准答案已经给出,只需要根据问题形式来判断考生的答案是否与标准答案一致,是否正确即可。 + +请你根据上述标准,判断下面的答案是否与标准答案一致,如果一致,请在最后输出\\boxed{{yes}}, 否则输出\\boxed{{no}}, 如果难以判断,请输出\\boxed{{no}}. 
+原问题:{question} +标准答案:{gold_answer} +考生答案:{answer} + +分析: +""" # noqa E501 + + +def extract_boxed_answer(text): + match = re.findall(r'\\boxed{(.+?)}', text) + if match: + return match[-1] + return None + + +@LOAD_DATASET.register_module() +class GaoKaoMATHDataset(BaseDataset): + + @staticmethod + def load(path: str): + # path = get_data_path(path, local_mode=True) + data = json.load(open(path)) + for i in range(len(data)): + data[i]['extract_answer'] = str(data[i]['extract_answer']) + dataset = Dataset.from_list(data) + return dataset + + +api_meta_template = dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), +]) + + +@ICL_EVALUATORS.register_module() +class GaoKaoMATHEvaluator(BaseEvaluator): + + def __init__(self, model_name, url, **kwargs): + if isinstance(url, str): + url = [url] + + self.model = [ + MODELS.build( + dict( + type=OpenAISDK, + path=model_name, + openai_api_base=url, + key='EMPTY', + query_per_second=1, + meta_template=api_meta_template, + temperature=kwargs.get('temperature', 0.01), + max_seq_len=kwargs.get('max_tokens', 8192), + )) for url in url + ] + + def batch_response(self, inputs): + batch_num = len(self.model) + batch_size = (len(inputs) + batch_num - 1) // batch_num + result_responses = [] + + with concurrent.futures.ThreadPoolExecutor( + max_workers=batch_num) as executor: + futures = [ + executor.submit(self.model[i].generate, + inputs[i * batch_size:(i + 1) * batch_size]) + for i in range(batch_num) + ] + for response in executor.map(lambda f: f.result(), futures): + result_responses.extend(response) + + return result_responses + + def score(self, predictions, references, origin_prompt): + if len(predictions) != len(references): + return {'error': 'preds and refrs have different length'} + questions = [item[0]['prompt'] for item in origin_prompt] + count = 0 + correct = 0 + details = [] + results = [] + inputs = [] + for pred, ref, ques in zip(predictions, references, questions): + inputs.append( + EVAL_PROMPT.format(answer=pred, gold_answer=ref, + question=ques)) + + result_responses = self.batch_response(inputs) + results = [ + extract_boxed_answer(result) == 'yes' + for result in result_responses + ] + for pred, ref, result, result_response in zip(predictions, references, + results, + result_responses): + detail = { + 'pred': pred, + 'answer': ref, + 'correct': False, + 'eval_model_response': result_response + } + count += 1 + if result: + correct += 1 + detail['correct'] = True + details.append(detail) + + detailed_result = { + 'accuracy': 100 * correct / count, + 'details': details + } + + return detailed_result + + +if __name__ == '__main__': + evaluator = GaoKaoMATHEvaluator('http://0.0.0.0:23333/v1', + temperature=0.01, + max_tokens=2048, + procs=8) + predictions = ['1', '2', '3'] + references = ['1', '2', '3'] + evaluator.score(predictions, references) diff --git a/opencompass/datasets/mmlu_pro.py b/opencompass/datasets/mmlu_pro.py index d5e2a4e4a..0f2957b80 100644 --- a/opencompass/datasets/mmlu_pro.py +++ b/opencompass/datasets/mmlu_pro.py @@ -3,19 +3,26 @@ from datasets import load_dataset +from opencompass.openicl import BaseEvaluator from opencompass.registry import LOAD_DATASET from opencompass.utils import get_data_path from .base import BaseDataset +CHOICES=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P'] def _parse(item): - choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P'] + s = '' + item['answer_string'] = '' for i, 
opt in enumerate(item['options']): if opt == 'N/A': continue - s += '{}. {}\n'.format(choices[i], opt) + option = '{}. {}\n'.format(CHOICES[i], opt) + s += option + if item['answer'] == CHOICES[i]: + item['answer_string'] = option + item['options_str'] = s.strip() item['cot_content'] = item['cot_content'].removeprefix("A: Let's think step by step.").strip() return item @@ -31,3 +38,38 @@ def load(path: str, category: str): mmlu_pro = mmlu_pro.filter(lambda x: x['category'] == category) mmlu_pro = mmlu_pro.map(_parse) return mmlu_pro + +class MMLUProBaseEvaluator(BaseEvaluator): + + def is_equal(self, pred, refer): + try: + refer_option, refer_string = refer.split('. ') + if pred in CHOICES and refer_option == pred: + return True + elif refer_string.strip() == pred: + return True + else : + return False + except Exception: + pass + return False + + def score(self, predictions, references): + if len(predictions) != len(references): + return { + 'error': 'predictions and references have different ' + 'length' + } + correct = 0 + count = 0 + details = [] + for i, j in zip(predictions, references): + i = i.split('\n')[0].strip() + detail = {'pred': i, 'answer': j, 'correct': False} + count += 1 + if self.is_equal(i, j): + correct += 1 + detail['correct'] = True + details.append(detail) + result = {'accuracy': 100 * correct / count, 'details': details} + return result diff --git a/opencompass/datasets/subjective/__init__.py b/opencompass/datasets/subjective/__init__.py index 20120e463..cfba94423 100644 --- a/opencompass/datasets/subjective/__init__.py +++ b/opencompass/datasets/subjective/__init__.py @@ -13,6 +13,7 @@ from .corev2 import Corev2Dataset # noqa: F401, F403 from .creationbench import CreationBenchDataset # noqa: F401, F403 from .fofo import FofoDataset, fofo_postprocess # noqa: F401, F403 +from .flames import FlamesDataset # noqa: F401, F403 from .followbench import FollowBenchDataset # noqa: F401, F403 from .information_retrival import IRDataset # noqa: F401, F403 from .judgerbench import JudgerBenchDataset # noqa: F401, F403 diff --git a/opencompass/datasets/flames.py b/opencompass/datasets/subjective/flames.py similarity index 92% rename from opencompass/datasets/flames.py rename to opencompass/datasets/subjective/flames.py index de2202e93..0c48c4ff2 100644 --- a/opencompass/datasets/flames.py +++ b/opencompass/datasets/subjective/flames.py @@ -9,7 +9,7 @@ from opencompass.registry import LOAD_DATASET from opencompass.utils import get_data_path -from .subjective.subjective_cmp import SubjectiveCmpDataset +from .subjective_cmp import SubjectiveCmpDataset class Config: @@ -36,11 +36,7 @@ def prompt_construct(sample, config: Config): @LOAD_DATASET.register_module() class FlamesDataset(SubjectiveCmpDataset): - def load( - self, - path: str, - name: str, - ): + def load(self, path: str, name: str, *args, **kwargs): path = get_data_path(path, local_mode=True) config = Config(path, f'{name}_config.txt') diff --git a/opencompass/models/__init__.py b/opencompass/models/__init__.py index 0d384fed2..580402d46 100644 --- a/opencompass/models/__init__.py +++ b/opencompass/models/__init__.py @@ -3,6 +3,7 @@ from .alaya import AlayaLM # noqa: F401 from .baichuan_api import BaiChuan # noqa: F401 from .baidu_api import ERNIEBot # noqa: F401 +from .bailing_api_oc import BailingAPI # noqa: F401 from .base import BaseModel, LMTemplateParser # noqa: F401 from .base_api import APITemplateParser, BaseAPIModel # noqa: F401 from .bytedance_api import ByteDance # noqa: F401 @@ -20,11 +21,10 @@ from 
.huggingface_above_v4_33 import HuggingFacewithChatTemplate # noqa: F401 from .hunyuan_api import Hunyuan # noqa: F401 from .intern_model import InternLM # noqa: F401 +from .interntrain import InternTrain # noqa: F401 from .krgpt_api import KrGPT # noqa: F401 from .lightllm_api import LightllmAPI, LightllmChatAPI # noqa: F401 from .llama2 import Llama2, Llama2Chat # noqa: F401 -from .lmdeploy_pytorch import LmdeployPytorchModel # noqa: F401 -from .lmdeploy_tis import LmdeployTisModel # noqa: F401 from .minimax_api import MiniMax, MiniMaxChatCompletionV2 # noqa: F401 from .mistral_api import Mistral # noqa: F401 from .mixtral import Mixtral # noqa: F401 @@ -39,7 +39,6 @@ from .sensetime_api import SenseTime # noqa: F401 from .stepfun_api import StepFun # noqa: F401 from .turbomind import TurboMindModel # noqa: F401 -from .turbomind_tis import TurboMindTisModel # noqa: F401 from .turbomind_with_tf_above_v4_33 import \ TurboMindModelwithChatTemplate # noqa: F401 from .unigpt_api import UniGPT # noqa: F401 diff --git a/opencompass/models/bailing_api_oc.py b/opencompass/models/bailing_api_oc.py new file mode 100644 index 000000000..8e107556c --- /dev/null +++ b/opencompass/models/bailing_api_oc.py @@ -0,0 +1,225 @@ +import concurrent +import concurrent.futures +import os +import socket +import traceback +from typing import Dict, List, Optional, Union + +import requests +from requests.adapters import HTTPAdapter +from urllib3.connection import HTTPConnection + +try: + from retrying import retry +except ImportError: + retry = None + +from opencompass.utils.prompt import PromptList + +from .base_api import BaseAPIModel + +PromptType = Union[PromptList, str] + + +class HTTPAdapterWithSocketOptions(HTTPAdapter): + + def __init__(self, *args, **kwargs): + self._socket_options = HTTPConnection.default_socket_options + [ + (socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1), + (socket.SOL_TCP, socket.TCP_KEEPIDLE, 75), + (socket.SOL_TCP, socket.TCP_KEEPINTVL, 30), + (socket.SOL_TCP, socket.TCP_KEEPCNT, 120), + ] + super(HTTPAdapterWithSocketOptions, self).__init__(*args, **kwargs) + + def init_poolmanager(self, *args, **kwargs): + if self._socket_options is not None: + kwargs['socket_options'] = self._socket_options + super(HTTPAdapterWithSocketOptions, + self).init_poolmanager(*args, **kwargs) + + +class BailingAPI(BaseAPIModel): + """Model wrapper around Bailing Service. + + Args: + output_key (str): key for prediction + query_per_second (int): The maximum queries allowed per second + between two consecutive calls of the API. Defaults to 1. + generation_kwargs: other params + retry (int): Number of retries if the API call fails. Defaults to 2.
+ """ + + def __init__( + self, + path: str, + token: str, + url: str, + meta_template: Optional[Dict] = None, + query_per_second: int = 1, + retry: int = 3, + generation_kwargs: Dict = {}, + max_seq_len=4096, + ): + super().__init__( + path=path, + max_seq_len=max_seq_len, + query_per_second=query_per_second, + meta_template=meta_template, + retry=retry, + generation_kwargs=generation_kwargs, + ) + + self.logger.info(f'Bailing API Model Init path: {path} url={url}') + if not token: + token = os.environ.get('BAILING_API_KEY') + if token: + self._headers = {'Authorization': f'Bearer {token}'} + else: + raise RuntimeError('There is not valid token.') + else: + self._headers = {'Authorization': f'Bearer {token}'} + + self._headers['Content-Type'] = 'application/json' + self._url = url if url else \ + 'https://bailingchat.alipay.com/chat/completions' + self._model = path + self._sessions = [] + self._num = (int(os.environ.get('BAILING_API_PARALLEL_NUM')) + if os.environ.get('BAILING_API_PARALLEL_NUM') else 1) + try: + for _ in range(self._num): + adapter = HTTPAdapterWithSocketOptions() + sess = requests.Session() + sess.mount('http://', adapter) + sess.mount('https://', adapter) + self._sessions.append(sess) + except Exception as e: + self.logger.error(f'Fail to setup the session. {e}') + raise e + + def generate( + self, + inputs: Union[List[str], PromptList], + max_out_len: int = 4096, + ) -> List[str]: + """Generate results given a list of inputs. + + Args: + inputs (Union[List[str], PromptList]): + A list of strings or PromptDicts. + The PromptDict should be organized in OpenCompass' API format. + max_out_len (int): The maximum length of the output. + + Returns: + List[str]: A list of generated strings. + """ + with concurrent.futures.ThreadPoolExecutor( + max_workers=self._num, ) as executor: + future_to_m = { + executor.submit( + self._generate, + self._sessions[i % self._num], + input, + max_out_len, + ): i + for i, input in enumerate(inputs) + } + results = [] + for future in concurrent.futures.as_completed(future_to_m): + m = future_to_m[future] # noqa F841 + resp = future.result() + if resp and resp.status_code == 200: + try: + result = resp.json() + except Exception as e: # noqa F841 + results.append('') + else: + if (result.get('choices') + and result['choices'][0].get('message') + and result['choices'][0]['message'].get( + 'content')): + results.append( + result['choices'][0]['message']['content']) + else: + results.append('') + self.flush() + return results + + def _generate( + self, + sess, + input: Union[str, PromptList], + max_out_len: int, + ) -> str: + """Generate results given an input. + + Args: + inputs (str or PromptList): A string or PromptDict. + The PromptDict should be organized in OpenCompass' API format. + max_out_len (int): The maximum length of the output. + + Returns: + str: The generated string. 
+ """ + if isinstance(input, str): + messages = [{'role': 'user', 'content': input}] + else: + messages = [] + for item in input: + content = item['prompt'] + if not content: + continue + message = {'content': content} + if item['role'] == 'HUMAN': + message['role'] = 'user' + elif item['role'] == 'BOT': + message['role'] = 'assistant' + elif item['role'] == 'SYSTEM': + message['role'] = 'system' + else: + message['role'] = item['role'] + messages.append(message) + request = { + 'model': + self._model, + 'messages': + messages, + 'max_seq_len': + max( + max_out_len if max_out_len else 4096, + self.max_seq_len if self.max_seq_len else 4096, + ), + } + request.update(self.generation_kwargs) + try: + retry_num = 0 + while retry_num < self.retry: + response = self._infer_result(request, sess) + if response.status_code == 200: + break # success + elif response.status_code == 426: + retry_num += 1 # retry + else: + raise ValueError(f'Status code = {response.status_code}') + else: + raise ValueError( + f'Exceed the maximal retry times. Last status code ' + f'= {response.status_code}') + except Exception as e: + self.logger.error(f'Fail to inference request={request}; ' + f'model_name={self.path}; error={e}, ' + f'stack:{traceback.format_exc()}') + raise e + return response + + # @retry(stop_max_attempt_number=3, wait_fixed=16000) # ms + def _infer_result(self, request, sess): + response = sess.request( + 'POST', + self._url, + json=request, + headers=self._headers, + timeout=500, + ) + return response diff --git a/opencompass/models/base_api.py b/opencompass/models/base_api.py index c88aa154c..13a8e956c 100644 --- a/opencompass/models/base_api.py +++ b/opencompass/models/base_api.py @@ -43,7 +43,8 @@ def __init__(self, retry: int = 2, max_seq_len: int = 2048, meta_template: Optional[Dict] = None, - generation_kwargs: Dict = dict()): + generation_kwargs: Dict = dict(), + verbose: bool = False): self.path = path self.max_seq_len = max_seq_len self.meta_template = meta_template @@ -53,6 +54,7 @@ def __init__(self, self.template_parser = APITemplateParser(meta_template) self.logger = get_logger() self.generation_kwargs = generation_kwargs + self.verbose = verbose @abstractmethod def generate(self, inputs: List[PromptType], @@ -281,6 +283,9 @@ def parse_template(self, prompt_template: PromptType, new_prompt.append(item) prompt = new_prompt + if self.meta_template.get('begin', None): + prompt.insert(0, self.meta_template['begin']) + else: # in case the model does not have any meta template prompt = '' diff --git a/opencompass/models/interntrain.py b/opencompass/models/interntrain.py new file mode 100644 index 000000000..e846aae2f --- /dev/null +++ b/opencompass/models/interntrain.py @@ -0,0 +1,478 @@ +import os +import random +import sys +import time +from typing import Dict, List, Optional, Union + +import numpy as np +import torch +import torch.distributed as dist + +from opencompass.models.base import BaseModel +from opencompass.registry import MODELS +from opencompass.utils.logging import get_logger + + +class InternTrainManager: + + def __init__(self, module_path): + self.module_path = module_path + + @staticmethod + def build(module_path): + sys.path.insert(0, module_path) + try: + from internlm.core.context.registry import \ + register_model_initializer # noqa: F401 + return CurrentInternTrainManager(module_path) + except ImportError: + return LegacyInternTrainManager(module_path) + + +class CurrentInternTrainManager(InternTrainManager): + + def load_config(self, path, model_config=None): + from 
internlm.config import Config + if model_config is None: + model_config = torch.load(os.path.join(path, 'model_config.pt')) + elif isinstance(model_config, dict): + model_config = Config(model_config) + elif isinstance(model_config, str): + model_config = Config.fromfile(model_config).model + else: + raise NotImplementedError( + 'model_config should be None, dict or filename.') + + return model_config + + def initialize_model(self): + from internlm.train.pipeline import (initialize_model, + initialize_parallel_communicator) + model = initialize_model().model + initialize_parallel_communicator(model) + + return model + + +class LegacyInternTrainManager(InternTrainManager): + + def load_config(self, path, model_config=None): + from internlm.core.context import Config + if model_config is None: + model_config = torch.load(os.path.join(path, 'model_config.pt')) + elif isinstance(model_config, dict): + model_config = Config(model_config) + elif isinstance(model_config, str): + model_config = Config.from_file(model_config).model + else: + raise NotImplementedError( + 'model_config should be None, dict or filename.') + + return model_config + + def initialize_model(self): + from internlm.train.pipeline import initialize_model + model = initialize_model().model + + return model + + +@MODELS.register_module() +class InternTrain(BaseModel): + """Model wrapper for InternTrain. + + Args: + path (str): The name or path to HuggingFace's model. + module_path (str): Path of InternTrain repository. + max_seq_len (int): The maximum length of the input sequence. Defaults + to 2048. + tokenizer_only (bool): If True, only the tokenizer will be initialized. + Defaults to False. + tokenizer_path (str): The path to the tokenizer. Defaults to None. + tokenizer_type: InternTrain's tokenizer type. Defaults to 'InternLM'. + model_config (str, dict, optional): Config of model. There are several + options for this parameter: + + - filename (str): The config items are defined in a python file + so the model will load configs from this file. + - config (dict): The configuration items are defined in a dict + and the model will be initialized from ```model_config```. + - None: The config is loaded from ```path```. In this case, + please make sure that ```path``` contains a config file named + ``model_config.pt``. + + Defaults to None. + model_type: Type of model. Defaults to 'InternTrain' + ckpt_type: The type of load function in InternTrain when checkpoints + are loaded. Defaults to None, which means load the checkpoint + directlywith pipeline merged. + meta_template (Dict, optional): The model's meta prompt + template if needed, in case the requirement of injecting or + wrapping of any meta instructions. + model_dtype: The model's dtype. If None, will use dtype defined in + ```model_config```. Defaults to None. + generation_kwargs (Dict, optional): The generation kwargs for the + model. Defaults to dict(). + sync_rank (bool): Whether to sync inputs between ranks. Do not use this + if you are not familiar with this behavior. Check `sync_inputs` + function for more details. Defaults to False. + mode (str, optional): The method of input truncation when input length + exceeds max_seq_len. 'mid' represents the part of input to + truncate. Defaults to 'none'. + end_str (str, optional): Whether to trim generated strings with end_str + if the model has special ending strings that are not handled well. + Defaults to None. 
+ """ + + def __init__(self, + path: str, + module_path: str, + max_seq_len: int = 2048, + tokenizer_only: bool = False, + tokenizer_path: Optional[str] = None, + tokenizer_type: str = 'INTERNLM', + model_config: Optional[Union[str, Dict]] = None, + model_type: str = 'INTERNLM2', + ckpt_type: Optional[str] = None, + meta_template: Optional[Dict] = None, + model_dtype: Optional[str] = None, + generation_kwargs={}, + sync_rank: bool = False, + mode='none', + end_str: Optional[str] = None): + super().__init__(path=path, + max_seq_len=max_seq_len, + tokenizer_only=tokenizer_only, + meta_template=meta_template, + sync_rank=sync_rank) + self.logger = get_logger() + # insert interntrain module + self.manager = InternTrainManager.build(module_path) + + # TODO: mode is not a good name, change it both here and huggingface.py + # mode = 'mid' is used only in longtext eval, which cut off tokens in + # the middle + # https://github.com/THUDM/LongBench + assert mode in ['none', 'mid'] + self.mode = mode + + self._load_tokenizer(tokenizer_path=tokenizer_path, + tokenizer_type=tokenizer_type) + + if not tokenizer_only: + self._load_model(path=path, + model_config=model_config, + model_type=model_type, + model_dtype=model_dtype, + ckpt_type=ckpt_type) + + # default generation_kwargs + assert generation_kwargs.pop('num_return_sequences', 1) == 1 # TODO + self.generation_kwargs = { + 'temperature': 1.0, + 'top_p': 1.0, + 'top_k': 50, + 'do_sample': False, + 'repetition_penalty': 1.0, + } + self.generation_kwargs.update(generation_kwargs) + self.logger.info(f'generation_kwargs: {self.generation_kwargs}') + + # generator + from internlm.apis.inference import SequenceGenerator + eos_token_ids = self.generation_kwargs.get('eos_token_id', []) + if isinstance(eos_token_ids, int): + eos_token_ids = [eos_token_ids] + eos_token_ids.append(self.tokenizer.eos_id) + if self.eos_token_id is not None: + eos_token_ids.append(self.eos_token_id) + eos_token_ids = list(set(eos_token_ids)) + self.generator = SequenceGenerator(self.model, + bos_token_id=self.tokenizer.bos_id, + pad_token_id=self.tokenizer.bos_id, + eos_token_id=eos_token_ids) + self.end_str = end_str + + def _load_model(self, + path: str, + model_config: Optional[str] = None, + model_type: str = 'INTERNLM2', + model_dtype: Optional[str] = None, + ckpt_type: Optional[str] = None): + # funcs + from internlm.checkpoint.load_funcs import (LOAD_FUNC_DICT, + merge_pp_within_tp) + from internlm.core.context import global_context as gpc + from internlm.initialize.launch import launch + from internlm.utils.storage_manager import (get_storage_manager, + init_storage_manager) + + # config + model_config = self.manager.load_config(path, model_config) + model_config['parallel_output'] = False + model_config['dtype'] = self._convert_dtype(model_config['dtype'], + model_dtype=model_dtype) + + world_size = int(os.getenv('WORLD_SIZE', '1')) + tp_size = world_size # TODO + self.logger.info(f'world size: {world_size} tp: {tp_size}') + parallel_config = dict(zero1=dict(size=1, fsdp=False), + pipeline=dict(size=1), + tensor=dict(size=tp_size, mode='mtp'), + sequence_parallel=False) + config = dict(model=model_config, + parallel=parallel_config, + data=dict(use_packed_dataset=False), + model_type=model_type, + use_cuda_flash_attn=model_config.get( + 'use_flash_attn', True)) + launch( + config=config, + seed=42, + local_rank=int(os.getenv('RANK', '0')), + rank=int(os.getenv('LOCAL_RANK', '0')), + world_size=int(os.getenv('WORLD_SIZE', '1')), + host=os.getenv('MASTER_ADDR', 
'127.0.0.1'), + port=int(os.getenv('MASTER_PORT', random.randint(12000, 32000))), + ) + self.logger.info(f'Config: {gpc.config}') + + self.model = self.manager.initialize_model() + + # load state dict + try: + get_storage_manager() + except AssertionError: + init_storage_manager(False, None, None) + get_storage_manager() + if ckpt_type is None or ckpt_type == 'internevo': + state_dict = merge_pp_within_tp(path, del_model_prefix=True) + load_info = self.model.load_state_dict(state_dict, strict=False) + self.logger.info(load_info) + else: + load_func = LOAD_FUNC_DICT[ckpt_type] + load_func(path, self.model) + + self.model.to(model_config['dtype']).eval().cuda() + + def _load_tokenizer(self, tokenizer_path: str, tokenizer_type: str): + from internlm.core.context.registry import TOKENIZER_INITIALIZER + tokenizer_cls = TOKENIZER_INITIALIZER.get_module(tokenizer_type) + self.tokenizer = tokenizer_cls( + model_path=tokenizer_path, + use_bos=True, + use_eos=False, + ) + + # TODO use bos as pad temporarily + if self.tokenizer.pad_id == -1: + self.pad_id = self.tokenizer.bos_id + else: + self.pad_id = self.tokenizer.pad_id + + def _convert_dtype(self, default_dtype, model_dtype=None): + if model_dtype is None: + return default_dtype + elif isinstance(model_dtype, torch.dtype): + return model_dtype + elif model_dtype == 'torch.bfloat16': + return torch.bfloat16 + elif model_dtype in ('torch.float16', 'torch.half'): + return torch.float16 + elif model_dtype in ('torch.float32', 'torch.float'): + return torch.float32 + elif model_dtype in ('torch.tf32'): + torch.backends.cudnn.allow_tf32 = True + torch.backends.cuda.matmul.allow_tf32 = True + return torch.float32 + else: + raise NotImplementedError(f'Unknown model dtype {model_dtype}') + + def get_token_len(self, prompt: str, use_bos=None, use_eos=None) -> int: + """Get lengths of the tokenized strings. + + Args: + prompt (str): Input string. + + Returns: + int: Length of the input tokens + """ + tokens = self.tokenizer(prompt, use_bos=use_bos, use_eos=use_eos) + return len(tokens) + + def generate(self, + inputs: List[str], + max_out_len: int, + min_out_len: Optional[int] = None, + stopping_criteria: List[str] = []) -> List[str]: + """Generate results given a list of inputs. + + Args: + inputs (List[str]): A list of strings. + max_out_len (int): The maximum length of the output. + + Returns: + List[str]: A list of generated strings. + """ + if min_out_len is None: + # keep same with InternTrain's default value + min_out_len = 1 + + tokens = self.batch_encode(inputs, + self.max_seq_len - max_out_len, + left_padding=True) + + # random seed for pass@k + seed = torch.tensor(time.time(), dtype=torch.int64).cuda() + + dist.broadcast(seed, src=0) + torch.cuda.manual_seed(seed.item()) + dist.barrier() + outputs = self.generator.generate( + tokens, + max_length=tokens.shape[1] + max_out_len, + **self.generation_kwargs) # bsz, num_return_sequences, max_length + outputs = outputs[:, 0, tokens.shape[1]:] + output_text = self.batch_decode( + outputs, + eos_token_ids=self.generator.eos_token_id, + stopping_criteria=stopping_criteria) + + return output_text + + def get_ppl(self, + input_texts: List[str], + mask_length: Optional[List[int]] = None) -> List[float]: + """Get perplexity scores given a list of inputs. + + Args: + input_texts (List[str]): A list of strings. + mask_length (Optional[List[int]]): A list of mask lengths. If + provided, the perplexity scores will be calculated with the + first mask_length[i] tokens masked out. 
+ + Returns: + List[float]: A list of perplexity scores. + """ + outputs, inputs = self.get_logits(input_texts) + + shift_logits = outputs[..., :-1, :].contiguous() + shift_labels = inputs[..., 1:].contiguous() + loss_fct = torch.nn.CrossEntropyLoss(reduction='none', + ignore_index=self.pad_id) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1)).view(shift_labels.size()) + + if mask_length is not None: + mask = torch.zeros_like(shift_labels) # [batch,seqlen] + for i in range(len(mask)): + for j in range(mask_length[i] - 1, len(mask[i])): + mask[i][j] = 1 + loss = loss * mask + + lens = (inputs != self.pad_id).sum(-1).cpu().numpy() + if mask_length is not None: + lens -= np.array(mask_length) + ce_loss = loss.float().sum(-1).cpu().detach().numpy() / lens + return ce_loss + + def get_loglikelihood(self, input_texts: List[str], + conts: List[str]) -> List[float]: + outputs, inputs = self.get_logits(input_texts) + shift_logits = outputs[..., :-1, :].contiguous() + shift_labels = inputs[..., 1:].contiguous() + loss_fct = torch.nn.CrossEntropyLoss(reduction='none', + ignore_index=self.pad_id) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1)).view(shift_labels.size()) + lens = (inputs != self.pad_id).sum(-1).cpu().numpy() + replaced_texts = [ + input_text.replace(cont, '') + for input_text, cont in zip(input_texts, conts) + ] + replaced_lens = [ + self.get_token_len(input_text) for input_text in replaced_texts + ] + loglikelihoods = [] + for nloss, nlen, rlen in zip(loss, lens, replaced_lens): + nlen, rlen = int(nlen), int(rlen) + nloss = nloss[:nlen] + nloss = nloss[rlen:].float().sum().cpu().detach().numpy() + loglikelihoods.append(-nloss) + return np.array(loglikelihoods) + + def get_mink_percent(self, + input_texts: List[str], + k: int = 20) -> List[float]: + """https://swj0419.github.io/detect-pretrain.github.io/""" + outputs, inputs = self.get_logits(input_texts) + shift_logits = outputs[..., :-1, :].contiguous() + shift_labels = inputs[..., 1:].contiguous() + loss_fct = torch.nn.CrossEntropyLoss(reduction='none', + ignore_index=self.pad_id) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1)).view(shift_labels.size()) + lens = (inputs != self.pad_id).sum(-1).cpu().numpy() + mink_percent = [] + for nloss, nlen in zip(loss, lens): + nlen = int(nlen) + minklen = max(nlen * k // 100, 1) + nloss = torch.topk(loss[-nlen:], minklen, dim=-1)[0] + nloss = -nloss.float().mean().cpu().detach().numpy() + mink_percent.append(nloss) + return np.array(mink_percent) + + def get_logits(self, input_texts: Union[str, List[str]]): + tokens = self.batch_encode(input_texts, max_seq_len=self.max_seq_len) + outputs = self.model(input_ids=tokens) + if isinstance(outputs, tuple): + # moe returns (hidden_states, moe_losses) + outputs = outputs[0] + return outputs, tokens + + def batch_encode(self, + input_texts: Union[str, List[str]], + max_seq_len: int, + left_padding=False): + if isinstance(input_texts, str): + input_texts = [input_texts] + tokens = [self.tokenizer(text) for text in input_texts] + max_len = min(max_seq_len, max([len(t) for t in tokens])) + for i in range(len(tokens)): + cur_input = tokens[i] + padding_len = max_len - len(cur_input) + if self.mode == 'none': + cur_input = cur_input[:max_len] + elif self.mode == 'mid' and len(cur_input) > max_len: + mid_cut_len = max_len // 2 + cur_input = cur_input[:mid_cut_len] + cur_input[-mid_cut_len:] + + if left_padding: + # left padding with bos + 
tokens[i] = [self.tokenizer.bos_id] * padding_len + cur_input + else: + tokens[i] = cur_input + [self.pad_id] * padding_len + + return torch.LongTensor(tokens).cuda() + + def batch_decode(self, + outputs, + eos_token_ids: List[int], + stopping_criteria: List[str] = []): + # outputs: bsz, seq_len + output_text = [] + outputs = outputs.tolist() + for output in outputs: + # cut off by eos_token_ids + eos_idx = len(output) + for eos_id in eos_token_ids: + if eos_id in output: + eos_idx = min(output.index(eos_id), eos_idx) + text = self.tokenizer.decode(output[:eos_idx]) + if self.end_str is not None: + text = text.split(self.end_str)[0] + for stop_word in stopping_criteria: + text = text.split(stop_word)[0] + output_text.append(text) + + return output_text diff --git a/opencompass/models/lmdeploy_pytorch.py b/opencompass/models/lmdeploy_pytorch.py deleted file mode 100644 index 80924c276..000000000 --- a/opencompass/models/lmdeploy_pytorch.py +++ /dev/null @@ -1,188 +0,0 @@ -from concurrent.futures import ThreadPoolExecutor -from typing import Dict, List, Optional, Union - -from opencompass.models.base import BaseModel -from opencompass.utils.logging import get_logger -from opencompass.utils.prompt import PromptList - -PromptType = Union[PromptList, str] - - -def valid_str(string, coding='utf-8'): - """decode text according to its encoding type.""" - invalid_chars = [b'\xef\xbf\xbd'] - bstr = bytes(string, coding) - for invalid_char in invalid_chars: - bstr = bstr.replace(invalid_char, b'') - ret = bstr.decode(encoding=coding, errors='ignore') - return ret - - -class LmdeployPytorchModel(BaseModel): - """Model wrapper for lmdeploy pytorch engine through python API. - - Args: - path (str): path of the supported pytorch model. - max_seq_len (int): The maximum allowed sequence length of a model. - Note that the length of prompt + generated tokens shall not exceed - this value. Defaults to 2048. - meta_template (Dict, optional): The model's meta prompt - template if needed, in case the requirement of injecting or - wrapping of any meta instructions. - engine_config (Dict, optional): The engine config to set - arguments like session_len, max_batch_size for TurboMind. - gen_config (Dict, optional): Generation config to set - arguments like top_k, top_p, temperature. - end_str (str, optional): Whether to trim generated strings with end_str - if the model has special ending strings that are not handled well. - Defaults to None. 
- """ - - def __init__(self, - path: str, - concurrency: int = 8, - max_seq_len: int = 2048, - meta_template: Optional[Dict] = None, - engine_config: Optional[Dict] = None, - gen_config: Optional[Dict] = None, - end_str: Optional[str] = None): - super().__init__(path=path, - max_seq_len=max_seq_len, - meta_template=meta_template) - from lmdeploy.pytorch import engine as tm - from lmdeploy.version import version_info - - if engine_config is not None: - from lmdeploy.messages import PytorchEngineConfig - engine_config = PytorchEngineConfig(**engine_config) - # set thread_safe - if hasattr(engine_config, 'thread_safe'): - engine_config.thread_safe = True - - if gen_config is not None: - from lmdeploy.messages import GenerationConfig - gen_config = GenerationConfig(**gen_config) - - self.logger = get_logger() - tm_model = tm.Engine(path, engine_config) - self.tokenizer = tm_model.tokenizer - self.generators = [ - tm_model.create_instance() for i in range(concurrency) - ] - self.generator_ids = [i + 1 for i in range(concurrency)] - - from transformers import GenerationConfig - try: - generation_config = GenerationConfig.from_pretrained(path) - except Exception: - generation_config = None - if generation_config and hasattr(generation_config, 'eos_token_id'): - if gen_config.stop_words is None: - stop_words = [] - if isinstance(generation_config.eos_token_id, int): - stop_words.append(generation_config.eos_token_id) - else: - assert isinstance(generation_config.eos_token_id, list) - for token_id in generation_config.eos_token_id: - stop_words.append(token_id) - gen_config.stop_words = stop_words - if version_info >= (0, 6, 0): - gen_config.stop_token_ids = stop_words - self.gen_config = gen_config - self.end_str = end_str - self.major_version, self.minor_version = version_info[:2] - - def generate( - self, - inputs: List[str], - max_out_len: int = 512, - ) -> List[str]: - """Generate results given a list of inputs. - - Args: - inputs (List[str]): A list of prompts - max_out_len (int): The maximum length of the output. - - Returns: - List[str]: A list of generated strings. - """ - assert isinstance( - inputs, List), f'List(str) is expected, but got {type(inputs)}' - - # split inputs into batches - batch_size = len(self.generators) - batch_inputs = [ - inputs[i:i + batch_size] for i in range(0, len(inputs), batch_size) - ] - - results = [] - for batch_input in batch_inputs: - with ThreadPoolExecutor() as executor: - _results = list( - executor.map( - self._generate, - self.generators[:len(batch_input)], - self.generator_ids[:len(batch_input)], - batch_input, - [self.gen_config] * len(batch_input), - [self.end_str] * len(batch_input), - )) - results += _results - return results - - def get_token_len(self, prompt: str) -> int: - input_ids = self.tokenizer.encode(prompt) - return len(input_ids) - - def wait(self): - """Wait till the next query can be sent. - - Applicable in both single-thread and multi-thread environments. - """ - return self.token_bucket.get_token() - - def _generate(self, - generator, - session_id, - prompt: PromptType, - gen_config=None, - end_str: Optional[str] = None) -> str: - """Generate results given a list of inputs. - - Args: - prompt (PromptType): A string or PromptDict. - The PromptDict should be organized in OpenCompass' - API format. - gen_config (GenerationConfig, optional): Generation - config to set arguments like top_k, top_p, temperature. 
- end_str (str, optional): Whether to trim generated strings - with end_str if the model has special ending strings - that are not handled well. - Defaults to None. - Returns: - str: The generated string. - """ - assert type( - prompt) is str, 'We only support string for TurboMind Python API' - input_ids = self.tokenizer.encode(prompt) - if self.major_version >= 0 and self.minor_version >= 4: - outputs = generator.infer(session_id, - input_ids, - gen_config=gen_config) - output_ids = outputs.token_ids - else: - _, output_ids, _ = generator.infer(session_id, - input_ids, - gen_config=gen_config) - - # stop engine - if hasattr(generator, 'end'): - generator.end(session_id) - # decode output - response_all = self.tokenizer.decode(output_ids) - # trim output - if end_str: - response_all = response_all.split(end_str)[0] - # remove invalid characters - response_all = valid_str(response_all) - return response_all diff --git a/opencompass/models/lmdeploy_tis.py b/opencompass/models/lmdeploy_tis.py deleted file mode 100644 index 9c92ef18a..000000000 --- a/opencompass/models/lmdeploy_tis.py +++ /dev/null @@ -1,200 +0,0 @@ -import threading -from concurrent.futures import ThreadPoolExecutor -from functools import partial -from queue import Queue -from typing import Dict, List, Optional, Union - -import numpy as np - -from opencompass.models.base import BaseModel, LMTemplateParser -from opencompass.utils.logging import get_logger -from opencompass.utils.prompt import PromptList - -PromptType = Union[PromptList, str] - - -def valid_str(string, coding='utf-8'): - """decode text according to its encoding type.""" - invalid_chars = [b'\xef\xbf\xbd'] - bstr = bytes(string, coding) - for invalid_char in invalid_chars: - bstr = bstr.replace(invalid_char, b'') - ret = bstr.decode(encoding=coding, errors='ignore') - return ret - - -def prepare_tensor(name, input_tensor): - """Create grpcclient's InferInput instance according to a given tensor.""" - import tritonclient.grpc as grpcclient - from tritonclient.utils import np_to_triton_dtype - t = grpcclient.InferInput(name, list(input_tensor.shape), - np_to_triton_dtype(input_tensor.dtype)) - t.set_data_from_numpy(input_tensor) - return t - - -def stream_callback(que, result, error): - """callback function invoked by triton client.""" - que.put((result, error)) - - -class LmdeployTisModel(BaseModel): - """Model wrapper for LMDeploy Python Backend Triton Inference Server gRPC - API. - - Args: - path (str): The name of OpenAI's model. - tis_addr (str): The address (ip:port format) of turbomind's - triton inference server - max_seq_len (int): The maximum allowed sequence length of a model. - Note that the length of prompt + generated tokens shall not exceed - this value. Defaults to 2048. - meta_template (Dict, optional): The model's meta prompt - template if needed, in case the requirement of injecting or - wrapping of any meta instructions. 
- """ - - is_api: bool = True - - def __init__(self, - path: str, - tis_addr: str = '0.0.0.0:33337', - max_seq_len: int = 2048, - meta_template: Optional[Dict] = None, - end_str: Optional[str] = None): - super().__init__(path=path, - max_seq_len=max_seq_len, - meta_template=meta_template) - from lmdeploy.tokenizer import Tokenizer - - self.logger = get_logger() - self.template_parser = LMTemplateParser(meta_template) - self.eos_token_id = None - if meta_template and 'eos_token_id' in meta_template: - self.eos_token_id = meta_template['eos_token_id'] - self.tis_addr = tis_addr - self.tokenizer = Tokenizer(path) - self.end_str = end_str - - def generate( - self, - inputs: List[str or PromptList], - max_out_len: int = 512, - temperature: float = 1.0, - ) -> List[str]: - """Generate results given a list of inputs. - - Args: - inputs (List[str or PromptList]): A list of strings or PromptDicts. - The PromptDict should be organized in OpenCompass' - API format. - max_out_len (int): The maximum length of the output. - temperature (float): What sampling temperature to use, - between 0 and 2. Higher values like 0.8 will make the output - more random, while lower values like 0.2 will make it more - focused and deterministic. Defaults to 0.7. - - Returns: - List[str]: A list of generated strings. - """ - - with ThreadPoolExecutor() as executor: - results = list( - executor.map(self._generate, inputs, - [max_out_len] * len(inputs), - [temperature] * len(inputs), - [self.end_str] * len(inputs))) - return results - - def wait(self): - """Wait till the next query can be sent. - - Applicable in both single-thread and multi-thread environments. - """ - return self.token_bucket.get_token() - - def get_token_len(self, prompt: str) -> int: - input_ids = self.tokenizer.encode(prompt) - return len(input_ids) - - def _call_triton_server(self, prompt, tis_addr, session_id, - request_output_len, temperature, res_que): - import tritonclient.grpc as grpcclient - - with grpcclient.InferenceServerClient(tis_addr) as client: - inputs = [ - prepare_tensor('prompt', - np.array([prompt.encode()], dtype=np.object_)), - prepare_tensor('max_tokens', - np.array([request_output_len], dtype=np.int32)), - prepare_tensor('temperature', - np.array([temperature], dtype=np.float_)), - prepare_tensor('top_p', np.array([1.0], dtype=np.float_)), - prepare_tensor('top_k', np.array([1], dtype=np.int32)), - prepare_tensor('ignore_eos', np.array([False], - dtype=np.bool_)), - prepare_tensor('stream', np.array([True], dtype=np.bool_)), - ] - - # async_stream - client.start_stream(partial(stream_callback, res_que)) - client.async_stream_infer('lmdeploy_model', - inputs, - sequence_id=session_id, - sequence_start=True, - sequence_end=True) - - res_que.put(None) - return - - def _process_result(self, que): - text = '' - while True: - res = que.get() - if res is not None: - result, err = res - if err is not None: - print(err) - else: - res = result.as_numpy('response').item().decode() - text += res - else: - return text - - def _generate(self, - prompt: str or PromptList, - max_out_len: int, - temperature: float, - end_str: Optional[str] = None) -> str: - """Generate results given a list of inputs. - - Args: - prompt (str or PromptList): A string or PromptDict. - The PromptDict should be organized in OpenCompass' - API format. - max_out_len (int): The maximum length of the output. - temperature (float): What sampling temperature to use, - between 0 and 2. 
Higher values like 0.8 will make the output - more random, while lower values like 0.2 will make it more - focused and deterministic. - - Returns: - str: The generated string. - """ - assert type( - prompt - ) is str, 'We only support string for LMDeploy Python Backend TIS API' - - res_que = Queue() - - self._call_triton_server(prompt=prompt, - tis_addr=self.tis_addr, - session_id=threading.currentThread().ident, - request_output_len=max_out_len, - temperature=temperature, - res_que=res_que) - text = self._process_result(res_que) - response = valid_str(text) - if end_str: - response = response.split(end_str)[0] - return response diff --git a/opencompass/models/openai_api.py b/opencompass/models/openai_api.py index f572a846f..aff2579a6 100644 --- a/opencompass/models/openai_api.py +++ b/opencompass/models/openai_api.py @@ -20,6 +20,13 @@ os.environ.get('OPENAI_BASE_URL', 'https://api.openai.com/v1/'), 'chat/completions') +O1_MODEL_LIST = [ + 'o1-preview-2024-09-12', + 'o1-mini-2024-09-12', + 'o1-preview', + 'o1-mini', +] + @MODELS.register_module() class OpenAI(BaseAPIModel): @@ -82,14 +89,17 @@ def __init__(self, top_logprobs: Optional[int] = None, temperature: Optional[float] = None, tokenizer_path: Optional[str] = None, - extra_body: Optional[Dict] = None): + extra_body: Optional[Dict] = None, + max_completion_tokens: int = 16384, + verbose: bool = False): super().__init__(path=path, max_seq_len=max_seq_len, meta_template=meta_template, query_per_second=query_per_second, rpm_verbose=rpm_verbose, - retry=retry) + retry=retry, + verbose=verbose) import tiktoken self.tiktoken = tiktoken self.temperature = temperature @@ -131,6 +141,9 @@ def __init__(self, self.proxy_url = openai_proxy_url self.path = path + self.max_completion_tokens = max_completion_tokens + self.logger.warning( + f'Max Completion tokens for {path} is :{max_completion_tokens}') def generate(self, inputs: List[PromptType], @@ -255,16 +268,33 @@ def _generate(self, input: PromptType, max_out_len: int, header['OpenAI-Organization'] = self.orgs[self.org_ctr] try: - data = dict( - model=self.path, - messages=messages, - max_tokens=max_out_len, - n=1, - logprobs=self.logprobs, - top_logprobs=self.top_logprobs, - stop=None, - temperature=temperature, - ) + if self.path in O1_MODEL_LIST: + self.logger.warning( + f"'max_token' is unsupported for model {self.path}") + self.logger.warning( + f'We use max_completion_tokens:' + f'{self.max_completion_tokens}for this query') + data = dict( + model=self.path, + messages=messages, + max_completion_tokens=self.max_completion_tokens, + n=1, + logprobs=self.logprobs, + top_logprobs=self.top_logprobs, + stop=None, + temperature=temperature, + ) + else: + data = dict( + model=self.path, + messages=messages, + max_tokens=max_out_len, + n=1, + logprobs=self.logprobs, + top_logprobs=self.top_logprobs, + stop=None, + temperature=temperature, + ) if self.extra_body: data.update(self.extra_body) if isinstance(self.url, list): @@ -282,7 +312,9 @@ def _generate(self, input: PromptType, max_out_len: int, 'http': self.proxy_url, 'https': self.proxy_url, } - + if self.verbose: + self.logger.debug( + f'Start send query to {self.proxy_url}') raw_response = requests.post( url, headers=header, @@ -290,6 +322,10 @@ def _generate(self, input: PromptType, max_out_len: int, proxies=proxies, ) + if self.verbose: + self.logger.debug( + f'Get response from {self.proxy_url}') + except requests.ConnectionError: self.logger.error('Got connection error, retrying...') continue @@ -343,27 +379,44 @@ def 
get_token_len(self, prompt: str) -> int: """ assert self.tokenizer_path or self.path try: + if self.verbose: + self.logger.info(f'Used tokenizer_path: {self.tokenizer_path}') tokenizer_path = self.tokenizer_path if self.tokenizer_path \ else self.path try: + if self.verbose: + self.logger.info( + f'Start load tiktoken encoding: {tokenizer_path}') enc = self.tiktoken.encoding_for_model(tokenizer_path) + if self.verbose: + self.logger.info( + f'Successfully tiktoken encoding: {tokenizer_path}') return len(enc.encode(prompt)) except Exception as e: self.logger.warn(f'{e}, tiktoken encoding cannot load ' f'{tokenizer_path}') from transformers import AutoTokenizer if self.hf_tokenizer is None: + if self.verbose: + self.logger.info( + f'Start load hf tokenizer: {tokenizer_path}') self.hf_tokenizer = AutoTokenizer.from_pretrained( tokenizer_path, trust_remote_code=True) self.logger.info( - f'Tokenizer is loaded from {tokenizer_path}') + f'Successfully load HF Tokenizer from {tokenizer_path}' + ) return len(self.hf_tokenizer(prompt).input_ids) except Exception: self.logger.warn( 'Can not get tokenizer automatically, ' 'will use default tokenizer gpt-4 for length calculation.') default_tokenizer = 'gpt-4' + enc = self.tiktoken.encoding_for_model(default_tokenizer) + if self.verbose: + self.logger.info( + f'Successfully load default tiktoken tokenizer: ' + f' {default_tokenizer}') return len(enc.encode(prompt)) def bin_trim(self, prompt: str, num_token: int) -> str: @@ -429,11 +482,27 @@ def __init__(self, top_logprobs: int | None = None, temperature: float | None = None, tokenizer_path: str | None = None, - extra_body: Dict | None = None): - super().__init__(path, max_seq_len, query_per_second, rpm_verbose, - retry, key, org, meta_template, openai_api_base, - openai_proxy_url, mode, logprobs, top_logprobs, - temperature, tokenizer_path, extra_body) + extra_body: Dict | None = None, + max_completion_tokens: int = 16384, + verbose: bool = False): + super().__init__(path, + max_seq_len, + query_per_second, + rpm_verbose, + retry, + key, + org, + meta_template, + openai_api_base, + openai_proxy_url, + mode, + logprobs, + top_logprobs, + temperature, + tokenizer_path, + extra_body, + verbose=verbose, + max_completion_tokens=max_completion_tokens) from openai import OpenAI if self.proxy_url is None: @@ -448,6 +517,8 @@ def __init__(self, base_url=openai_api_base, api_key=key, http_client=httpx.Client(proxies=proxies)) + if self.verbose: + self.logger.info(f'Used openai_client: {self.openai_client}') def _generate(self, input: PromptList | str, max_out_len: int, temperature: float) -> str: @@ -497,8 +568,23 @@ def _generate(self, input: PromptList | str, max_out_len: int, num_retries = 0 while num_retries < self.retry: self.wait() - try: - responses = self.openai_client.chat.completions.create( + + if self.path in O1_MODEL_LIST: + self.logger.warning( + f"'max_token' is unsupported for model {self.path}") + self.logger.warning( + f'We use max_completion_tokens:' + f'{self.max_completion_tokens}for this query') + query_data = dict( + model=self.path, + max_completion_tokens=self.max_completion_tokens, + n=1, + temperature=self.temperature, + messages=messages, + extra_body=self.extra_body, + ) + else: + query_data = dict( model=self.path, max_tokens=max_out_len, n=1, @@ -506,6 +592,19 @@ def _generate(self, input: PromptList | str, max_out_len: int, messages=messages, extra_body=self.extra_body, ) + + try: + if self.verbose: + self.logger.info('Start calling OpenAI API') + responses = 
self.openai_client.chat.completions.create( + **query_data) + if self.verbose: + self.logger.info( + 'Successfully get response from OpenAI API') + try: + self.logger.info(responses) + except Exception as e: # noqa F841 + pass return responses.choices[0].message.content except Exception as e: self.logger.error(e) diff --git a/opencompass/models/turbomind.py b/opencompass/models/turbomind.py index e6cfebd29..687fef0dc 100644 --- a/opencompass/models/turbomind.py +++ b/opencompass/models/turbomind.py @@ -1,6 +1,4 @@ import copy -import os -from concurrent.futures import ThreadPoolExecutor from typing import Dict, List, Optional, Union import numpy as np @@ -9,6 +7,8 @@ from opencompass.utils.logging import get_logger from opencompass.utils.prompt import PromptList +from .huggingface_above_v4_33 import _get_possible_max_seq_len + PromptType = Union[PromptList, str] @@ -27,7 +27,9 @@ class TurboMindModel(BaseModel): Args: path (str): path of the turbomind model - concurrency (int): the maximum allowed concurrency of turbomind. + backend (str): The infernce backend, which can be either 'turbomind' or + 'pytorch'. It will fallback to 'pytorch' once the model is not + supported by 'turbomind' max_seq_len (int): The maximum allowed sequence length of a model. Note that the length of prompt + generated tokens shall not exceed this value. Defaults to 2048. @@ -45,32 +47,30 @@ class TurboMindModel(BaseModel): def __init__(self, path: str, - concurrency: int = 8, + backend: str = 'turbomind', max_seq_len: int = 2048, meta_template: Optional[Dict] = None, engine_config: Dict = {}, gen_config: Dict = {}, + batch_padding: bool = False, end_str: Optional[str] = None): super().__init__(path=path, max_seq_len=max_seq_len, meta_template=meta_template) - from lmdeploy.turbomind import TurboMind - from lmdeploy.version import version_info - - if engine_config is not None: - from lmdeploy.messages import TurbomindEngineConfig - engine_config = TurbomindEngineConfig(**engine_config) self.logger = get_logger() - if path.startswith('/') or path.startswith('.'): - assert os.path.exists(path), '{} is not existist'.format(path) - tm_model = TurboMind.from_pretrained(path, engine_config=engine_config) - self.tokenizer = tm_model.tokenizer - self.generators = [ - tm_model.create_instance() for i in range(concurrency) - ] - self.generator_ids = [i + 1 for i in range(concurrency)] + self.max_seq_len = _get_possible_max_seq_len(max_seq_len, path) + from lmdeploy import version_info + from transformers import AutoTokenizer + self.version_info = version_info + self.tokenizer = AutoTokenizer.from_pretrained(path, + trust_remote_code=True) + + DEFAULT_ENGING_CONFIG = {'session_len': self.max_seq_len} + _engine_config = DEFAULT_ENGING_CONFIG.copy() + _engine_config.update(engine_config) + self.pipe = self._build_pipe(path, backend, _engine_config) self.gen_config = gen_config - self.major_version, self.minor_version, _ = version_info + self.batch_padding = batch_padding self.end_str = end_str def generate(self, @@ -92,47 +92,39 @@ def generate(self, assert isinstance( inputs, List), f'List(str) is expected, but got {type(inputs)}' - # split inputs into batches - batch_size = len(self.generators) - batch_inputs = [ - inputs[i:i + batch_size] for i in range(0, len(inputs), batch_size) - ] - - gen_config = copy.deepcopy(self.gen_config) - if do_sample is not None: - if do_sample: - gen_config['top_k'] = 1000 - gen_config['temperature'] = temperature + stop_words = list(set(stopping_criteria)) + + DEFAULT_GEN_CONFIG = { + 
'max_new_tokens': max_out_len, + 'min_new_tokens': 1, + 'stop_words': stop_words, + } + + gen_config = copy.deepcopy(DEFAULT_GEN_CONFIG) + gen_config.update(self.gen_config) + if do_sample: + gen_config['top_k'] = 40 + gen_config['temperature'] = temperature + else: + if self.version_info >= (0, 6, 0): + gen_config['do_sample'] = False else: gen_config['top_k'] = 1 - if stopping_criteria: - stop_words = gen_config.get('stop_words', []) - for t in stopping_criteria: - t = self.tokenizer.encode(t, add_bos=False) - stop_words.append(t[0]) - gen_config['stop_words'] = list(set(stop_words)) - gen_config.setdefault('min_new_tokens', 1) - - from lmdeploy.messages import GenerationConfig + + from lmdeploy import GenerationConfig + gen_config = { + k: v + for k, v in gen_config.items() if hasattr(GenerationConfig, k) + } gen_config = GenerationConfig(**gen_config) results = [] - for batch_input in batch_inputs: - with ThreadPoolExecutor() as executor: - _results = list( - executor.map( - self._generate, - self.generators[:len(batch_input)], - self.generator_ids[:len(batch_input)], - batch_input, - [max_out_len] * len(batch_input), - [gen_config] * len(batch_input), - [self.end_str] * len(batch_input), - )) - results += _results - if stopping_criteria: - for s in stopping_criteria: - results = [r.split(s)[0] for r in results] + outputs = self.pipe(inputs, gen_config=gen_config, do_preprocess=False) + for output in outputs: + text = self.tokenizer.decode(output.token_ids) + results.append(text) + for s in stop_words: + results = [r.split(s)[0] for r in results] return results def get_token_len(self, prompt: str) -> int: @@ -146,56 +138,9 @@ def wait(self): """ return self.token_bucket.get_token() - def _generate(self, - generator, - session_id, - prompt: PromptType, - max_out_len: int, - gen_config=None, - end_str: Optional[str] = None) -> str: - """Generate results given a list of inputs. - - Args: - prompt (PromptType): A string or PromptDict. - The PromptDict should be organized in OpenCompass' - API format. - max_out_len (int): The maximum length of the output. - gen_config (GenerationConfig, optional): Generation - config to set arguments like top_k, top_p, temperature. - end_str (str, optional): Whether to trim generated strings - with end_str if the model has special ending strings - that are not handled well. - Defaults to None. - Returns: - str: The generated string. - """ - assert type( - prompt) is str, 'We only support string for TurboMind Python API' - - input_ids = self.tokenizer.encode(prompt) - - for outputs in generator.stream_infer(session_id=session_id, - input_ids=[input_ids], - gen_config=gen_config, - request_output_len=max_out_len, - sequence_start=True, - sequence_end=True, - step=0, - stream_output=False): - if self.major_version >= 0 and self.minor_version >= 4: - output_ids = outputs.token_ids - else: - _, output_ids, _ = outputs - response = self.tokenizer.decode(output_ids) - response = valid_str(response) - # used to trim - if end_str: - response = response.split(end_str)[0] - return response - def get_ppl(self, inputs: List[str], - mask_length: Optional[List[int]] = None) -> List[float]: + mask_length: Optional[List[int]] = None) -> np.ndarray: """Get perplexity scores given a list of inputs. 
Args: @@ -212,11 +157,28 @@ def get_ppl(self, assert isinstance( inputs, List), f'List(str) is expected, but got {type(inputs)}' results = [] - for text in inputs: - input_ids = self.tokenizer.encode(text) - res = self.generators[0].get_ppl(input_ids) - results.append(res) - results = np.concatenate(results) + if self.version_info <= (0, 6, 0): + for text in inputs: + input_ids = self.tokenizer.encode(text) + res = self.pipe.get_ppl(input_ids) + results.append(res) + results = np.concatenate(results) + else: + if self.batch_padding and len(inputs) > 1: + assert self.tokenizer.pad_token + input_ids = self.tokenizer( + inputs, + padding=True, + truncation=True, + max_length=self.max_seq_len)['input_ids'] + else: + input_ids = [ + self.tokenizer(text)['input_ids'] for text in inputs + ] + for i in range(0, len(input_ids), 128): + results.append(self.pipe.get_ppl(input_ids[i:i + 128])) + results = np.concatenate(results) + return results def get_loglikelihood( @@ -229,11 +191,36 @@ def get_loglikelihood( results = [] for text, cont in zip(inputs, conts): input_ids = self.tokenizer.encode(text) - res = self.generators[0].get_ppl(input_ids) + res = self.pipe.get_ppl(input_ids) logit_sum = res * len(input_ids) input_ids = self.tokenizer.encode(text.replace(cont, '')) - res = self.generators[0].get_ppl(input_ids) + res = self.pipe.get_ppl(input_ids) logit_part = res * len(input_ids) results.append(-(logit_sum - logit_part)) results = np.concatenate(results) return results + + def _build_pipe(self, model_path, backend, engine_config): + assert backend in ['pytorch', 'turbomind'], \ + f'unsupported backend type: {backend}' + + from lmdeploy import (PytorchEngineConfig, TurbomindEngineConfig, + pipeline) + if backend == 'turbomind': + filtered = { + k: v + for k, v in engine_config.items() + if hasattr(TurbomindEngineConfig, k) + } + backend_config = TurbomindEngineConfig(**filtered) + else: + filtered = { + k: v + for k, v in engine_config.items() + if hasattr(PytorchEngineConfig, k) + } + backend_config = PytorchEngineConfig(**filtered) + return pipeline(model_path, + backend_config=backend_config, + log_level='INFO', + max_log_len=10) diff --git a/opencompass/models/turbomind_tis.py b/opencompass/models/turbomind_tis.py deleted file mode 100644 index 8541b9de5..000000000 --- a/opencompass/models/turbomind_tis.py +++ /dev/null @@ -1,135 +0,0 @@ -import logging -import threading -from concurrent.futures import ThreadPoolExecutor -from typing import Dict, List, Optional, Union - -from opencompass.models.base import BaseModel, LMTemplateParser -from opencompass.utils.logging import get_logger -from opencompass.utils.prompt import PromptList - -PromptType = Union[PromptList, str] - - -def valid_str(string, coding='utf-8'): - """decode text according to its encoding type.""" - invalid_chars = [b'\xef\xbf\xbd'] - bstr = bytes(string, coding) - for invalid_char in invalid_chars: - bstr = bstr.replace(invalid_char, b'') - ret = bstr.decode(encoding=coding, errors='ignore') - return ret - - -class TurboMindTisModel(BaseModel): - """Model wrapper for TurboMind Triton Inference Server gRPC API. - - Args: - path (str): The name of OpenAI's model. - tis_addr (str): The address (ip:port format) of turbomind's - triton inference server - max_seq_len (int): The maximum allowed sequence length of a model. - Note that the length of prompt + generated tokens shall not exceed - this value. Defaults to 2048. 
- meta_template (Dict, optional): The model's meta prompt - template if needed, in case the requirement of injecting or - wrapping of any meta instructions. - """ - - is_api: bool = True - - def __init__( - self, - path: str, - tis_addr: str = '0.0.0.0:33337', - max_seq_len: int = 2048, - meta_template: Optional[Dict] = None, - ): - super().__init__(path=path, - max_seq_len=max_seq_len, - meta_template=meta_template) - from lmdeploy.serve.turbomind.utils import Preprocessor - self.preprocess = Preprocessor(tis_addr) - self.logger = get_logger() - self.template_parser = LMTemplateParser(meta_template) - self.eos_token_id = None - if meta_template and 'eos_token_id' in meta_template: - self.eos_token_id = meta_template['eos_token_id'] - self.tis_addr = tis_addr - - def generate( - self, - inputs: List[PromptType], - max_out_len: int = 512, - temperature: float = 1.0, - ) -> List[str]: - """Generate results given a list of inputs. - - Args: - inputs (List[PromptType]): A list of strings or PromptDicts. - The PromptDict should be organized in OpenCompass' - API format. - max_out_len (int): The maximum length of the output. - temperature (float): What sampling temperature to use, - between 0 and 2. Higher values like 0.8 will make the output - more random, while lower values like 0.2 will make it more - focused and deterministic. Defaults to 0.7. - - Returns: - List[str]: A list of generated strings. - """ - - with ThreadPoolExecutor() as executor: - results = list( - executor.map(self._generate, inputs, - [max_out_len] * len(inputs), - [temperature] * len(inputs))) - return results - - def get_token_len(self, prompt: str) -> int: - input_ids, _ = self.preprocess(prompt) - return input_ids.shape[-1] - - def wait(self): - """Wait till the next query can be sent. - - Applicable in both single-thread and multi-thread environments. - """ - return self.token_bucket.get_token() - - def _generate(self, prompt: PromptType, max_out_len: int, - temperature: float) -> str: - """Generate results given a list of inputs. - - Args: - prompt (PromptType): A string or PromptDict. - The PromptDict should be organized in OpenCompass' - API format. - max_out_len (int): The maximum length of the output. - temperature (float): What sampling temperature to use, - between 0 and 2. Higher values like 0.8 will make the output - more random, while lower values like 0.2 will make it more - focused and deterministic. - - Returns: - str: The generated string. 
- """ - assert type( - prompt) is str, 'We only support string for TurboMind RPC API' - - from lmdeploy.serve.turbomind.chatbot import Chatbot - chatbot = Chatbot(self.tis_addr, - temperature=temperature, - capability='completion', - top_k=1, - log_level=logging.ERROR) - - for status, text, n_token in chatbot.stream_infer( - session_id=threading.currentThread().ident, - prompt=prompt, - request_output_len=max_out_len, - sequence_start=True, - sequence_end=True): - continue - response = valid_str(text) - response = response.replace('', '') - return response diff --git a/opencompass/models/turbomind_with_tf_above_v4_33.py b/opencompass/models/turbomind_with_tf_above_v4_33.py index 48706671f..ab6801c9c 100644 --- a/opencompass/models/turbomind_with_tf_above_v4_33.py +++ b/opencompass/models/turbomind_with_tf_above_v4_33.py @@ -1,7 +1,6 @@ # flake8: noqa # yapf: disable import copy -from concurrent.futures import ThreadPoolExecutor from typing import Dict, List, Optional, Union from opencompass.models.base import BaseModel @@ -31,38 +30,32 @@ def __init__( self, path: str, tokenizer_only: bool = False, + backend: str = 'turbomind', engine_config: Dict = {}, gen_config: Dict = {}, - concurrency: int = 8, max_seq_len: int = None, meta_template: Optional[Dict] = None, fastchat_template: Optional[str] = None, stop_words: List[str] = [], ): - from lmdeploy.messages import TurbomindEngineConfig - from lmdeploy.turbomind import TurboMind - from lmdeploy.version import version_info - from transformers import AutoTokenizer - self.logger = get_logger() self.path = path self.tokenizer_only = tokenizer_only self.template_parser = _get_meta_template(meta_template) self.max_seq_len = _get_possible_max_seq_len(max_seq_len, path) - self.origin_tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True) + from lmdeploy import version_info + from transformers import AutoTokenizer + self.version_info = version_info + self.tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True) if not tokenizer_only: DEFAULT_ENGING_CONFIG = {'session_len': self.max_seq_len} _engine_config = DEFAULT_ENGING_CONFIG.copy() _engine_config.update(engine_config) - engine_config = TurbomindEngineConfig(**_engine_config) - tm_model = TurboMind.from_pretrained(path, engine_config=engine_config) - self.tokenizer = tm_model.tokenizer - self.generators = [tm_model.create_instance() for i in range(concurrency)] - self.generator_ids = [i + 1 for i in range(concurrency)] - self.concurrency = concurrency + self.pipe = self._build_pipe(path, backend, _engine_config) + else: + self.pipe = None self.gen_config = gen_config - self.version_info = version_info self.fastchat_template = fastchat_template self.stop_words = list(set(stop_words + self._get_potential_stop_words(path))) self.logger.info(f'using stop words: {self.stop_words}') @@ -76,23 +69,23 @@ def _get_potential_stop_words(self, path: Optional[str]): generation_config = None if generation_config and hasattr(generation_config, 'eos_token_id'): if isinstance(generation_config.eos_token_id, int): - potential_stop_words.append(self.origin_tokenizer.decode(generation_config.eos_token_id)) + potential_stop_words.append(self.tokenizer.decode(generation_config.eos_token_id)) else: assert isinstance(generation_config.eos_token_id, list) for token_id in generation_config.eos_token_id: - potential_stop_words.append(self.origin_tokenizer.decode(token_id)) - if self.origin_tokenizer.eos_token is not None: - potential_stop_words.append(self.origin_tokenizer.eos_token) + 
potential_stop_words.append(self.tokenizer.decode(token_id)) + if self.tokenizer.eos_token is not None: + potential_stop_words.append(self.tokenizer.eos_token) potential_stop_words = list(set(potential_stop_words)) potential_stop_words = [s for s in potential_stop_words if s] return potential_stop_words def generate(self, inputs: List[str], - max_out_len: int = 512, + max_out_len: int, stopping_criteria: List[str] = [], do_sample: Optional[bool] = None, - temperature: int = 1, + temperature: float = 1.0, **kwargs) -> List[str]: """Generate results given a list of inputs. @@ -104,93 +97,45 @@ def generate(self, List[str]: A list of generated strings. """ assert isinstance(inputs, List), f'List(str) is expected, but got {type(inputs)}' - messages = _convert_chat_messages(inputs) if self.fastchat_template: messages = _format_with_fast_chat_template(messages, self.fastchat_template) else: - messages = [self.origin_tokenizer.apply_chat_template(m, add_generation_prompt=True, tokenize=False) for m in messages] - - # split messages into batches - batch_messages = [messages[i:i + self.concurrency] for i in range(0, len(messages), self.concurrency)] + messages = [self.tokenizer.apply_chat_template(m, add_generation_prompt=True, tokenize=False) for m in messages] stop_words = list(set(self.stop_words + stopping_criteria)) - encode_stop_words = [] - if stop_words is not None and len(stop_words) > 0: - for words in stop_words: - encode_stop_words += self.tokenizer.encode(words, add_bos=False) DEFAULT_GEN_CONFIG = { 'max_new_tokens': max_out_len, 'min_new_tokens': 1, - 'top_k': 1, - 'stop_words': encode_stop_words, + 'stop_words': stop_words, } gen_config = copy.deepcopy(DEFAULT_GEN_CONFIG) gen_config.update(self.gen_config) if do_sample: - gen_config['top_k'] = 1000 + gen_config['top_k'] = 40 gen_config['temperature'] = temperature + else: + if self.version_info >= (0, 6, 0): + gen_config['do_sample'] = False + else: + gen_config['top_k'] = 1 - from lmdeploy.messages import GenerationConfig + from lmdeploy import GenerationConfig + gen_config = {k: v for k, v in gen_config.items() if hasattr(GenerationConfig, k)} gen_config = GenerationConfig(**gen_config) - if self.version_info >= (0, 6, 0): - gen_config.stop_words = stop_words - gen_config.convert_stop_bad_words_to_ids(self.tokenizer) results = [] - for batch_message in batch_messages: - n = len(batch_message) - with ThreadPoolExecutor() as executor: - _results = list( - executor.map( - self._generate, - self.generators[:n], - self.generator_ids[:n], - batch_message, - [gen_config] * n, - )) - results += _results + outputs = self.pipe(messages, gen_config=gen_config, do_preprocess=False) + for output in outputs: + text = self.tokenizer.decode(output.token_ids) + results.append(text) for s in stop_words: results = [r.split(s)[0] for r in results] return results - def _generate(self, - generator, - session_id, - prompt: PromptType, - gen_config=None) -> str: - """Generate results given a list of inputs. - - Args: - prompt (PromptType): A string or PromptDict. - The PromptDict should be organized in OpenCompass' - API format. - gen_config (GenerationConfig, optional): Generation - config to set arguments like top_k, top_p, temperature. - Returns: - str: The generated string. 
- """ - assert type(prompt) is str, 'We only support string for TurboMind Python API' - - input_ids = self.tokenizer.encode(prompt, add_bos=False) - for outputs in generator.stream_infer(session_id=session_id, - input_ids=[input_ids], - gen_config=gen_config, - sequence_start=True, - sequence_end=True, - step=0, - stream_output=False): - if self.version_info >= (0, 4, 0): - output_ids = outputs.token_ids - else: - _, output_ids, _ = outputs - response = self.tokenizer.decode(output_ids) - response = valid_str(response) - return response - def get_token_len(self, prompt: str) -> int: """Get lengths of the tokenized strings. @@ -201,5 +146,20 @@ def get_token_len(self, prompt: str) -> int: int: Length of the input tokens """ m = _convert_chat_messages([prompt])[0] - t = self.origin_tokenizer.apply_chat_template(m, add_generation_prompt=True, return_dict=True) + t = self.tokenizer.apply_chat_template(m, add_generation_prompt=True, return_dict=True) return len(t['input_ids']) + + def _build_pipe(self, model_path, backend, engine_config): + from lmdeploy import (PytorchEngineConfig, TurbomindEngineConfig, + pipeline) + + assert backend in ['pytorch', 'turbomind'], \ + f'unsupported backend type: {backend}' + + if backend == 'turbomind': + filtered = {k: v for k, v in engine_config.items() if hasattr(TurbomindEngineConfig, k)} + backend_config = TurbomindEngineConfig(**filtered) + else: + filtered = {k: v for k, v in engine_config.items() if hasattr(PytorchEngineConfig, k)} + backend_config = PytorchEngineConfig(**filtered) + return pipeline(model_path, backend_config=backend_config, log_level='INFO', max_log_len=10) diff --git a/opencompass/openicl/icl_inferencer/icl_gen_inferencer.py b/opencompass/openicl/icl_inferencer/icl_gen_inferencer.py index 17bdf468c..6a33b711a 100644 --- a/opencompass/openicl/icl_inferencer/icl_gen_inferencer.py +++ b/opencompass/openicl/icl_inferencer/icl_gen_inferencer.py @@ -127,6 +127,7 @@ def inference(self, index = len(tmp_result_dict) # 4. Wrap prompts with Dataloader + logger.info('Starting build dataloader') dataloader = self.get_dataloader(prompt_list[index:], self.batch_size) # 5. Inference for prompts in each batch diff --git a/opencompass/runners/dlc.py b/opencompass/runners/dlc.py index 40453ed08..87fee8c4b 100644 --- a/opencompass/runners/dlc.py +++ b/opencompass/runners/dlc.py @@ -182,7 +182,11 @@ def _launch(self, cfg: ConfigDict, random_sleep: Optional[bool] = None): cfg_path=param_file, template=tmpl) cmd = get_cmd() - + # Use specified python env instead of sys.executable + if self.aliyun_cfg['python_env_path']: + cmd = cmd.replace( + sys.executable, + f'{self.aliyun_cfg["python_env_path"]}/bin/python') logger = get_logger() logger.debug(f'Running command: {cmd}') @@ -232,6 +236,8 @@ def _run_within_retry(): while True: # 1. Avoid to request dlc too frequently. # 2. DLC job may not be ready immediately after creation. 
+ dlc_sleep_time = self.aliyun_cfg.get('dlc_sleep_time', 10) + time.sleep(dlc_sleep_time) num_retry = 60 for retry_index in range(num_retry): time.sleep(2) diff --git a/opencompass/summarizers/subjective/__init__.py b/opencompass/summarizers/subjective/__init__.py index 6565d5c89..ea2367c0b 100644 --- a/opencompass/summarizers/subjective/__init__.py +++ b/opencompass/summarizers/subjective/__init__.py @@ -4,6 +4,7 @@ from .alpacaeval import AlpacaSummarizer from .arenahard import ArenaHardSummarizer from .charm import CharmMemSummarizer +from .common_summarizer import CommonSummarizer from .compass_arena import CompassArenaSummarizer from .compassbench import CompassBenchSummarizer from .corev2 import Corev2Summarizer diff --git a/opencompass/summarizers/subjective/common_summarizer.py b/opencompass/summarizers/subjective/common_summarizer.py new file mode 100644 index 000000000..4793a91f1 --- /dev/null +++ b/opencompass/summarizers/subjective/common_summarizer.py @@ -0,0 +1,146 @@ +# flake8: noqa +# yapf: disable +import csv +import os +import os.path as osp +import re +from collections import defaultdict +from datetime import datetime + +import numpy as np +from mmengine import ConfigDict +from tabulate import tabulate + +from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg + +from .compass_arena import CompassArenaSummarizer +from .utils import get_judgeanswer_and_reference, get_outdir + + +def model_abbr_from_cfg_used_in_summarizer(model): + if model.get('summarizer_abbr', None): + return model['summarizer_abbr'] + else: + return model_abbr_from_cfg(model) + +def post_process_single_rate(judgement: str): + """Input a string like below: + + xxx[[5]]xxx, and extract the score + """ + pattern = r'Rating:\s*\[\[([\d.]+)\]\]' + matched_result = re.findall(pattern, judgement) + if matched_result: + score = float(matched_result[0]) + else: + return None + return {'score': score} + + +def get_capability_results( + judged_answers, + references, + fout, + fout_flag, + model_abbr, + judge_model_abbr, + dataset_abbr, +): + capability_ratings = defaultdict(int) + capability_counts = defaultdict(int) + for ans, ref in zip(judged_answers, references): + capability_ratings['total'] += ans['score'] + capability_counts['total'] += 1 + capability_ratings[ref['capability']] += ans['score'] + capability_counts[ref['capability']] += 1 + + capability_avg_ratings = defaultdict(float) + + for capability, total_score in capability_ratings.items(): + s = total_score / capability_counts[capability] + s = round(s, 2) + capability_avg_ratings[capability] = s + columns = list(capability_avg_ratings.keys()) + columns.insert(0, columns.pop(columns.index('total'))) + + if fout_flag == 0: + with open(fout, 'w', newline='') as csvfile: + writer = csv.writer(csvfile) + if fout_flag == 0: + writer.writerow(['model', 'judge_model', 'dataset'] + columns) + writer.writerow([model_abbr] + [judge_model_abbr] + [dataset_abbr] + [capability_avg_ratings[column] for column in columns]) + else: + with open(fout, 'a+', newline='') as csvfile: + writer = csv.writer(csvfile) + writer.writerow([model_abbr] + [judge_model_abbr] + [dataset_abbr] + [capability_avg_ratings[column] for column in columns]) + + +class CommonSummarizer(CompassArenaSummarizer): + """Do the subjectivity analyze based on evaluation results. + + Args: + config (ConfigDict): The configuration object of the evaluation task. + It's expected to be filled out at runtime. 
+ """ + + def __init__(self, config: ConfigDict, judge_type='single_rate') -> None: + self.judge_type = judge_type + self.tasks = [] + self.cfg = config + self.judge_type = 'single_rate' + self.eval_model_cfgs = self.cfg['eval']['partitioner']['models'] + self.judge_model_cfgs = self.cfg['judge_models'] + self.judge_map = { + 'single_rate': post_process_single_rate + } + self.judge_function = self.judge_map[self.judge_type] + + def summarize(self, time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')): + """Summarize the subjectivity analysis based on evaluation results. + + Args: + time_str (str): Timestamp for file naming. + + Returns: + pd.DataFrame: The summary results. + """ + if self.judge_type == 'pair': + return super().summarize() + + # self.judge_type == 'single' + dataset_cfgs = self.cfg['datasets'] + output_dir, results_folder = get_outdir(self.cfg, time_str) + fout_flag = 0 + output_tmp_file = osp.join(output_dir, 'result.csv') + output_file = osp.join(output_dir, 'total_result.csv') + for eval_model_cfg in self.eval_model_cfgs: + for judge_model_cfg in self.judge_model_cfgs: + eval_model_abbr = model_abbr_from_cfg(eval_model_cfg) + show_model_abbr = model_abbr_from_cfg_used_in_summarizer(eval_model_cfg) + show_judge_model_abbr = model_abbr_from_cfg_used_in_summarizer(judge_model_cfg) + judge_abbr = model_abbr_from_cfg(judge_model_cfg) + subdir_path = os.path.join(results_folder, eval_model_abbr + '_judged-by--' + judge_abbr) + if os.path.isdir(subdir_path): + for dataset in dataset_cfgs: + judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function) + show_dataset_abbr = dataset_abbr_from_cfg(dataset) + + get_capability_results(judged_answers, references, output_tmp_file, fout_flag, show_model_abbr, show_judge_model_abbr, show_dataset_abbr) + fout_flag += 1 + else: + print(subdir_path + ' is not exist! 
please check!') + with open(output_tmp_file, 'r') as f: + csv_reader = csv.reader(f) + header = next(csv_reader) + table = [line for line in csv_reader] + + new_header = [''] + [line[0] for line in table] + new_table = [[h] + line[1:] for h, line in zip(header[1:], table)] + new_table = [[h] + [line[i] for line in table] for i, h in enumerate(header[1:], start=1)] + t = tabulate(new_table, headers=new_header) + with open(output_file, 'a') as f: + f.write(','.join(new_header) + '\n') + for line in new_table: + f.write(','.join(map(str, line)) + '\n') + print(t) + print(output_file) diff --git a/opencompass/summarizers/subjective/flames.py b/opencompass/summarizers/subjective/flames.py index c0150b749..53cde79c4 100644 --- a/opencompass/summarizers/subjective/flames.py +++ b/opencompass/summarizers/subjective/flames.py @@ -21,7 +21,7 @@ def post_process_flames(judgement: str): 分数=3 and extract the score """ - matches = re.findall(r'分数=(\d+)', text) + matches = re.findall(r'分数=(\d+)', judgement) if matches: matches = matches[0] return int(matches) diff --git a/opencompass/utils/model_postprocessors.py b/opencompass/utils/model_postprocessors.py index 887ffa2c8..13690ad06 100644 --- a/opencompass/utils/model_postprocessors.py +++ b/opencompass/utils/model_postprocessors.py @@ -6,12 +6,71 @@ from opencompass.registry import TEXT_POSTPROCESSORS +from .postprocessors.naive import NaiveExtractor, format_input_naive from .postprocessors.xfinder.extractor import Extractor from .postprocessors.xfinder.xfinder_utils import (DataProcessor, convert_to_xfinder_format) -def gen_output(ori_data, extractor): +def gen_output_naive(ori_data, extractor): + extracted_answers = [] + for item in tqdm(ori_data): + user_input = extractor.prepare_input(item) + extracted_answer = extractor.gen_output(user_input) + item['extracted_answer'] = extracted_answer + extracted_answers.append(extracted_answer) + + return extracted_answers + + +@TEXT_POSTPROCESSORS.register_module('naive') +def navie_model_postprocess(preds: list, + model_name: str, + custom_instruction: str, + api_url: Union[str, list], + num_processes: int = 8, + **kwargs) -> list: + """Postprocess the text extracted by custom model. + Args: + preds (list): The question, reference answer and model prediction. + model_name (str): The name of the model. + custom_instruction (str): Custom instruction for the dataset. + url (Union[str, list]): The api url of the model. + + Returns: + list: The postprocessed answers. + """ + + def _eval_pred(texts, extractor, num_processes): + ori_data = texts + extracted_answers = [] + batched_ori_data = [] + # Split data into batches + num_processes = min(num_processes, len(ori_data)) + batch_size = len(ori_data) // num_processes + for i in range(0, len(ori_data), batch_size): + batched_ori_data.append(ori_data[i:i + batch_size]) + with Pool(num_processes) as p: + results = p.map(partial(gen_output_naive, extractor=extractor), + batched_ori_data) + for result in results: + extracted_answers.extend(result) + return extracted_answers + + format_data = format_input_naive(preds) + assert api_url is not None, 'Please provide the api url.' 
+ extractor = NaiveExtractor( + model_name=model_name, + custom_instruction=custom_instruction, + url=api_url.split(',') if ',' in api_url else api_url) + calc_acc_func = partial(_eval_pred, + extractor=extractor, + num_processes=num_processes) + extracted_answers = calc_acc_func(format_data) + return extracted_answers + + +def gen_output_xfinder(ori_data, extractor): ext_cor_pairs = [] extracted_data = [] extracted_answers = [] @@ -30,9 +89,8 @@ def gen_output(ori_data, extractor): @TEXT_POSTPROCESSORS.register_module('xfinder') -def xfinder_postprocess(preds: list, question_type: str, - xfinder_model_name: str, - xfiner_api_url: Union[str, list], **kwargs) -> list: +def xfinder_postprocess(preds: list, question_type: str, model_name: str, + api_url: Union[str, list], **kwargs) -> list: """Postprocess the text extracted by xFinder model. Args: preds (list): The question, reference answer and model prediction. @@ -56,7 +114,7 @@ def _eval_pred(texts, data_processor, extractor, num_processes=8): for i in range(0, len(ori_data), batch_size): batched_ori_data.append(ori_data[i:i + batch_size]) with Pool(num_processes) as p: - results = p.map(partial(gen_output, extractor=extractor), + results = p.map(partial(gen_output_xfinder, extractor=extractor), batched_ori_data) for result in results: extracted_answers += result[0] @@ -65,11 +123,11 @@ def _eval_pred(texts, data_processor, extractor, num_processes=8): return extracted_answers format_data = convert_to_xfinder_format(question_type, preds) - assert xfiner_api_url is not None, 'Please provide the api url.' + assert api_url is not None, 'Please provide the api url.' data_processor = DataProcessor() - extractor = Extractor(model_name=xfinder_model_name, - url=xfiner_api_url.split(',') - if ',' in xfiner_api_url else xfiner_api_url) + extractor = Extractor( + model_name=model_name, + url=api_url.split(',') if ',' in api_url else api_url) calc_acc_func = partial(_eval_pred, data_processor=data_processor, extractor=extractor) diff --git a/opencompass/utils/postprocessors/naive/PROMPT_TEMPLATE.py b/opencompass/utils/postprocessors/naive/PROMPT_TEMPLATE.py new file mode 100644 index 000000000..b8b9abbb4 --- /dev/null +++ b/opencompass/utils/postprocessors/naive/PROMPT_TEMPLATE.py @@ -0,0 +1,11 @@ +OPTION_NAVIE_PROMPT_TEMPLATE = """ +There is a detailed explanation of the final answer you should extract: +1. You should extract the final answer option like 'A', 'B', 'C', 'D' ... from the given output sentences. +2. The question is a single choice question, so the final answer option should be one of the options, not a combination of options. +""" # noqa + +MATH_NAVIE_PROMPT_TEMPLATE = """ +This is a detailed explanation of the final answer you should extract: +1. The question type is a math question, so the final answer should be a number, set, vector, matrix, interval, expression, function, equation, or inequality and any combination of them. +2. If the final answer includes additional symbols, such as units, you should exclude them and only extract the pure final answer. 
+""" # noqa diff --git a/opencompass/utils/postprocessors/naive/README.md b/opencompass/utils/postprocessors/naive/README.md new file mode 100644 index 000000000..dcc14a4bf --- /dev/null +++ b/opencompass/utils/postprocessors/naive/README.md @@ -0,0 +1,71 @@ +## Short Usage Introduction for Naive Model Postprocessor with Custom Model + + + +### Step 1: Deploy an API server using vLLM or LMDeploy + +```bash +lmdeploy serve api_server meta-llama/Meta-Llama-3-8B-Instruct --model-name llama3-8b-instruct --server-port 23333 --backend turbomind --tp 1 +``` + +### Step 2: Add Naive Model Postprocessor to the configuration file + +Take GSM8K as an example, you can add the following lines to the configuration file and replace the `api_url` with the correct address of the API server. + +```python +... +from opencompass.utils.model_postprocessors import navie_model_postprocess +from opencompass.utils.postprocessors.naive import MATH_NAVIE_PROMPT_TEMPLATE + +... + +gsm8k_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), + pred_postprocessor=dict(type=math_postprocess_v2), + dataset_postprocessor=dict(type=gsm8k_dataset_postprocess), + # Add the following line to use the naive model postprocessor + model_postprocessor=dict( + type=navie_model_postprocess, + custom_instruction=MATH_NAVIE_PROMPT_TEMPLATE, + model_name='llama3-8b-instruct', + api_url='http://0.0.0.0:23333/v1,http://0.0.0.0:23334/v1') + ) +... + +``` + +The prompt for extraction can also be customized by changing the `custom_instruction` parameter. Now support two default templates: `MATH_NAVIE_PROMPT_TEMPLATE` for math problems extraction like GSM8K and MATH, and `OPTION_NAVIE_PROMPT_TEMPLATE` for option problems extraction like MMLU. You can also write your own prompt template, like: + +```python +OPTION_NAVIE_PROMPT_TEMPLATE = """ +There is a detailed explanation of the final answer you should extract: +1. You should extract the final answer option like 'A', 'B', 'C', 'D' ... from the given output sentences. +2. The question is a single choice question, so the final answer option should be one of the options, not a combination of options. +""" +``` + +Your prompt should start with `There is a detailed explanation of the final answer you should extract:` and following with your customized instructions. + +### Step 3: Run the Evaluation as Usual + +Now you can run the evaluation as usual with the configuration file you modified. The evaluation will use the custom model as the post-process model to get the final result. 
+
+The final result will be reported as `model_postprocess_accuracy` in the evaluation results, for example:
+
+```Markdown
+dataset    version    metric                      mode    llama-3-8b-instruct-turbomind
+---------  ---------  --------------------------  ------  -------------------------------
+gsm8k      a58960     accuracy                    gen     73.46
+gsm8k      a58960     model_postprocess_accuracy  gen     78.77
+```
+
+## Experiment Results
+
+We have tested the model postprocessing method with different models (Qwen2-72B-Chat, Llama3-8b-Chat) as the post-processing model on the GSM8K and MMLU datasets for `Meta-Llama-3-8B-Instruct` with the above settings, and the results are as follows:
+
+| Dataset | Type   | Config ID | Regex Postprocess Score | Model Postprocess Score (Llama3-8b-Instruct) | Model Postprocess Score (Qwen2-72B-Chat) |
+| ------- | ------ | --------- | ----------------------- | -------------------------------------------- | ---------------------------------------- |
+| gsm8k   | math   | a58960    | 73.46                   | 79.08                                        | 78.77                                    |
+| mmlu    | option | 4d595a    | 67.89                   | 65.26                                        | 67.94                                    |
+
+The `model_postprocess_accuracy` metric is the final result after the `Naive Model Postprocessor` is applied.
diff --git a/opencompass/utils/postprocessors/naive/__init__.py b/opencompass/utils/postprocessors/naive/__init__.py
new file mode 100644
index 000000000..70a914d58
--- /dev/null
+++ b/opencompass/utils/postprocessors/naive/__init__.py
@@ -0,0 +1,2 @@
+from .extractor import *  # noqa
+from .PROMPT_TEMPLATE import *  # noqa
diff --git a/opencompass/utils/postprocessors/naive/extractor.py b/opencompass/utils/postprocessors/naive/extractor.py
new file mode 100644
index 000000000..c759094ce
--- /dev/null
+++ b/opencompass/utils/postprocessors/naive/extractor.py
@@ -0,0 +1,121 @@
+# Naive model extractor for OpenCompass, modified from xFinder: https://github.com/IAAR-Shanghai/xFinder # noqa
+import json
+import time
+from logging import getLogger
+
+from openai import OpenAI
+
+Meta_Instruction = """I will provide you with a question, output sentences along with an answer range. The output sentences are the response of the question provided. The answer range could either describe the type of answer expected or list all possible valid answers. Using the information provided, you must accurately and precisely determine and extract the intended key answer from the output sentences. Please don't have your subjective thoughts about the question.
+First, you need to determine whether the content of the output sentences is relevant to the given question. If the entire output sentences are unrelated to the question (meaning the output sentences are not addressing the question), then output [No valid answer].
+Otherwise, ignore the parts of the output sentences that have no relevance to the question and then extract the key answer that matches the answer range.
+Below are some special cases you need to be aware of:
+    (1) If the output sentences present multiple different answers, carefully determine if the later provided answer is a correction or modification of a previous one. If so, extract this corrected or modified answer as the final response. Conversely, if the output sentences fluctuate between multiple answers without a clear final answer, you should output [No valid answer].
+    (2) If the answer range is a list and the key answer in the output sentences is not explicitly listed among the candidate options in the answer range, also output [No valid answer].
+    (3) You should only return the precise answer you extract, without processing the answer. Please return only the answer and do not add any additional content.
+
+""" # noqa
+
+
+def format_input_naive(data):
+    format_data = []
+    for item in data:
+        template = {}
+        question = item['origin_prompt'][-1]['prompt']
+        llm_output = item['prediction']
+        correct_answer = item['reference'] if item['reference'] else item[
+            'gold']
+        template['correct_answer'] = correct_answer
+        template['question'] = question
+        template['llm_output'] = llm_output
+
+        format_data.append(template)
+    return format_data
+
+
+class NaiveExtractor:
+
+    def __init__(
+            self,
+            model_name,
+            model_path=None,
+            url=None,
+            temperature=0,
+            max_tokens=3000,
+            api_key='EMPTY',
+            SYSTEM='You are a helpful assistant tasked with extracting the precise key answer from given output sentences. You must only provide the extracted key answer without including any additional text.',  # noqa
+            custom_instruction=''):
+        self.model_name = model_name
+        self.SYSTEM = SYSTEM
+        self.model_path = model_path
+        self.url = url
+        self.api_key = api_key
+        self.temperature = temperature
+        self.max_tokens = max_tokens
+        self.custom_instruction = custom_instruction
+        self.logger = getLogger(__name__)
+
+    def prepare_input(self, item):
+        user_input = Meta_Instruction + self.custom_instruction + \
+            "Question: \"\"\"" + item['question'] + "\"\"\"\n\n" + \
+            "Output sentences: \"\"\"" + item['llm_output'] + "\"\"\"\n\n" + \
+            'Key extracted answer: '
+
+        return user_input
+
+    def gen_output(self, query):
+        return self.openai_infer(query)
+
+    def openai_infer(self, query: str, retry=9) -> str:
+        """Perform inference through an OpenAI-compatible API.
+
+        Args:
+            query (str): The input query.
+            retry (int): Number of retries before giving up.
+
+        Returns:
+            str: The extracted key answer returned by the extractor model.
+        """
+        if isinstance(self.url, list):
+            # Randomly choose one url for better load balancing
+            import random
+            self.url = random.choice(self.url)
+        self.client = OpenAI(
+            api_key=self.api_key,
+            base_url=self.url,
+        )
+        self.retry = retry
+
+        t = time.time()
+        retry = self.retry
+        response = ''
+        while retry > 0:
+            try:
+                chat_response = self.client.chat.completions.create(
+                    model=self.client.models.list().data[0].id
+                    if self.model_name == '' else self.model_name,
+                    messages=[
+                        {
+                            'role': 'system',
+                            'content': self.SYSTEM
+                        },
+                        {
+                            'role': 'user',
+                            'content': query
+                        },
+                    ],
+                    temperature=self.temperature,
+                    max_tokens=self.max_tokens,
+                )
+                js_response = json.loads(chat_response.model_dump_json())
+                response = js_response['choices'][0]['message']['content']
+                break
+            except Exception as e:
+                self.logger.info(f'Error: {e}')
+                self.logger.info(f'{self.url} is down. Retrying...')
+                self.logger.info(f'Time elapsed: {time.time() - t} seconds')
+                time.sleep(6)
+                retry -= 1
+        if retry == 0:
+            response = 'Error: Failed to get response.'
+ self.logger.info(f'{response} after {self.retry} tries.') + raise ValueError('The api is down') + return response.strip() diff --git a/opencompass/utils/run.py b/opencompass/utils/run.py index 67c465941..025efc4b3 100644 --- a/opencompass/utils/run.py +++ b/opencompass/utils/run.py @@ -9,7 +9,7 @@ from opencompass.datasets.custom import make_custom_dataset_config from opencompass.models import (VLLM, HuggingFace, HuggingFaceBaseModel, HuggingFaceCausalLM, HuggingFaceChatGLM3, - HuggingFacewithChatTemplate, TurboMindModel, + HuggingFacewithChatTemplate, TurboMindModelwithChatTemplate, VLLMwithChatTemplate) from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner @@ -233,7 +233,7 @@ def change_accelerator(models, accelerator): model_accels = [] for model in models: logger.info(f'Transforming {model["abbr"]} to {accelerator}') - # change HuggingFace model to VLLM or TurboMindModel + # change HuggingFace model to VLLM or LMDeploy if model['type'] in [HuggingFace, HuggingFaceCausalLM, HuggingFaceChatGLM3, f'{HuggingFaceBaseModel.__module__}.{HuggingFaceBaseModel.__name__}']: gen_args = dict() if model.get('generation_kwargs') is not None: @@ -254,10 +254,10 @@ def change_accelerator(models, accelerator): if accelerator == 'lmdeploy': logger.info(f'Transforming {model["abbr"]} to {accelerator}') - mod = TurboMindModel + mod = TurboMindModelwithChatTemplate acc_model = dict( type=f'{mod.__module__}.{mod.__name__}', - abbr=model['abbr'].replace('hf', 'turbomind') if '-hf' in model['abbr'] else model['abbr'] + '-turbomind', + abbr=model['abbr'].replace('hf', 'lmdeploy') if '-hf' in model['abbr'] else model['abbr'] + '-lmdeploy', path=model['path'], engine_config=dict(session_len=model['max_seq_len'], max_batch_size=model['batch_size'], @@ -270,7 +270,6 @@ def change_accelerator(models, accelerator): max_out_len=model['max_out_len'], max_seq_len=model['max_seq_len'], batch_size=model['batch_size'], - concurrency=model['batch_size'], run_cfg=model['run_cfg'], ) for item in ['meta_template']: @@ -312,7 +311,7 @@ def change_accelerator(models, accelerator): mod = TurboMindModelwithChatTemplate acc_model = dict( type=f'{mod.__module__}.{mod.__name__}', - abbr=model['abbr'].replace('hf', 'turbomind') if '-hf' in model['abbr'] else model['abbr'] + '-turbomind', + abbr=model['abbr'].replace('hf', 'lmdeploy') if '-hf' in model['abbr'] else model['abbr'] + '-lmdeploy', path=model['path'], engine_config=dict(max_batch_size=model.get('batch_size', 16), tp=model['run_cfg']['num_gpus']), gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9), diff --git a/requirements/agent.txt b/requirements/agent.txt index 334574100..bf4f88129 100644 --- a/requirements/agent.txt +++ b/requirements/agent.txt @@ -5,7 +5,7 @@ json5 jupyter jupyter_client jupytext -lagent==0.1.2 +-e git+https://github.com/open-compass/lagent-cibench.git#egg=lagent lightgbm==4.1.0 networkx scikit-image diff --git a/requirements/extra.txt b/requirements/extra.txt index 218348344..efeef772e 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -1,6 +1,7 @@ # Alpaca-eval alpaca-eval==0.6 cn2an +dingo-python # Icl topk retriever faiss_gpu==1.7.2 # Humaneval, Humaneval X diff --git a/requirements/runtime.txt b/requirements/runtime.txt index dc6389114..e7229e889 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -23,6 +23,7 @@ python-Levenshtein rank_bm25==0.2.2 rapidfuzz requests>=2.31.0 +retrying rich rouge -e git+https://github.com/Isaac-JL-Chen/rouge_chinese.git@master#egg=rouge_chinese