From 141588ebda40ecf7690a97e453478949c3b11505 Mon Sep 17 00:00:00 2001 From: Yifan Mai Date: Fri, 22 Nov 2024 21:14:15 -0800 Subject: [PATCH 01/14] Add Upstage Solar Pro Preview model (#3181) --- src/helm/config/model_deployments.yaml | 13 ++++++++++++- src/helm/config/model_metadata.yaml | 9 +++++++++ src/helm/config/tokenizer_configs.yaml | 9 +++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/src/helm/config/model_deployments.yaml b/src/helm/config/model_deployments.yaml index ff737870cb2..9819638c159 100644 --- a/src/helm/config/model_deployments.yaml +++ b/src/helm/config/model_deployments.yaml @@ -1271,6 +1271,17 @@ model_deployments: client_spec: class_name: "helm.clients.huggingface_client.HuggingFaceClient" + # Upstage + - name: huggingface/solar-pro-preview-instruct + model_name: upstage/solar-pro-preview-instruct + tokenizer_name: upstage/solar-pro-preview-instruct + max_sequence_length: 4096 + client_spec: + class_name: "helm.clients.huggingface_client.HuggingFaceClient" + args: + torch_dtype: auto + trust_remote_code: true + ## Text-to-Image Diffusion Models - name: huggingface/dreamlike-diffusion-v1-0 @@ -2818,4 +2829,4 @@ model_deployments: client_spec: class_name: "helm.clients.huggingface_client.HuggingFaceClient" args: - pretrained_model_name_or_path: ibm-granite/granite-3.0-1b-a400m-base \ No newline at end of file + pretrained_model_name_or_path: ibm-granite/granite-3.0-1b-a400m-base diff --git a/src/helm/config/model_metadata.yaml b/src/helm/config/model_metadata.yaml index 65afaa1b895..db75d81266d 100644 --- a/src/helm/config/model_metadata.yaml +++ b/src/helm/config/model_metadata.yaml @@ -3126,6 +3126,15 @@ models: release_date: 2022-09-19 tags: [] # TODO: add tags + # Upstage + - name: upstage/solar-pro-preview-instruct + display_name: Solar Pro Preview (22B) + description: Solar Pro Preview (22B) is open-weights model for single GPU inference that is a preview of the upcoming Solar Pro model ([blog](https://www.upstage.ai/products/solar-pro-preview)). 
+ creator_organization_name: Upstage + access: open + num_parameters: 22000000000 + release_date: 2024-09-11 + tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG] # Writer diff --git a/src/helm/config/tokenizer_configs.yaml b/src/helm/config/tokenizer_configs.yaml index 9f92970bc73..2741c25aaab 100644 --- a/src/helm/config/tokenizer_configs.yaml +++ b/src/helm/config/tokenizer_configs.yaml @@ -612,6 +612,15 @@ tokenizer_configs: end_of_text_token: "" prefix_token: "" + # Upstage + - name: upstage/solar-pro-preview-instruct + tokenizer_spec: + class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" + args: + trust_remote_code: true + end_of_text_token: "<|im_end|>" + prefix_token: "<|startoftext|>" + # Writer - name: writer/gpt2 tokenizer_spec: From ee10b8fd5a0f46949c98cd7a0a79cb7e1b163073 Mon Sep 17 00:00:00 2001 From: Yifan Mai Date: Fri, 22 Nov 2024 21:14:20 -0800 Subject: [PATCH 02/14] Add Llama 3.1 Nemotron Instruct (70B) model on Together AI (#3172) --- src/helm/config/model_deployments.yaml | 10 ++++++++++ src/helm/config/model_metadata.yaml | 10 ++++++++++ src/helm/config/tokenizer_configs.yaml | 8 ++++++++ 3 files changed, 28 insertions(+) diff --git a/src/helm/config/model_deployments.yaml b/src/helm/config/model_deployments.yaml index 9819638c159..24ea01bafcd 100644 --- a/src/helm/config/model_deployments.yaml +++ b/src/helm/config/model_deployments.yaml @@ -2164,6 +2164,16 @@ model_deployments: args: together_model: meta-llama/Meta-Llama-Guard-3-8B + # NVIDIA + - name: together/llama-3.1-nemotron-70b-instruct + model_name: nvidia/llama-3.1-nemotron-70b-instruct + tokenizer_name: nvidia/llama-3.1-nemotron-70b-instruct + max_sequence_length: 32768 + client_spec: + class_name: "helm.clients.together_client.TogetherClient" + args: + together_model: nvidia/Llama-3.1-Nemotron-70B-Instruct-HF + # 01.AI - name: together/yi-6b model_name: 01-ai/yi-6b diff --git a/src/helm/config/model_metadata.yaml b/src/helm/config/model_metadata.yaml index db75d81266d..eb611e446f9 100644 --- a/src/helm/config/model_metadata.yaml +++ b/src/helm/config/model_metadata.yaml @@ -2173,6 +2173,16 @@ models: release_date: 2024-06-17 tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + - name: nvidia/llama-3.1-nemotron-70b-instruct + display_name: Llama 3.1 Nemotron Instruct (70B) + description: Llama-3.1-Nemotron-70B-Instruct is a large language model customized by NVIDIA to improve the helpfulness of LLM generated responses to user queries. It was trained using RLHF (specifically, REINFORCE), Llama-3.1-Nemotron-70B-Reward and HelpSteer2-Preference prompts on a Llama-3.1-70B-Instruct model. 
([paper](https://arxiv.org/abs/2410.01257)) + creator_organization_name: NVIDIA + access: open + num_parameters: 70000000000 + release_date: 2024-10-02 + tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + + # OpenAI ## GPT 2 Models diff --git a/src/helm/config/tokenizer_configs.yaml b/src/helm/config/tokenizer_configs.yaml index 2741c25aaab..e2293562610 100644 --- a/src/helm/config/tokenizer_configs.yaml +++ b/src/helm/config/tokenizer_configs.yaml @@ -473,6 +473,14 @@ tokenizer_configs: end_of_text_token: "<|endoftext|>" prefix_token: "<|endoftext|>" + - name: nvidia/llama-3.1-nemotron-70b-instruct + tokenizer_spec: + class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" + args: + pretrained_model_name_or_path: nvidia/Llama-3.1-Nemotron-70B-Instruct-HF + end_of_text_token: "<|eot_id|>" + prefix_token: "<|begin_of_text|>" + # OpenAI - name: openai/cl100k_base tokenizer_spec: From c0b29010bf0f6a1f5598eafd3a9aa13fcefca0af Mon Sep 17 00:00:00 2001 From: Haoqin Tu Date: Tue, 3 Dec 2024 23:50:20 -0800 Subject: [PATCH 03/14] Add Air-Bench chat audio scenario (#3189) Co-authored-by: Yifan Mai --- .../presentation/run_entries_speech.conf | 4 + .../benchmark/run_specs/audio_run_specs.py | 20 +++ .../audio_language/air_bench_chat_scenario.py | 117 ++++++++++++++++++ src/helm/benchmark/static/schema_speech.yaml | 34 +++-- 4 files changed, 165 insertions(+), 10 deletions(-) create mode 100644 src/helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py diff --git a/src/helm/benchmark/presentation/run_entries_speech.conf b/src/helm/benchmark/presentation/run_entries_speech.conf index e89a61208d8..cb13962c654 100644 --- a/src/helm/benchmark/presentation/run_entries_speech.conf +++ b/src/helm/benchmark/presentation/run_entries_speech.conf @@ -6,6 +6,10 @@ entries: [ {description: "vocal_sound:model=audiolm", priority: 1} {description: "audiocaps:model=audiolm", priority: 1} {description: "voxceleb2:model=audiolm", priority: 1} + {description: "air_bench_chat:subject=speech,model=audiolm", priority: 1} + {description: "air_bench_chat:subject=sound,model=audiolm", priority: 1} + {description: "air_bench_chat:subject=music,model=audiolm", priority: 1} + {description: "air_bench_chat:subject=mix,model=audiolm", priority: 1} #################################################################################################################### # Fairness diff --git a/src/helm/benchmark/run_specs/audio_run_specs.py b/src/helm/benchmark/run_specs/audio_run_specs.py index 97fcddd55fc..ea8e32c225d 100644 --- a/src/helm/benchmark/run_specs/audio_run_specs.py +++ b/src/helm/benchmark/run_specs/audio_run_specs.py @@ -373,3 +373,23 @@ def get_casual_conversations2_run_spec(subject: str) -> RunSpec: metric_specs=metric_specs, groups=[run_spec_name], ) + + +@run_spec_function("air_bench_chat") +def get_air_bench_chat_run_spec(subject: str) -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.audio_language.air_bench_chat_scenario." 
"AirBenchChatScenario", + args={"subject": subject}, + ) + adapter_spec = _get_generation_adapter_spec( + max_tokens=50, + ) + metric_specs: List[MetricSpec] = _get_open_ended_generation_metric_specs() + run_spec_name: str = "air_bench_chat" + return RunSpec( + name=f"{run_spec_name}:subject={subject}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=metric_specs, + groups=[run_spec_name], + ) diff --git a/src/helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py b/src/helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py new file mode 100644 index 00000000000..89de1a93f88 --- /dev/null +++ b/src/helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py @@ -0,0 +1,117 @@ +from typing import List +import os + +from helm.benchmark.scenarios.scenario import ( + Scenario, + Instance, + Reference, + TEST_SPLIT, + CORRECT_TAG, + Input, + Output, +) +from tqdm import tqdm +from helm.common.media_object import MediaObject, MultimediaObject +from helm.common.general import ensure_file_downloaded +import json + + +class AirBenchChatScenario(Scenario): + """Air-Bench Chat + + Air-Bench AIR-Bench (Audio InstRuction Benchmark) is a benchmark designed to evaluate the ability of audio language + models to understand various types of audio signals (including human speech, natural sounds and music), and + furthermore, to interact with humans in textual format. AIR-Bench encompasses two dimensions: foundation + and chat benchmarks. The former consists of 19 tasks with approximately 19k single-choice questions. The + latter one contains 2k instances of open-ended question-and-answer data. We consider the chat benchmark + in this scenario. + + Paper: https://aclanthology.org/2024.acl-long.109.pdf + Code: https://github.com/OFA-Sys/AIR-Bench + + Citation: + @inproceedings{yang-etal-2024-air, + title = "{AIR}-Bench: Benchmarking Large Audio-Language Models via Generative Comprehension", + author = "Yang, Qian and + Xu, Jin and + Liu, Wenrui and + Chu, Yunfei and + Jiang, Ziyue and + Zhou, Xiaohuan and + Leng, Yichong and + Lv, Yuanjun and + Zhao, Zhou and + Zhou, Chang and + Zhou, Jingren", + booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational + Linguistics (Volume 1: Long Papers)", + year = "2024",} + """ + + HF_DATA_PATH_PREFIX = "https://huggingface.co/datasets/qyang1021/AIR-Bench-Dataset/resolve/main/Chat" + META_DATA_FILE_PATH = "https://huggingface.co/datasets/qyang1021/AIR-Bench-Dataset/resolve/main/Chat/Chat_meta.json" + SUJECTS = ["music", "sound", "speech", "mix"] + + name = "air_bench_chat" + description = "A large-scale dataset of about 46K audio clips to human-written text pairs \ + ([Yang et al, 2024](https://aclanthology.org/2024.acl-long.109.pdf))." + tags: List[str] = ["audio", "reasoning"] + + def __init__(self, subject: str) -> None: + super().__init__() + + if subject not in AirBenchChatScenario.SUJECTS: + raise ValueError(f"Invalid subject. 
Valid subjects are: {AirBenchChatScenario.SUJECTS}") + + self._subject: str = subject + + def _get_subject_indices(self, meta_data) -> List[int]: + subject_indices = [] + for idx, line in enumerate(meta_data): + if self._subject == "mix": + if "_".join(line["task_name"].split("_")[:2]) == "speech_and": + subject_indices.append(idx) + else: + if line["task_name"].split("_")[0] == self._subject and line["task_name"].split("_")[1] != "and": + subject_indices.append(idx) + return subject_indices + + def _get_content_type(self, audio_file_name) -> str: + if audio_file_name.endswith(".wav"): + return "audio/wav" + elif audio_file_name.endswith(".mp3"): + return "audio/mp3" + else: + raise ValueError(f"Unsupported audio file format: {audio_file_name}") + + def get_instances(self, output_path: str) -> List[Instance]: + instances: List[Instance] = [] + data_dir: str = os.path.join(output_path, "wav_files") + meta_data_path: str = os.path.join(output_path, "Chat_meta.json") + ensure_file_downloaded(source_url=AirBenchChatScenario.META_DATA_FILE_PATH, target_path=meta_data_path) + meta_data = json.load(open(meta_data_path)) + subject_indices = self._get_subject_indices(meta_data) + for _, row in enumerate(tqdm(subject_indices)): + audio_meda_data = meta_data[row] + hf_audio_file_path = os.path.join( + self.HF_DATA_PATH_PREFIX, + f'{audio_meda_data["task_name"]}_{audio_meda_data["dataset_name"]}/{audio_meda_data["path"]}', + ) + local_audio_file_path = os.path.join( + data_dir, f'{audio_meda_data["task_name"]}_{audio_meda_data["dataset_name"]}_{audio_meda_data["path"]}' + ) + ensure_file_downloaded(source_url=hf_audio_file_path, target_path=local_audio_file_path) + input = Input( + multimedia_content=MultimediaObject( + [ + MediaObject( + content_type=self._get_content_type(audio_meda_data["path"]), + location=local_audio_file_path, + ), + MediaObject(content_type="text/plain", text=audio_meda_data["question"]), + ] + ) + ) + references = [Reference(Output(text=audio_meda_data["answer_gt"]), tags=[CORRECT_TAG])] + instances.append(Instance(input=input, references=references, split=TEST_SPLIT)) + return instances diff --git a/src/helm/benchmark/static/schema_speech.yaml b/src/helm/benchmark/static/schema_speech.yaml index 79c6f99163c..13ce45228c7 100644 --- a/src/helm/benchmark/static/schema_speech.yaml +++ b/src/helm/benchmark/static/schema_speech.yaml @@ -195,7 +195,6 @@ run_groups: audio sample ([Becker et al, 2023](https://arxiv.org/abs/1807.03418)). metric_groups: - accuracy - - efficiency - general_information environment: main_name: exact_match @@ -219,7 +218,6 @@ run_groups: ([Wang et al, 2020](https://arxiv.org/abs/2007.10310)). metric_groups: - accuracy - - efficiency - general_information environment: main_name: bleu @@ -241,7 +239,6 @@ run_groups: age, gender, native language, country, and health condition ([Gong et al, 2022](https://arxiv.org/abs/2205.03433)). metric_groups: - accuracy - - efficiency - general_information environment: main_name: exact_match @@ -263,7 +260,6 @@ run_groups: Dutch, German, French, Spanish, Italian, Portuguese", Polish ([Pratap et al, 2022](https://arxiv.org/abs/2012.03411)). metric_groups: - accuracy - - efficiency - general_information environment: main_name: f1_score @@ -288,7 +284,6 @@ run_groups: South Asian, South East Asian, Chinese Japanase Korean ([Conneau et al, 2022](https://arxiv.org/abs/2205.12446)). 
metric_groups: - accuracy - - efficiency - general_information environment: main_name: exact_match @@ -353,7 +348,6 @@ run_groups: ([Ardila et al, 2020](https://arxiv.org/abs/1912.06670)). metric_groups: - accuracy - - efficiency - general_information environment: main_name: word_accuracy @@ -378,7 +372,6 @@ run_groups: ([Shah et al, 2024](https://arxiv.org/abs/2403.07937)). metric_groups: - accuracy - - efficiency - general_information environment: main_name: word_accuracy @@ -401,7 +394,6 @@ run_groups: The dataset contains the audio and question for three subsets: occupation, status, and potential_crime. metric_groups: - accuracy - - efficiency - general_information environment: main_name: exact_match @@ -427,7 +419,6 @@ run_groups: questions answering task. metric_groups: - accuracy - - efficiency - general_information environment: main_name: exact_match @@ -437,4 +428,27 @@ run_groups: what: audio, spoken language, speaker's gender, age information of audio samples who: real speakers when: "2023" - language: 10 languages \ No newline at end of file + language: 10 languages + + - name: air_bench_chat + display_name: Air-Bench Chat + description: > + Air-Bench (Yang et al, 2024) encompasses two dimensions: foundation and chat benchmarks. The former consists of 19 tasks with + approximately 19k single-choice questions. The latter one contains 2k instances of open-ended question-and-answer data. + We consider the chat benchmark in this scenario. + + The dataset contains the audio question answering task in four subjects: sound, speech, music, and mixed. + ([Yang et al, 2024](https://aclanthology.org/2024.acl-long.109.pdf)). + metric_groups: + - accuracy + - general_information + - reasoning + environment: + main_name: f1_score + main_split: test + taxonomy: + task: audio question answering + what: adio, question, and answer of audio samples + who: real speakers + when: "2024" + language: English \ No newline at end of file From 2e16cf2aaed1cc2e50674e7c4582178b18f4e835 Mon Sep 17 00:00:00 2001 From: Yifan Mai Date: Wed, 4 Dec 2024 16:36:38 -0800 Subject: [PATCH 04/14] Add Solar Pro model (#3198) --- src/helm/clients/upstage_client.py | 23 +++++++++++++++++++++++ src/helm/config/model_deployments.yaml | 8 ++++++++ src/helm/config/model_metadata.yaml | 9 +++++++++ 3 files changed, 40 insertions(+) create mode 100644 src/helm/clients/upstage_client.py diff --git a/src/helm/clients/upstage_client.py b/src/helm/clients/upstage_client.py new file mode 100644 index 00000000000..734acc4c3b2 --- /dev/null +++ b/src/helm/clients/upstage_client.py @@ -0,0 +1,23 @@ +from helm.clients.openai_client import OpenAIClient +from helm.common.cache import CacheConfig +from helm.tokenizers.tokenizer import Tokenizer + + +class UpstageChatClient(OpenAIClient): + """Sends request to a Upstage model using a OpenAI-compatible Chat API.""" + + def __init__( + self, + tokenizer: Tokenizer, + tokenizer_name: str, + cache_config: CacheConfig, + api_key: str, + ): + super().__init__( + tokenizer=tokenizer, + tokenizer_name=tokenizer_name, + cache_config=cache_config, + api_key=api_key, + org_id=None, + base_url="https://api.upstage.ai/v1/solar", + ) diff --git a/src/helm/config/model_deployments.yaml b/src/helm/config/model_deployments.yaml index 24ea01bafcd..20cd77afcb7 100644 --- a/src/helm/config/model_deployments.yaml +++ b/src/helm/config/model_deployments.yaml @@ -2751,6 +2751,14 @@ model_deployments: client_spec: class_name: "helm.clients.reka_client.RekaClient" + # Upstage + - name: upstage/solar-pro-241126 + 
model_name: upstage/solar-pro-241126 + tokenizer_name: upstage/solar-pro-preview-instruct + max_sequence_length: 32768 + client_spec: + class_name: "helm.clients.upstage_client.UpstageChatClient" + # Diva Llama - name: huggingface/diva-llama model_name: stanford/diva-llama diff --git a/src/helm/config/model_metadata.yaml b/src/helm/config/model_metadata.yaml index eb611e446f9..da037391813 100644 --- a/src/helm/config/model_metadata.yaml +++ b/src/helm/config/model_metadata.yaml @@ -3146,6 +3146,15 @@ models: release_date: 2024-09-11 tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG] + - name: upstage/solar-pro-241126 + display_name: Solar Pro + display_name: Solar Pro + description: Solar Pro is a LLM designed for instruction-following and processing structured formats like HTML and Markdown. It supports English, Korean, and Japanese and has domain expertise in Finance, Healthcare, and Legal. ([blog](https://www.upstage.ai/blog/press/solar-pro-aws)). + creator_organization_name: Upstage + access: limited + num_parameters: 22000000000 + release_date: 2024-11-26 + tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG] # Writer - name: writer/palmyra-base From 6c358c6d9aaa5580aed8e0674c01bf211b797307 Mon Sep 17 00:00:00 2001 From: JESSADA PRANEE <89401708+JackJessada@users.noreply.github.com> Date: Fri, 6 Dec 2024 01:06:03 +0700 Subject: [PATCH 05/14] Add NECTEC (#3197) --- src/helm/config/model_deployments.yaml | 15 +++++++++++++++ src/helm/config/model_metadata.yaml | 23 ++++++++++++++++++++++- src/helm/config/tokenizer_configs.yaml | 19 ++++++++++++++++--- 3 files changed, 53 insertions(+), 4 deletions(-) diff --git a/src/helm/config/model_deployments.yaml b/src/helm/config/model_deployments.yaml index 20cd77afcb7..95214d068bc 100644 --- a/src/helm/config/model_deployments.yaml +++ b/src/helm/config/model_deployments.yaml @@ -980,7 +980,22 @@ model_deployments: max_sequence_length: 2048 client_spec: class_name: "helm.clients.vision_language.huggingface_vlm_client.HuggingFaceVLMClient" + + ## NECTEC + - name: huggingface/Pathumma-llm-text-1.0.0 + model_name: nectec/Pathumma-llm-text-1.0.0 + tokenizer_name: nectec/Pathumma-llm-text-1.0.0 + max_sequence_length: 8192 + client_spec: + class_name: "helm.clients.huggingface_client.HuggingFaceClient" + - name: huggingface/OpenThaiLLM-Prebuilt-7B + model_name: nectec/OpenThaiLLM-Prebuilt-7B + tokenizer_name: nectec/OpenThaiLLM-Prebuilt-7B + max_sequence_length: 4096 + client_spec: + class_name: "helm.clients.huggingface_client.HuggingFaceClient" + ## KAIST AI - name: huggingface/prometheus-vision-13b-v1.0-hf model_name: kaistai/prometheus-vision-13b-v1.0-hf diff --git a/src/helm/config/model_metadata.yaml b/src/helm/config/model_metadata.yaml index da037391813..c3bb0f54b8d 100644 --- a/src/helm/config/model_metadata.yaml +++ b/src/helm/config/model_metadata.yaml @@ -2144,6 +2144,27 @@ models: tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + # NECTEC + - name: nectec/Pathumma-llm-text-1.0.0 + display_name: Pathumma-llm-text-1.0.0 (7B) + description: Pathumma-llm-text-1.0.0 (7B) is a instruction model from OpenThaiLLM-Prebuilt-7B ([blog](https://medium.com/nectec/pathummallm-v-1-0-0-release-6a098ddfe276)) + creator_organization_name: nectec + access: open + num_parameters: 7620000000 + release_date: 2024-10-28 + tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + + - name: nectec/OpenThaiLLM-Prebuilt-7B + display_name: OpenThaiLLM-Prebuilt-7B (7B) + description: 
OpenThaiLLM-Prebuilt-7B (7B) is a pretrained Thai large language model with 7 billion parameters based on Qwen2.5-7B. + creator_organization_name: nectec + access: open + num_parameters: 7620000000 + release_date: 2024-10-28 + tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG] + + # Neurips - name: neurips/local @@ -3468,4 +3489,4 @@ models: access: open num_parameters: 1380000000 release: 2024-10-21 - tags: [TEXT_MODEL_TAG] \ No newline at end of file + tags: [TEXT_MODEL_TAG] diff --git a/src/helm/config/tokenizer_configs.yaml b/src/helm/config/tokenizer_configs.yaml index e2293562610..ef7dda6d765 100644 --- a/src/helm/config/tokenizer_configs.yaml +++ b/src/helm/config/tokenizer_configs.yaml @@ -456,6 +456,19 @@ tokenizer_configs: end_of_text_token: "" prefix_token: "" + # Nectec + - name: nectec/OpenThaiLLM-Prebuilt-7B + tokenizer_spec: + class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" + end_of_text_token: "<|im_end|>" + prefix_token: "" + + - name: nectec/Pathumma-llm-text-1.0.0 + tokenizer_spec: + class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" + end_of_text_token: "<|im_end|>" + prefix_token: "<|im_start|>" + # Neurips - name: neurips/local tokenizer_spec: @@ -530,7 +543,7 @@ tokenizer_configs: class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" args: pretrained_model_name_or_path: Qwen/Qwen2-72B-Instruct - end_of_text_token: <|im_end|>" + end_of_text_token: "<|im_end|>" prefix_token: "<|im_start|>'" - name: qwen/qwen2.5-7b-instruct @@ -538,7 +551,7 @@ tokenizer_configs: class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" args: pretrained_model_name_or_path: Qwen/Qwen2.5-7B-Instruct - end_of_text_token: <|im_end|>" + end_of_text_token: "<|im_end|>" prefix_token: "<|im_start|>'" - name: qwen/qwen-vl @@ -728,4 +741,4 @@ tokenizer_configs: args: pretrained_model_name_or_path: ibm-granite/granite-3.0-1b-a400m-base end_of_text_token: "" - prefix_token: "" \ No newline at end of file + prefix_token: "" From 416601c6e640e122bc1a1efe3007a646d7c6c536 Mon Sep 17 00:00:00 2001 From: Yifan Mai Date: Fri, 6 Dec 2024 14:11:05 -0800 Subject: [PATCH 06/14] Add Llama 3.3 model (#3202) --- src/helm/config/model_deployments.yaml | 9 +++++++++ src/helm/config/model_metadata.yaml | 9 +++++++++ src/helm/config/tokenizer_configs.yaml | 8 ++++++++ 3 files changed, 26 insertions(+) diff --git a/src/helm/config/model_deployments.yaml b/src/helm/config/model_deployments.yaml index 95214d068bc..10f688461ae 100644 --- a/src/helm/config/model_deployments.yaml +++ b/src/helm/config/model_deployments.yaml @@ -2152,6 +2152,15 @@ model_deployments: args: together_model: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo + - name: together/llama-3.3-70b-instruct-turbo + model_name: meta/llama-3.3-70b-instruct-turbo + tokenizer_name: meta/llama-3.3-70b-instruct + max_sequence_length: 128000 + client_spec: + class_name: "helm.clients.together_client.TogetherChatClient" + args: + together_model: meta-llama/Llama-3.3-70B-Instruct-Turbo + - name: together/llama-guard-7b model_name: meta/llama-guard-7b tokenizer_name: meta-llama/Llama-2-7b-hf diff --git a/src/helm/config/model_metadata.yaml b/src/helm/config/model_metadata.yaml index c3bb0f54b8d..7fc8457beba 100644 --- a/src/helm/config/model_metadata.yaml +++ b/src/helm/config/model_metadata.yaml @@ -1656,6 +1656,15 @@ models: release_date: 2024-09-25 tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + - 
name: meta/llama-3.3-70b-instruct-turbo + display_name: Llama 3.3 Instruct Turbo (70B) + description: Llama 3.3 (70B) is part of the Llama 3 family of dense Transformer models that that natively support multilinguality, coding, reasoning, and tool usage. ([paper](https://ai.meta.com/research/publications/the-llama-3-herd-of-models/)) Turbo is Together's implementation, providing a near negligible difference in quality from the reference implementation with faster performance and lower cost, currently using FP8 quantization. ([blog](https://www.together.ai/blog/llama-31-quality)) + creator_organization_name: Meta + access: open + num_parameters: 70000000000 + release_date: 2024-12-06 + tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + - name: meta/llama-3-8b-chat display_name: Llama 3 Instruct (8B) description: Llama 3 is a family of language models that have been trained on more than 15 trillion tokens, and use Grouped-Query Attention (GQA) for improved inference scalability. It used SFT, rejection sampling, PPO and DPO for post-training. ([paper](https://ai.meta.com/research/publications/the-llama-3-herd-of-models/) diff --git a/src/helm/config/tokenizer_configs.yaml b/src/helm/config/tokenizer_configs.yaml index ef7dda6d765..cbf96457f18 100644 --- a/src/helm/config/tokenizer_configs.yaml +++ b/src/helm/config/tokenizer_configs.yaml @@ -349,6 +349,14 @@ tokenizer_configs: prefix_token: "<|begin_of_text|>" end_of_text_token: "<|eot_id|>" + - name: meta/llama-3.3-70b-instruct + tokenizer_spec: + class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" + args: + pretrained_model_name_or_path: meta-llama/Llama-3.3-70B-Instruct + prefix_token: "<|begin_of_text|>" + end_of_text_token: "<|eot_id|>" + # 01-ai - name: 01-ai/Yi-6B tokenizer_spec: From ff9c7c9a7dd5af8a62d6de2b1a36c70c8f60b690 Mon Sep 17 00:00:00 2001 From: Siya Goel <139517142+siyagoel@users.noreply.github.com> Date: Fri, 6 Dec 2024 15:25:13 -0800 Subject: [PATCH 07/14] Changes for MMLU PRO with COT (#3200) --- .../metrics/chain_of_thought_metric.py | 93 +++++++++ .../benchmark/run_specs/lite_run_specs.py | 188 ++++++++++-------- .../{mmlu_pro.py => mmlu_pro_scenario.py} | 32 ++- .../scenarios/test_mmlu_pro_scenario.py | 2 +- src/helm/benchmark/static/schema_lite_v2.yaml | 27 +-- 5 files changed, 235 insertions(+), 107 deletions(-) create mode 100644 src/helm/benchmark/metrics/chain_of_thought_metric.py rename src/helm/benchmark/scenarios/{mmlu_pro.py => mmlu_pro_scenario.py} (76%) diff --git a/src/helm/benchmark/metrics/chain_of_thought_metric.py b/src/helm/benchmark/metrics/chain_of_thought_metric.py new file mode 100644 index 00000000000..0925fb38527 --- /dev/null +++ b/src/helm/benchmark/metrics/chain_of_thought_metric.py @@ -0,0 +1,93 @@ +import re +from typing import List, Optional + +from helm.benchmark.adaptation.adapter_spec import AdapterSpec +from helm.benchmark.adaptation.request_state import RequestState +from helm.benchmark.metrics.metric import Metric +from helm.benchmark.metrics.metric_name import MetricName +from helm.benchmark.metrics.metric_service import MetricService +from helm.benchmark.metrics.statistic import Stat + + +def extract_answer(output_text: str) -> Optional[str]: + """ + Extracts the answer from the output text using two exact regex patterns. + Returns None if no valid answer is found. + + Args: + output_text (str): The text from which to extract the answer. 
+ + Returns: + Optional[str]: The extracted answer (A-J) if found, otherwise None. + """ + # First regex: Matches "answer is (A-J)" with optional parentheses + match = re.search(r"answer is \(?([A-J])\)?", output_text) + if match: + return match.group(1) + + # Second regex: Matches "[answer: (A-J)]" with optional leading characters like "." + match = re.search(r"\.*\[aA\]nswer:\s*\(?([A-J])\)?", output_text) + if match: + return match.group(1) + + # If neither regex matches, return None + return None + + +class ChainOfThoughtMetric(Metric): + """ + This metric focuses on structured reasoning and the accuracy of extracted answers. + It compares model outputs against correct answers provided in a multiple-choice + format and returns a score indicating the correctness of the generated response. + """ + + def evaluate_generation( + self, + adapter_spec: AdapterSpec, + request_state: RequestState, + metric_service: MetricService, + eval_cache_path: str, + ) -> List[Stat]: + """ + Evaluate the generated output for chain-of-thought reasoning accuracy. + + The method extracts the model's output, determines the correct answer + from the provided references, and compares the two to compute a binary score. + + Args: + adapter_spec (AdapterSpec): Specification of the adapter used for the evaluation. + request_state (RequestState): The state of the current request, including + the input instance, output results, and references. + metric_service (MetricService): A service used to compute metrics if needed. + eval_cache_path (str): Path to the evaluation cache for storing or retrieving data. + + Returns: + List[Stat]: A list containing a single `Stat` object with the correctness + score (1 for correct, 0 for incorrect) under the metric + name "chain_of_thought_correct". + """ + # Assert that completions exist if the result is not None + assert ( + request_state.result is not None and request_state.result.completions + ), "Request state result must have completions." + + # Set output_text if the assertion passes + output_text = request_state.result.completions[0].text + + # Extract the answer using the updated logic + extracted_answer = extract_answer(output_text) + + # Find the correct answer from references by translating index to letter + correct_answer = None + for index, option in enumerate(request_state.instance.references): + if option.is_correct: + correct_answer = chr(65 + index) # Translate index (0 -> A, 1 -> B, etc.) 
+ break + + # Raise an exception if no correct answer is found + if correct_answer is None: + raise ValueError(f"No correct answer found for instance ID {request_state.instance.id}") + + # Compare extracted answer with the correct answer and compute the score + score = 1 if extracted_answer == correct_answer else 0 + return [Stat(MetricName("chain_of_thought_correctness")).add(score)] diff --git a/src/helm/benchmark/run_specs/lite_run_specs.py b/src/helm/benchmark/run_specs/lite_run_specs.py index 38fbd4ecaa1..c995aa3e36c 100644 --- a/src/helm/benchmark/run_specs/lite_run_specs.py +++ b/src/helm/benchmark/run_specs/lite_run_specs.py @@ -21,11 +21,11 @@ get_generative_harms_metric_specs, get_generic_metric_specs, get_open_ended_generation_metric_specs, - MetricSpec, ) from helm.benchmark.run_spec import RunSpec, run_spec_function from helm.benchmark.runner import get_benchmark_output_path from helm.benchmark.scenarios.scenario import ScenarioSpec, get_scenario_cache_path +from helm.benchmark.metrics.metric import MetricSpec @run_spec_function("narrative_qa") @@ -137,25 +137,59 @@ def get_mmlu_spec(subject: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> Ru @run_spec_function("mmlu_pro") -def get_mmlu_pro_spec(subject: str) -> RunSpec: - scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.mmlu_pro.MMLUProScenario", args={"subject": subject} - ) +def get_mmlu_pro_spec(subject: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec: + # Convert to bools and remove the str versions + use_chain_of_thought_bool: bool = use_chain_of_thought == "True" + use_few_shot_bool: bool = use_few_shot == "True" + del use_chain_of_thought + del use_few_shot - adapter_spec = get_multiple_choice_adapter_spec( - method=ADAPT_MULTIPLE_CHOICE_JOINT, - instructions=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}.", - input_noun="Question", - output_noun="Answer", + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.mmlu_pro_scenario.MMLUProScenario", args={"subject": subject} ) + max_train_instance_num = 5 if use_few_shot_bool else 0 - return RunSpec( - name=f"mmlu_pro:subject={subject}", - scenario_spec=scenario_spec, - adapter_spec=adapter_spec, - metric_specs=get_exact_match_metric_specs(), - groups=["mmlu_pro"], - ) + if use_chain_of_thought_bool: + adapter_spec = AdapterSpec( + method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT, + max_train_instances=max_train_instance_num, + max_tokens=1000, + input_prefix="What is the correct answer to this question: ", + input_suffix="\nChoices:\n", + output_prefix="", + global_suffix=( + "Let’s think step by step. Based on your reasoning, what is the single, " + "most likely answer choice? Format your response as follows: " + '"The correct answer is (insert answer here)".' 
+ ), + ) + return RunSpec( + name=f"mmlu_pro:subset={subject},use_chain_of_thought={use_chain_of_thought_bool}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_exact_match_metric_specs() + + [ + MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}), + ], + groups=["mmlu_pro"], + ) + else: + adapter_spec = AdapterSpec( + method=ADAPT_MULTIPLE_CHOICE_JOINT, + max_train_instances=max_train_instance_num, + max_tokens=1000, + input_prefix="What is the correct answer to this question: ", + input_suffix="\nChoices:\n", + output_prefix="", + global_suffix=("Format your response as follows: " '"The correct answer is (insert answer here)".'), + ) + return RunSpec( + name=f"mmlu_pro:subset={subject},use_chain_of_thought={use_chain_of_thought_bool}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_exact_match_metric_specs(), + groups=["mmlu_pro"], + ) @run_spec_function("gsm") @@ -344,79 +378,57 @@ def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot ) max_train_instance_num = 5 if use_few_shot_bool else 0 - if use_few_shot_bool: - if use_chain_of_thought_bool: - adapter_spec = get_multiple_choice_adapter_spec( - method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT, - max_tokens=1000, # following original repo - max_train_instances=max_train_instance_num, - instructions=( - "Here are some example questions from experts. " - "An explanation is given before the final answer. " - "Answer the final question yourself, giving your reasoning beforehand." - ), - input_noun="Question", - input_suffix="\nChoices: \n", - reference_prefix="(A) ", - chain_of_thought_prefix="Let's think step by step: ", - chain_of_thought_suffix="The correct answer is ", - output_noun="", # will be overwritten with output_prefix - output_prefix="", - global_suffix=( - "Give step by step reasoning before you answer, and when you’re ready to answer, " - 'please use the format "The correct answer is (insert answer here)":' - ), - ) - else: - adapter_spec = get_multiple_choice_adapter_spec( - method=ADAPT_MULTIPLE_CHOICE_JOINT, - max_train_instances=max_train_instance_num, - instructions=( - "Here are some example questions from experts. " - "An explanation is given before the final answer. " - "Answer the final question yourself, giving your reasoning beforehand." - ), - input_noun="Question", - input_suffix="\nChoices: \n", - reference_prefix="(A) ", - output_noun="", # will be overwritten with output_prefix - output_prefix="The correct answer is ", - ) + if use_chain_of_thought_bool: + adapter_spec = get_multiple_choice_adapter_spec( + method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT, + max_tokens=1000, # following original repo + max_train_instances=max_train_instance_num, + instructions=( + "Here are some example questions from experts. " + "An explanation is given before the final answer. " + "Answer the final question yourself, giving your reasoning beforehand." 
+ ), + input_noun="Question", + input_suffix="\nChoices: \n", + reference_prefix="(A) ", + chain_of_thought_prefix="Let's think step by step: ", + chain_of_thought_suffix="The correct answer is ", + output_noun="", # will be overwritten with output_prefix + output_prefix="", + ) + return RunSpec( + name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_exact_match_metric_specs() + + [ + MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}), + ], + groups=["gpqa"], + ) else: - if use_chain_of_thought_bool: - adapter_spec = AdapterSpec( - method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT, - max_train_instances=max_train_instance_num, - max_tokens=1000, - input_prefix="What is the correct answer to this question: ", - input_suffix="\nChoices:\n", - output_prefix="", - reference_prefix="(A) ", - global_suffix=( - "Let’s think step by step. Based on your reasoning, what is the single, " - "most likely answer choice? Format your response as follows: " - '"The correct answer is (insert answer here)".' - ), - ) - else: - adapter_spec = AdapterSpec( - method=ADAPT_MULTIPLE_CHOICE_JOINT, - max_train_instances=max_train_instance_num, - max_tokens=1000, - input_prefix="What is the correct answer to this question: ", - input_suffix="\nChoices:\n", - output_prefix="", - reference_prefix="(A) ", - global_suffix=("Format your response as follows: " '"The correct answer is (insert answer here)".'), - ) + adapter_spec = get_multiple_choice_adapter_spec( + method=ADAPT_MULTIPLE_CHOICE_JOINT, + max_train_instances=max_train_instance_num, + instructions=( + "Here are some example questions from experts. " + "An explanation is given before the final answer. " + "Answer the final question yourself, giving your reasoning beforehand." 
+ ), + input_noun="Question", + input_suffix="\nChoices: \n", + reference_prefix="(A) ", + output_noun="", # will be overwritten with output_prefix + output_prefix="The correct answer is ", + ) - return RunSpec( - name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}", - scenario_spec=scenario_spec, - adapter_spec=adapter_spec, - metric_specs=get_exact_match_metric_specs(), # TODO: update this after cot metric is ready - groups=["gpqa"], - ) + return RunSpec( + name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_exact_match_metric_specs(), + groups=["gpqa"], + ) @run_spec_function("ifeval") diff --git a/src/helm/benchmark/scenarios/mmlu_pro.py b/src/helm/benchmark/scenarios/mmlu_pro_scenario.py similarity index 76% rename from src/helm/benchmark/scenarios/mmlu_pro.py rename to src/helm/benchmark/scenarios/mmlu_pro_scenario.py index a091387dc22..5d08d4f9d16 100644 --- a/src/helm/benchmark/scenarios/mmlu_pro.py +++ b/src/helm/benchmark/scenarios/mmlu_pro_scenario.py @@ -1,8 +1,17 @@ from typing import Dict, List -from datasets import load_dataset +from datasets import Dataset, load_dataset from helm.common.hierarchical_logger import hlog -from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output +from helm.benchmark.scenarios.scenario import ( + Scenario, + Instance, + Reference, + TRAIN_SPLIT, + TEST_SPLIT, + CORRECT_TAG, + Input, + Output, +) class MMLUProScenario(Scenario): @@ -33,7 +42,14 @@ def __init__(self, subject: str): super().__init__() self.subject: str = subject - def process_csv(self, data, split: str) -> List[Instance]: + def process_dataset(self, data: Dataset, split: str) -> List[Instance]: + """ + Process the dataset to create instances. + + :param data: Hugging Face `Dataset` containing the data for a specific split. + :param split: The data split (e.g., "train", "test"). + :return: A list of processed `Instance` objects. + """ instances: List[Instance] = [] hlog(f"Processing data for {split} split") for row in data: @@ -55,8 +71,14 @@ def answer_to_reference(answer: str) -> Reference: return instances def get_instances(self, output_path: str) -> List[Instance]: + """ + Load and process the MMLU-Pro dataset to create instances. + + :param output_path: Path to save or output the processed instances. + :return: A list of all processed `Instance` objects. 
+ """ # Load the MMLU-Pro dataset from Hugging Face - dataset = load_dataset("TIGER-Lab/MMLU-Pro") + dataset = load_dataset("TIGER-Lab/MMLU-Pro", revision="3373e0b") # Process all the instances instances: List[Instance] = [] @@ -66,6 +88,6 @@ def get_instances(self, output_path: str) -> List[Instance]: } for hf_split, split in splits.items(): data = dataset[hf_split].filter(lambda x: x["category"] == self.subject) - instances.extend(self.process_csv(data, split)) + instances.extend(self.process_dataset(data, split)) return instances diff --git a/src/helm/benchmark/scenarios/test_mmlu_pro_scenario.py b/src/helm/benchmark/scenarios/test_mmlu_pro_scenario.py index 8c1dc87a8c0..12ac71f0e01 100644 --- a/src/helm/benchmark/scenarios/test_mmlu_pro_scenario.py +++ b/src/helm/benchmark/scenarios/test_mmlu_pro_scenario.py @@ -1,7 +1,7 @@ import pytest from tempfile import TemporaryDirectory -from helm.benchmark.scenarios.mmlu_pro import MMLUProScenario +from helm.benchmark.scenarios.mmlu_pro_scenario import MMLUProScenario from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input, Output, Reference diff --git a/src/helm/benchmark/static/schema_lite_v2.yaml b/src/helm/benchmark/static/schema_lite_v2.yaml index 17d6fd58346..0d0026e3a30 100644 --- a/src/helm/benchmark/static/schema_lite_v2.yaml +++ b/src/helm/benchmark/static/schema_lite_v2.yaml @@ -93,6 +93,11 @@ metrics: short_display_name: IFEval Strict Acc description: Fraction of instructions in the instance that are correctly followed. lower_is_better: false + - name: chain_of_thought_correctness + display_name: COT correct + short_display_name: COT correct + description: Allows to do evaluation using chain of thought for mmlu pro and gpqa. + lower_is_better: false ############################################################ perturbations: [] @@ -154,32 +159,28 @@ run_groups: when: "?" language: English - - name: gpqa - display_name: GPQA - description: GPQA + - name: ifeval + display_name: IFEval + description: IFEval metric_groups: - accuracy - efficiency - general_information environment: - main_name: exact_match # non-CoT + main_name: ifeval_strict_accuracy main_split: test taxonomy: task: "?" - what: "?" - who: "?" - when: "?" - language: English - - - name: ifeval - display_name: IFEval - description: IFEval + + - name: gpqa + display_name: GPQA + description: GPQA metric_groups: - accuracy - efficiency - general_information environment: - main_name: ifeval_strict_accuracy + main_name: chain_of_thought_correct # non-CoT main_split: test taxonomy: task: "?" 
From b8a140f865c01510e4091c404c5024512aaa17ab Mon Sep 17 00:00:00 2001 From: Thallyson Alves Date: Fri, 6 Dec 2024 21:43:46 -0300 Subject: [PATCH 08/14] =?UTF-8?q?Adding=20ENEM=20Challenge=20Scenario=20&?= =?UTF-8?q?=20Maritaca=20AI=20model=20(Sabi=C3=A1=207B)=20(#3185)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Yifan Mai --- .../run_specs/enem_challenge_specs.py | 31 ++++ .../scenarios/enem_challenge_scenario.py | 58 +++++++ .../scenarios/test_enem_challenge_scenario.py | 53 +++++++ .../static/schema_enem_challenge.yaml | 146 ++++++++++++++++++ src/helm/config/model_deployments.yaml | 9 ++ src/helm/config/model_metadata.yaml | 10 ++ src/helm/config/tokenizer_configs.yaml | 8 + 7 files changed, 315 insertions(+) create mode 100644 src/helm/benchmark/run_specs/enem_challenge_specs.py create mode 100644 src/helm/benchmark/scenarios/enem_challenge_scenario.py create mode 100644 src/helm/benchmark/scenarios/test_enem_challenge_scenario.py create mode 100644 src/helm/benchmark/static/schema_enem_challenge.yaml diff --git a/src/helm/benchmark/run_specs/enem_challenge_specs.py b/src/helm/benchmark/run_specs/enem_challenge_specs.py new file mode 100644 index 00000000000..a06cf2ecee4 --- /dev/null +++ b/src/helm/benchmark/run_specs/enem_challenge_specs.py @@ -0,0 +1,31 @@ +from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT +from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec +from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs +from helm.benchmark.run_spec import RunSpec, run_spec_function +from helm.benchmark.scenarios.scenario import ScenarioSpec + + +@run_spec_function("enem_challenge") +def get_enem_spec() -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.enem_challenge_scenario.ENEMChallengeScenario", args={} + ) + + adapter_spec = get_multiple_choice_adapter_spec( + method=ADAPT_MULTIPLE_CHOICE_JOINT, + instructions="Dê uma resposta selecionando uma letra entre as opções fornecidas. " + "Se as opções forem A, B, C, D e E, " + "sua resposta deve consistir em uma única letra que corresponde a resposta correta.\n" + "Exemplo: Qual é a capital da França?\nA. Londres\nB. Paris\nC. Roma\nD. Berlim\nE. Sydney\n" + "Resposta: B", + input_noun="Pergunta", + output_noun="Resposta", + ) + + return RunSpec( + name="enem_challenge", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_exact_match_metric_specs(), + groups=["enem_challenge"], + ) diff --git a/src/helm/benchmark/scenarios/enem_challenge_scenario.py b/src/helm/benchmark/scenarios/enem_challenge_scenario.py new file mode 100644 index 00000000000..d05b2951868 --- /dev/null +++ b/src/helm/benchmark/scenarios/enem_challenge_scenario.py @@ -0,0 +1,58 @@ +from typing import List, Any +from pathlib import Path +from datasets import load_dataset + +from helm.benchmark.scenarios.scenario import ( + Scenario, + Instance, + Reference, + CORRECT_TAG, + TEST_SPLIT, + Input, + Output, +) + + +class ENEMChallengeScenario(Scenario): + """ + The Exame Nacional do Ensino Médio (ENEM) is an advanced High-School level exam widely applied + every year by the Brazilian government to students that wish to undertake a University degree. + + The questions are about all types of intelectual fields and they are divided into four groups + that are named as: Humanities, Languages, Sciences and Mathematics. 
+ + This scenario is based on the exams that were applied throughout the years of 2009 and 2023. + + The dataset can be found in this link: https://huggingface.co/datasets/eduagarcia/enem_challenge + """ + + name = "enem_challenge" + description = "ENEM Challenge dataset" + tags = ["knowledge", "multiple_choice", "pt-br"] + + def get_instances(self, output_path: str) -> List[Instance]: + # Download the raw data and read all the dialogues + dataset: Any + # Read all the instances + instances: List[Instance] = [] + cache_dir = str(Path(output_path) / "data") + + dataset = load_dataset("eduagarcia/enem_challenge", cache_dir=cache_dir) + for example in dataset["train"]: + question = example["question"] + choices = example["choices"] + answer = example["answerKey"] + # Skipping every canceled question! + if answer == "ANULADO": + continue + answers_dict = dict(zip(choices["label"], choices["text"])) + correct_answer = answers_dict[answer] + + def answer_to_reference(answer: str) -> Reference: + return Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else []) + + instance = Instance( + input=Input(text=question), split=TEST_SPLIT, references=list(map(answer_to_reference, choices["text"])) + ) + instances.append(instance) + return instances diff --git a/src/helm/benchmark/scenarios/test_enem_challenge_scenario.py b/src/helm/benchmark/scenarios/test_enem_challenge_scenario.py new file mode 100644 index 00000000000..db2fc0cff8f --- /dev/null +++ b/src/helm/benchmark/scenarios/test_enem_challenge_scenario.py @@ -0,0 +1,53 @@ +import pytest +from tempfile import TemporaryDirectory + +from helm.benchmark.scenarios.enem_challenge_scenario import ENEMChallengeScenario +from helm.benchmark.scenarios.scenario import TEST_SPLIT, CORRECT_TAG, Output, Reference + + +@pytest.mark.scenarios +def test_enem_challenge_scenario(): + enem_scenario = ENEMChallengeScenario() + with TemporaryDirectory() as tmpdir: + instances = enem_scenario.get_instances(tmpdir) + assert len(instances) == 1431 + assert instances[0].split == TEST_SPLIT + + assert instances[0].input.text.startswith( + "A atmosfera terrestre é composta pelos gases nitrogênio (N2) e oxigênio (O2)" + ) + assert len(instances[0].input.text) == 1163 + + assert instances[0].references == [ + Reference( + output=Output( + text="reduzir o calor irradiado pela Terra mediante a substituição da produção primária pela industrialização refrigerada. " # noqa: E501 + ), + tags=[], + ), + Reference( + output=Output( + text="promover a queima da biomassa vegetal, responsável pelo aumento do efeito estufa devido à produção de CH4. " # noqa: E501 + ), + tags=[], + ), + Reference( + output=Output( + text="reduzir o desmatamento, mantendo-se, assim, o potencial da vegetação em absorver o CO2 da atmosfera. " # noqa: E501 + ), + tags=[CORRECT_TAG], + ), + Reference( + output=Output( + text="aumentar a concentração atmosférica de H2O, molécula capaz de absorver grande quantidade de calor. " # noqa: E501 + ), + tags=[], + ), + Reference( + output=Output( + text="remover moléculas orgânicas polares da atmosfera, diminuindo a capacidade delas de reter calor. 
" # noqa: E501 + ), + tags=[], + ), + ] + assert instances[0].references[2].is_correct diff --git a/src/helm/benchmark/static/schema_enem_challenge.yaml b/src/helm/benchmark/static/schema_enem_challenge.yaml new file mode 100644 index 00000000000..f329a2d104d --- /dev/null +++ b/src/helm/benchmark/static/schema_enem_challenge.yaml @@ -0,0 +1,146 @@ +############################################################ +metrics: + # Infrastructure metrics: + - name: num_perplexity_tokens + display_name: '# tokens' + description: Average number of tokens in the predicted output (for language modeling, the input too). + - name: num_bytes + display_name: '# bytes' + description: Average number of bytes in the predicted output (for language modeling, the input too). + + - name: num_references + display_name: '# ref' + description: Number of references. + - name: num_train_trials + display_name: '# trials' + description: Number of trials, where in each trial we choose an independent, random set of training instances. + - name: estimated_num_tokens_cost + display_name: 'cost' + description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request. + - name: num_prompt_tokens + display_name: '# prompt tokens' + description: Number of tokens in the prompt. + - name: num_prompt_characters + display_name: '# prompt chars' + description: Number of characters in the prompt. + - name: num_completion_tokens + display_name: '# completion tokens' + description: Actual number of completion tokens (over all completions). + - name: num_output_tokens + display_name: '# output tokens' + description: Actual number of output tokens. + - name: max_num_output_tokens + display_name: 'Max output tokens' + description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences). + - name: num_requests + display_name: '# requests' + description: Number of distinct API requests. + - name: num_instances + display_name: '# eval' + description: Number of evaluation instances. + - name: num_train_instances + display_name: '# train' + description: Number of training instances (e.g., in-context examples). + - name: prompt_truncated + display_name: truncated + description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples). + - name: finish_reason_length + display_name: finish b/c length + description: Fraction of instances where the the output was terminated because of the max tokens limit. + - name: finish_reason_stop + display_name: finish b/c stop + description: Fraction of instances where the the output was terminated because of the stop sequences. + - name: finish_reason_endoftext + display_name: finish b/c endoftext + description: Fraction of instances where the the output was terminated because the end of text token was generated. + - name: finish_reason_unknown + display_name: finish b/c unknown + description: Fraction of instances where the the output was terminated for unknown reasons. + - name: num_completions + display_name: '# completions' + description: Number of completions. + - name: predicted_index + display_name: Predicted index + description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice). + + # Accuracy metrics: + - name: exact_match + display_name: Exact match + short_display_name: EM + description: Fraction of instances that the predicted output matches a correct reference exactly. 
+ lower_is_better: false + - name: quasi_exact_match + display_name: Quasi-exact match + short_display_name: EM + description: Fraction of instances that the predicted output matches a correct reference up to light processing. + lower_is_better: false + - name: prefix_exact_match + display_name: Prefix exact match + short_display_name: PEM + description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly. + lower_is_better: false + - name: quasi_prefix_exact_match + # TODO: should call this prefix_quasi_exact_match + display_name: Prefix quasi-exact match + short_display_name: PEM + description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing. + lower_is_better: false + + +############################################################ +perturbations: [] + +############################################################ +metric_groups: + - name: accuracy + display_name: Accuracy + metrics: + - name: ${main_name} + split: ${main_split} + + # - name: efficiency + # display_name: Efficiency + # metrics: + # - name: inference_runtime + # split: ${main_split} + + - name: general_information + display_name: General information + hide_win_rates: true + metrics: + - name: num_instances + split: ${main_split} + - name: num_train_instances + split: ${main_split} + - name: prompt_truncated + split: ${main_split} + - name: num_prompt_tokens + split: ${main_split} + - name: num_output_tokens + split: ${main_split} + +############################################################ +run_groups: + - name: core_scenarios + display_name: Core Scenarios + description: Core Scenarios + category: All scenarios + subgroups: + - enem_challenge + + - name: enem_challenge + display_name: ENEM Challenge + description: ENEM Challenge + metric_groups: + - accuracy + # - efficiency + - general_information + environment: + main_name: exact_match + main_split: test + taxonomy: + task: "multiple-choice question answering" + what: "general academic subjects" + who: "brazilian ministry of education" + when: "between 2009 and 2023" + language: Portuguese diff --git a/src/helm/config/model_deployments.yaml b/src/helm/config/model_deployments.yaml index 10f688461ae..bb9aed655f5 100644 --- a/src/helm/config/model_deployments.yaml +++ b/src/helm/config/model_deployments.yaml @@ -2872,3 +2872,12 @@ model_deployments: class_name: "helm.clients.huggingface_client.HuggingFaceClient" args: pretrained_model_name_or_path: ibm-granite/granite-3.0-1b-a400m-base + + - name: huggingface/sabia-7b + model_name: maritaca-ai/sabia-7b + tokenizer_name: maritaca-ai/sabia-7b + max_sequence_length: 2048 + client_spec: + class_name: "helm.clients.huggingface_client.HuggingFaceClient" + args: + pretrained_model_name_or_path: maritaca-ai/sabia-7b diff --git a/src/helm/config/model_metadata.yaml b/src/helm/config/model_metadata.yaml index 7fc8457beba..84227602ca2 100644 --- a/src/helm/config/model_metadata.yaml +++ b/src/helm/config/model_metadata.yaml @@ -3499,3 +3499,13 @@ models: num_parameters: 1380000000 release: 2024-10-21 tags: [TEXT_MODEL_TAG] + + - name: maritaca-ai/sabia-7b + display_name: Sabia 7B + description: Sabia 7B + creator_organization_name: MARITACA-AI + access: open + num_parameters: 6740000000 + release_date: 2023-11-08 + tags: [TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + diff --git a/src/helm/config/tokenizer_configs.yaml b/src/helm/config/tokenizer_configs.yaml index cbf96457f18..c851d58c5c8 100644 --- 
a/src/helm/config/tokenizer_configs.yaml +++ b/src/helm/config/tokenizer_configs.yaml @@ -750,3 +750,11 @@ tokenizer_configs: pretrained_model_name_or_path: ibm-granite/granite-3.0-1b-a400m-base end_of_text_token: "" prefix_token: "" + + - name: maritaca-ai/sabia-7b + tokenizer_spec: + class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" + args: + pretrained_model_name_or_path: maritaca-ai/sabia-7b + end_of_text_token: "" + prefix_token: "" \ No newline at end of file From 709336e0291afd5f0d38cf97361ca05805ab6e77 Mon Sep 17 00:00:00 2001 From: Yifan Mai Date: Fri, 6 Dec 2024 17:45:15 -0800 Subject: [PATCH 09/14] Release Lite and MMLU v1.11.0 leaderboards (#3204) --- helm-frontend/project_metadata.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/helm-frontend/project_metadata.json b/helm-frontend/project_metadata.json index dfaf297d16f..2bcd9a82f50 100644 --- a/helm-frontend/project_metadata.json +++ b/helm-frontend/project_metadata.json @@ -3,7 +3,7 @@ "title": "Lite", "description": "Lightweight, broad evaluation of the capabilities of language models using in-context learning", "id": "lite", - "releases": ["v1.10.0", "v1.9.0", "v1.8.0", "v1.7.0", "v1.6.0", "v1.5.0", "v1.4.0", "v1.3.0", "v1.2.0", "v1.1.0", "v1.0.0"] + "releases": ["v1.11.0", "v1.10.0", "v1.9.0", "v1.8.0", "v1.7.0", "v1.6.0", "v1.5.0", "v1.4.0", "v1.3.0", "v1.2.0", "v1.1.0", "v1.0.0"] }, { "title": "Classic", @@ -27,7 +27,7 @@ "title": "MMLU", "description": "Massive Multitask Language Understanding (MMLU) evaluations using standardized prompts", "id": "mmlu", - "releases": ["v1.10.0", "v1.9.0", "v1.8.0", "v1.7.0", "v1.6.0", "v1.5.0", "v1.4.0", "v1.3.0", "v1.2.0", "v1.1.0", "v1.0.0"] + "releases": ["v1.11.0", "v1.10.0", "v1.9.0", "v1.8.0", "v1.7.0", "v1.6.0", "v1.5.0", "v1.4.0", "v1.3.0", "v1.2.0", "v1.1.0", "v1.0.0"] }, { "title": "VHELM", From c9065e190c356c1f21ac4111de6df66d04f2e5a4 Mon Sep 17 00:00:00 2001 From: Yifan Mai Date: Fri, 6 Dec 2024 21:33:54 -0800 Subject: [PATCH 10/14] Rename Multimodality section to Papers in the documentation (#3203) --- mkdocs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs.yml b/mkdocs.yml index 167f7536a02..c4074561594 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -51,7 +51,7 @@ nav: - get_helm_rank.md - benchmark.md - huggingface_models.md - - Multimodality: + - Papers: - heim.md - vhelm.md - Reference: From d7a61c603b05da1627e467325a7c10dd4ba810ed Mon Sep 17 00:00:00 2001 From: Yifan Mai Date: Mon, 9 Dec 2024 17:21:17 -0800 Subject: [PATCH 11/14] Shorten run spec names for Unitxt runs (#3205) --- src/helm/benchmark/run_specs/unitxt_run_specs.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/helm/benchmark/run_specs/unitxt_run_specs.py b/src/helm/benchmark/run_specs/unitxt_run_specs.py index a7aebf7b811..d6620c42190 100644 --- a/src/helm/benchmark/run_specs/unitxt_run_specs.py +++ b/src/helm/benchmark/run_specs/unitxt_run_specs.py @@ -1,3 +1,5 @@ +import os + from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec from helm.benchmark.metrics.metric import MetricSpec from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs @@ -10,7 +12,12 @@ def get_unitxt_spec(**kwargs) -> RunSpec: card = kwargs.get("card") if not card: raise Exception("Unitxt card must be specified") - name_suffix = ",".join([f"{key}={value}" for key, value in kwargs.items()]) + if os.environ.get("HELM_UNITXT_SHORTEN_RUN_SPEC_NAMES", "").lower() == "true": + name_suffix 
= ",".join( + [f"{key}={value}" for key, value in kwargs.items() if key not in ["template_card_index", "loader_limit"]] + ) + else: + name_suffix = ",".join([f"{key}={value}" for key, value in kwargs.items()]) name = f"unitxt:{name_suffix}" scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.unitxt_scenario.UnitxtScenario", args=kwargs) adapter_spec = AdapterSpec( From 98d7d0b6c932f42353baddbbd05f2e687ed45344 Mon Sep 17 00:00:00 2001 From: Leonardo Schettini Date: Wed, 11 Dec 2024 17:02:57 +0100 Subject: [PATCH 12/14] feat: implement medcalc bench scenario, metrics and specs --- .../metrics/med_calc_bench_metrics.py | 172 ++++++++++++++++++ .../run_specs/med_calc_bench_specs.py | 54 ++++++ .../scenarios/med_calc_bench_scenario.py | 72 ++++++++ 3 files changed, 298 insertions(+) create mode 100644 src/helm/benchmark/metrics/med_calc_bench_metrics.py create mode 100644 src/helm/benchmark/run_specs/med_calc_bench_specs.py create mode 100644 src/helm/benchmark/scenarios/med_calc_bench_scenario.py diff --git a/src/helm/benchmark/metrics/med_calc_bench_metrics.py b/src/helm/benchmark/metrics/med_calc_bench_metrics.py new file mode 100644 index 00000000000..a44306d2ec7 --- /dev/null +++ b/src/helm/benchmark/metrics/med_calc_bench_metrics.py @@ -0,0 +1,172 @@ +import re +from datetime import datetime +from typing import List + +from helm.benchmark.adaptation.adapter_spec import AdapterSpec +from helm.benchmark.adaptation.request_state import RequestState +from helm.benchmark.metrics.metric import Metric +from helm.benchmark.metrics.metric_name import MetricName +from helm.benchmark.metrics.metric_service import MetricService +from helm.benchmark.metrics.statistic import Stat + + +class MedCalcBenchMetric(Metric): + def evaluate_generation( + self, + adapter_spec: AdapterSpec, + request_state: RequestState, + metric_service: MetricService, + eval_cache_path: str, + ) -> List[Stat]: + """Metric for MedCalc-Bench dataset. + + Original implementation: + https://github.com/ncbi-nlp/MedCalc-Bench/blob/048ba77dbe332e9190935e4a30965bff444b940e/evaluation/evaluate.py#L11 + """ + assert request_state.instance.extra_data, ( + "Could not find `extra_data` in the request state. " + "Both `lower_limit` and `upper_limit` are required for this metric." + ) + + assert len(request_state.result.completions) == 1, ( + f"Found a total of {len(request_state.result.completions)} completions. " + "Only one was expected" + ) + + final_answer = request_state.result.completions[0].text.strip().lower().split("final answer:")[-1].strip() + + try: + correctness = self.medcalc_bench_range_metric_calculation( + answer=final_answer, + ground_truth=request_state.instance.extra_data["ground_truth"], + calid=int(request_state.instance.extra_data["calculator_id"]), + upper_limit=request_state.instance.extra_data["upper_limit"], + lower_limit=request_state.instance.extra_data["lower_limit"], + ) + except ValueError: + raise ValueError( + "Failed to calculate the correctess of the output for a MedCalc-Bench instance." + ) + + stat = Stat(MetricName("medcalc_bench_metric")) + stat.add(int(correctness)) + + return [stat] + + def medcalc_bench_metric_calculation( + self, + answer: str, + ground_truth: str, + calid: int, + upper_limit: str, + lower_limit: str, + ) -> int: + """Calculate the metric for MedCalc-Bench dataset. 
+ + This method is basically a copy of the original implementation of this metric: + https://github.com/ncbi-nlp/MedCalc-Bench/blob/048ba77dbe332e9190935e4a30965bff444b940e/evaluation/evaluate.py#L11 + + Credits to the original authors: https://github.com/ncbi-nlp/MedCalc-Bench. + """ + if calid in [13, 68]: + # Output Type: date + + if datetime.strptime(answer, "%m/%d/%Y").strftime( + "%-m/%-d/%Y" + ) == datetime.strptime(ground_truth, "%m/%d/%Y").strftime("%-m/%-d/%Y"): + correctness = 1 + else: + correctness = 0 + elif calid in [69]: + # Output Type: integer (A, B) + match = re.search( + r"\(?[\"\']?(\d+)\s*(weeks?)?[\"\']?,?\s*[\"\']?(\d+)\s*(days?)?[\"\']?\s*\)?", + ground_truth, + ) + ground_truth = f"({match.group(1)}, {match.group(3)})" + match = re.search( + r"\(?[\"\']?(\d+)\s*(weeks?)?[\"\']?,?\s*[\"\']?(\d+)\s*(days?)?[\"\']?\s*\)?", + answer, + ) + if match: + weeks = match.group(1) + days = match.group(3) + answer = f"({weeks}, {days})" + if eval(answer) == eval(ground_truth): + correctness = 1 + else: + correctness = 0 + else: + correctness = 0 + elif calid in [ + 4, + 15, + 16, + 17, + 18, + 20, + 21, + 25, + 27, + 28, + 29, + 32, + 33, + 36, + 43, + 45, + 48, + 51, + 69, + ]: + # Output Type: integer A + answer = round(eval(answer)) + if answer == eval(ground_truth): + correctness = 1 + else: + correctness = 0 + elif calid in [ + 2, + 3, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 19, + 22, + 23, + 24, + 26, + 30, + 31, + 38, + 39, + 40, + 44, + 46, + 49, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + ]: + # Output Type: decimal + answer = eval(answer) + if answer >= eval(lower_limit) and answer <= eval(upper_limit): + correctness = 1 + else: + correctness = 0 + else: + raise ValueError(f"Unknown calculator ID: {calid}") + return correctness diff --git a/src/helm/benchmark/run_specs/med_calc_bench_specs.py b/src/helm/benchmark/run_specs/med_calc_bench_specs.py new file mode 100644 index 00000000000..fb3f5d58717 --- /dev/null +++ b/src/helm/benchmark/run_specs/med_calc_bench_specs.py @@ -0,0 +1,54 @@ +from typing import List + +from helm.benchmark.adaptation.common_adapter_specs import get_generation_adapter_spec +from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs +from helm.benchmark.metrics.metric import MetricSpec +from helm.benchmark.run_spec import RunSpec, run_spec_function +from helm.benchmark.scenarios.scenario import ScenarioSpec + + +@run_spec_function("med_calc_bench_zero_shot_cot") +def get_med_calc_bench_spec() -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.med_calc_bench_scenario.MedCalcBenchScenario", + args={}, + ) + + adapter_spec = get_generation_adapter_spec( + instructions=_get_zero_shot_cot_instructions(), + input_noun="Patient Note", + output_noun="Calculated Value", + max_tokens=50, + ) + + metric_specs: List[MetricSpec] = [ + MetricSpec( + class_name="helm.benchmark.metrics.med_calc_bench_metrics.MedCalcBenchMetric", + args={}, + ) + ] + get_basic_metric_specs([]) + + return RunSpec( + name="med_calc_bench", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=metric_specs, + groups=["med_calc_bench"], + ) + + +def _get_zero_shot_cot_instructions() -> str: + """Generate instructions for the MedCalcBench scenario. 
+ + This function is inspired on the system prompt definition in the original code: + https://github.com/ncbi-nlp/MedCalc-Bench/blob/048ba77dbe332e9190935e4a30965bff444b940e/evaluation/run.py#L16 + + Credits to the original authors: https://github.com/ncbi-nlp/MedCalc-Bench. + """ + + return ( + "You are a helpful assistant for calculating a score for a given patient note. " + "Please think step-by-step to solve the question and then generate the required score. " + "Your output should contain the step by step thinking and the final answer, which is a short and direct answer to the question. " + 'Before giving the final answer, write "Final Answer: " followed by the answer.' + ) diff --git a/src/helm/benchmark/scenarios/med_calc_bench_scenario.py b/src/helm/benchmark/scenarios/med_calc_bench_scenario.py new file mode 100644 index 00000000000..7221319f20d --- /dev/null +++ b/src/helm/benchmark/scenarios/med_calc_bench_scenario.py @@ -0,0 +1,72 @@ +import os +from typing import List, Dict +import json +from helm.common.general import ensure_file_downloaded +from helm.common.constants import TRAIN_SPLIT, TEST_SPLIT, CORRECT_TAG +from helm.benchmark.scenarios.scenario import Scenario, Instance, Reference, Input, Output + +class MedCalcBenchScenario(Scenario): + """ + MedCalcBench scenario: Processes a medical calculation dataset with explanations. + + Each record in the dataset has: + - Row Number + - Calculator ID + - Calculator Name + - Category + - Output Type + - Note ID + - Note Type + - Question + - Ground Truth Explanation + - Patient Note + - Relevant Entities + - Lower Limit + - Upper Limit + - Ground Truth Answer + + The output is formatted as: + "The answer is . Steps: " + """ + + # TODO: Add a base url + DATASET_DOWNLOAD_BASE_URL: str = "" + + name = "medcalcbench" + description = "Medical calculation questions with step-by-step explanations." 
+ tags = ["reasoning", "medicine", "calculation"] + + def get_instances(self, output_path: str) -> List[Instance]: + splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT} + instances: List[Instance] = [] + + for split, split_tag in splits.items(): # Iterate over the splits + source_url: str = f"{self.DATASET_DOWNLOAD_BASE_URL}/{split}.jsonl" + data_path: str = os.path.join(output_path, f"med_calc_bench_{split}") + ensure_file_downloaded(source_url=source_url, target_path=data_path) + + with open(data_path, "r", encoding="utf-8") as f: + for line in f: + example: Dict = json.loads(line.strip()) + question = example["Question"] + patient_note = example["Patient_Note"] + + input_text = f"Patient Note:\n\n{patient_note}\n\nQuestion:\n\n{question}" + + # Format the final answer with explanation + instances.append( + Instance( + input=Input(text=input_text), + references=[Reference(Output(text=example["Ground_Truth_Answer"]), tags=[CORRECT_TAG])], + split=split_tag, + extra_data={ + "relevant_entities": example["Relevant_Entities"], + "lower_limit": example["Lower_Limit"], + "upper_limit": example["Upper_Limit"], + "calculator_id": example["Calculator ID"], + "ground_truth": example["Ground_Truth_Answer"], + } + ) + ) + + return instances \ No newline at end of file From bb15f352cfa86f7e9294fde424aab91648066a82 Mon Sep 17 00:00:00 2001 From: Leonardo Schettini Date: Thu, 12 Dec 2024 16:05:10 +0100 Subject: [PATCH 13/14] feat: med calc bench one shot spec --- .../run_specs/med_calc_bench_specs.py | 99 ++++++++++++++++++- 1 file changed, 94 insertions(+), 5 deletions(-) diff --git a/src/helm/benchmark/run_specs/med_calc_bench_specs.py b/src/helm/benchmark/run_specs/med_calc_bench_specs.py index fb3f5d58717..216e1b05382 100644 --- a/src/helm/benchmark/run_specs/med_calc_bench_specs.py +++ b/src/helm/benchmark/run_specs/med_calc_bench_specs.py @@ -1,4 +1,5 @@ -from typing import List +import json +from typing import Dict, List from helm.benchmark.adaptation.common_adapter_specs import get_generation_adapter_spec from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs @@ -6,9 +7,11 @@ from helm.benchmark.run_spec import RunSpec, run_spec_function from helm.benchmark.scenarios.scenario import ScenarioSpec +ONE_SHOT_EXAMPLES_URL = "https://raw.githubusercontent.com/ncbi-nlp/MedCalc-Bench/048ba77dbe332e9190935e4a30965bff444b940e/evaluation/one_shot_finalized_explanation.json" + @run_spec_function("med_calc_bench_zero_shot_cot") -def get_med_calc_bench_spec() -> RunSpec: +def get_med_calc_bench_zero_shot_spec() -> RunSpec: scenario_spec = ScenarioSpec( class_name="helm.benchmark.scenarios.med_calc_bench_scenario.MedCalcBenchScenario", args={}, @@ -16,9 +19,52 @@ def get_med_calc_bench_spec() -> RunSpec: adapter_spec = get_generation_adapter_spec( instructions=_get_zero_shot_cot_instructions(), - input_noun="Patient Note", + input_noun=None, # Set directly in the scenario. 
+ output_noun="Calculated Value", + max_tokens=500, + ) + + metric_specs: List[MetricSpec] = [ + MetricSpec( + class_name="helm.benchmark.metrics.med_calc_bench_metrics.MedCalcBenchMetric", + args={}, + ) + ] + get_basic_metric_specs([]) + + return RunSpec( + name="med_calc_bench", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=metric_specs, + groups=["med_calc_bench"], + ) + + +@run_spec_function("med_calc_bench_one_shot_cot") +def get_med_calc_bench_one_shot_spec() -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.med_calc_bench_scenario.MedCalcBenchScenario", + args={}, + ) + + adapter_spec = get_generation_adapter_spec( + instructions=_get_one_shot_cot_instructions( + # TODO: Modify this to retrieve the question and calculator ID from the respective dataset sample. + # For more information see the docstring for the `_get_one_shot_cot_instructions` function. + # One way of doing so is having receiving the calculator ID in this function and passing it to + # the scenario, which can then filter the dataset samples by the calculator ID. + question=( + "What is the patient's Creatinine Clearance using the Cockroft-Gault Equation in terms of mL/min? " + "You should use the patient's adjusted body weight in kg instead of the patient's actual body " + "weight if the patient is overweight or obese based on their BMI. If the patient's BMI's normal, " + "set their adjusted body weight to the minimum of the ideal body and actual weight. If the " + "patient is underweight, please set their adjusted body weight to their actual body weight." + ), + calculator_id="2", + ), + input_noun=None, # Set directly in the scenario. output_noun="Calculated Value", - max_tokens=50, + max_tokens=500, ) metric_specs: List[MetricSpec] = [ @@ -50,5 +96,48 @@ def _get_zero_shot_cot_instructions() -> str: "You are a helpful assistant for calculating a score for a given patient note. " "Please think step-by-step to solve the question and then generate the required score. " "Your output should contain the step by step thinking and the final answer, which is a short and direct answer to the question. " - 'Before giving the final answer, write "Final Answer: " followed by the answer.' + 'Before giving the final answer, write "Calculated Value: " followed by the answer.' + ) + + +def _get_one_shot_cot_instructions(question: str, calculator_id: str) -> str: + """Generate instructions for the MedCalcBench scenario. + + This function is inspired on the system prompt definition in the original code: + https://github.com/ncbi-nlp/MedCalc-Bench/blob/048ba77dbe332e9190935e4a30965bff444b940e/evaluation/run.py#L26 + + Credits to the original authors: https://github.com/ncbi-nlp/MedCalc-Bench. + + In the original code, there's exactly one example response for each calculator ID. + These examples are stored in a JSON file: https://github.com/ncbi-nlp/MedCalc-Bench/blob/048ba77dbe332e9190935e4a30965bff444b940e/evaluation/one_shot_finalized_explanation.json + None of the examples include the actual questions. They only contain the step-by-step thinking and the final answer. + Looking at the dataset samples we can see that all samples with the same calculator ID use the same question. + The original expect that for each sample, we collect the calculator ID and the question for building the one-shot instructions. 
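+
+    For reference, the JSON structure assumed here (inferred from how the example is
+    accessed below) looks roughly like:
+
+        {
+            "<calculator_id>": {
+                "Patient Note": "...",
+                "Response": {"step_by_step_thinking": "...", "answer": "..."}
+            }
+        }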
+ """ + examples: Dict = {} + with open(ONE_SHOT_EXAMPLES_URL, "r") as f: + examples = json.load(f) + + if not examples: + raise ValueError( + "Failed to load one-shot examples for the MedCalcBench scenario." + ) + + example = examples.get(calculator_id, {}) + + if not example: + raise ValueError( + f"Failed to find one-shot example for calculator ID {calculator_id}." + ) + + return ( + "You are a helpful assistant for calculating a score for a given patient note. " + "Please think step-by-step to solve the question and then generate the required score. " + "Your output should contain the step by step thinking and the final answer, which is a short and direct answer to the question. " + "\nBelow is an example:" + # This example follows the formatting of the respective scenario. + f"Patient Note:\n\n{example['Patient Note']}" + f"\n\nQuestion:\n\n{question}" + f"\n\nExplanation:\n\n{example['Response']['step_by_step_thinking']}" + f"\n\nCalculated Value: {example['Response']['answer']}" ) From 792fb4f9e3de928a9165395fbf12664c72f1a96a Mon Sep 17 00:00:00 2001 From: Leonardo Schettini Date: Tue, 17 Dec 2024 16:19:30 +0100 Subject: [PATCH 14/14] fix: dataset loading and standardize naming --- ...ch_metrics.py => medcalc_bench_metrics.py} | 51 ++++++---- ..._bench_specs.py => medcalc_bench_specs.py} | 65 ++++++------- .../scenarios/med_calc_bench_scenario.py | 72 -------------- .../scenarios/medcalc_bench_scenario.py | 96 +++++++++++++++++++ 4 files changed, 153 insertions(+), 131 deletions(-) rename src/helm/benchmark/metrics/{med_calc_bench_metrics.py => medcalc_bench_metrics.py} (75%) rename src/helm/benchmark/run_specs/{med_calc_bench_specs.py => medcalc_bench_specs.py} (77%) delete mode 100644 src/helm/benchmark/scenarios/med_calc_bench_scenario.py create mode 100644 src/helm/benchmark/scenarios/medcalc_bench_scenario.py diff --git a/src/helm/benchmark/metrics/med_calc_bench_metrics.py b/src/helm/benchmark/metrics/medcalc_bench_metrics.py similarity index 75% rename from src/helm/benchmark/metrics/med_calc_bench_metrics.py rename to src/helm/benchmark/metrics/medcalc_bench_metrics.py index a44306d2ec7..346528cf1b2 100644 --- a/src/helm/benchmark/metrics/med_calc_bench_metrics.py +++ b/src/helm/benchmark/metrics/medcalc_bench_metrics.py @@ -8,6 +8,7 @@ from helm.benchmark.metrics.metric_name import MetricName from helm.benchmark.metrics.metric_service import MetricService from helm.benchmark.metrics.statistic import Stat +from helm.common.hierarchical_logger import hlog class MedCalcBenchMetric(Metric): @@ -33,25 +34,33 @@ def evaluate_generation( "Only one was expected" ) - final_answer = request_state.result.completions[0].text.strip().lower().split("final answer:")[-1].strip() - - try: - correctness = self.medcalc_bench_range_metric_calculation( - answer=final_answer, - ground_truth=request_state.instance.extra_data["ground_truth"], - calid=int(request_state.instance.extra_data["calculator_id"]), - upper_limit=request_state.instance.extra_data["upper_limit"], - lower_limit=request_state.instance.extra_data["lower_limit"], - ) - except ValueError: - raise ValueError( - "Failed to calculate the correctess of the output for a MedCalc-Bench instance." 
- ) + final_answer = ( + request_state.result.completions[0] + .text.strip() + .lower() + .split("calculated value:")[-1] + .strip() + ) - stat = Stat(MetricName("medcalc_bench_metric")) - stat.add(int(correctness)) + correctness = 0 + if final_answer: + try: + correctness = self.medcalc_bench_metric_calculation( + answer=final_answer, + ground_truth=request_state.instance.extra_data["ground_truth"], + calid=int(request_state.instance.extra_data["calculator_id"]), + upper_limit=request_state.instance.extra_data["upper_limit"], + lower_limit=request_state.instance.extra_data["lower_limit"], + ) + except ValueError as e: + hlog( + ( + "Failed to calculate the correctess of the output for MedCalc-Bench instance " + f'with id {request_state.instance.extra_data["id"]}: {e}' + ) + ) - return [stat] + return [Stat(MetricName("medcalc_bench_metric")).add(correctness)] def medcalc_bench_metric_calculation( self, @@ -120,8 +129,8 @@ def medcalc_bench_metric_calculation( 69, ]: # Output Type: integer A - answer = round(eval(answer)) - if answer == eval(ground_truth): + answer = round(int(answer)) + if answer == int(ground_truth): correctness = 1 else: correctness = 0 @@ -162,8 +171,8 @@ def medcalc_bench_metric_calculation( 67, ]: # Output Type: decimal - answer = eval(answer) - if answer >= eval(lower_limit) and answer <= eval(upper_limit): + answer = float(answer) + if answer >= float(lower_limit) and answer <= float(upper_limit): correctness = 1 else: correctness = 0 diff --git a/src/helm/benchmark/run_specs/med_calc_bench_specs.py b/src/helm/benchmark/run_specs/medcalc_bench_specs.py similarity index 77% rename from src/helm/benchmark/run_specs/med_calc_bench_specs.py rename to src/helm/benchmark/run_specs/medcalc_bench_specs.py index 216e1b05382..8bdb5374751 100644 --- a/src/helm/benchmark/run_specs/med_calc_bench_specs.py +++ b/src/helm/benchmark/run_specs/medcalc_bench_specs.py @@ -1,6 +1,7 @@ import json from typing import Dict, List +from helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.benchmark.adaptation.common_adapter_specs import get_generation_adapter_spec from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs from helm.benchmark.metrics.metric import MetricSpec @@ -10,44 +11,47 @@ ONE_SHOT_EXAMPLES_URL = "https://raw.githubusercontent.com/ncbi-nlp/MedCalc-Bench/048ba77dbe332e9190935e4a30965bff444b940e/evaluation/one_shot_finalized_explanation.json" -@run_spec_function("med_calc_bench_zero_shot_cot") -def get_med_calc_bench_zero_shot_spec() -> RunSpec: +@run_spec_function("medcalc_bench") +def get_medcalc_bench_spec(method: str) -> RunSpec: scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.med_calc_bench_scenario.MedCalcBenchScenario", + class_name="helm.benchmark.scenarios.medcalc_bench_scenario.MedCalcBenchScenario", args={}, ) - adapter_spec = get_generation_adapter_spec( - instructions=_get_zero_shot_cot_instructions(), - input_noun=None, # Set directly in the scenario. 
- output_noun="Calculated Value", - max_tokens=500, - ) + if method == "zero_shot": + adapter_spec = get_medcalc_bench_zero_shot_adapter() + elif method == "one_shot": + adapter_spec = get_medcalc_bench_one_shot_adapter() + else: + raise ValueError(f"Invalid method for MedCalc-Bench: {method}") metric_specs: List[MetricSpec] = [ MetricSpec( - class_name="helm.benchmark.metrics.med_calc_bench_metrics.MedCalcBenchMetric", + class_name="helm.benchmark.metrics.medcalc_bench_metrics.MedCalcBenchMetric", args={}, ) - ] + get_basic_metric_specs([]) + ] # + get_basic_metric_specs([]) return RunSpec( - name="med_calc_bench", + name=f"medcalc_bench:method{method}", scenario_spec=scenario_spec, adapter_spec=adapter_spec, metric_specs=metric_specs, - groups=["med_calc_bench"], + groups=["medcalc_bench"], ) -@run_spec_function("med_calc_bench_one_shot_cot") -def get_med_calc_bench_one_shot_spec() -> RunSpec: - scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.med_calc_bench_scenario.MedCalcBenchScenario", - args={}, +def get_medcalc_bench_zero_shot_adapter() -> AdapterSpec: + return get_generation_adapter_spec( + instructions=_get_zero_shot_cot_instructions(), + input_noun=None, # Set directly in the scenario. + output_noun="\n\nCalculated Value", + max_tokens=500, ) - adapter_spec = get_generation_adapter_spec( + +def get_medcalc_bench_one_shot_adapter() -> AdapterSpec: + return get_generation_adapter_spec( instructions=_get_one_shot_cot_instructions( # TODO: Modify this to retrieve the question and calculator ID from the respective dataset sample. # For more information see the docstring for the `_get_one_shot_cot_instructions` function. @@ -63,28 +67,13 @@ def get_med_calc_bench_one_shot_spec() -> RunSpec: calculator_id="2", ), input_noun=None, # Set directly in the scenario. - output_noun="Calculated Value", + output_noun="\n\nCalculated Value", max_tokens=500, ) - metric_specs: List[MetricSpec] = [ - MetricSpec( - class_name="helm.benchmark.metrics.med_calc_bench_metrics.MedCalcBenchMetric", - args={}, - ) - ] + get_basic_metric_specs([]) - - return RunSpec( - name="med_calc_bench", - scenario_spec=scenario_spec, - adapter_spec=adapter_spec, - metric_specs=metric_specs, - groups=["med_calc_bench"], - ) - def _get_zero_shot_cot_instructions() -> str: - """Generate instructions for the MedCalcBench scenario. + """Generate instructions for the MedCalc-Bench scenario. This function is inspired on the system prompt definition in the original code: https://github.com/ncbi-nlp/MedCalc-Bench/blob/048ba77dbe332e9190935e4a30965bff444b940e/evaluation/run.py#L16 @@ -101,7 +90,7 @@ def _get_zero_shot_cot_instructions() -> str: def _get_one_shot_cot_instructions(question: str, calculator_id: str) -> str: - """Generate instructions for the MedCalcBench scenario. + """Generate instructions for the MedCalc-Bench scenario. This function is inspired on the system prompt definition in the original code: https://github.com/ncbi-nlp/MedCalc-Bench/blob/048ba77dbe332e9190935e4a30965bff444b940e/evaluation/run.py#L26 @@ -120,7 +109,7 @@ def _get_one_shot_cot_instructions(question: str, calculator_id: str) -> str: if not examples: raise ValueError( - "Failed to load one-shot examples for the MedCalcBench scenario." + "Failed to load one-shot examples for the MedCalc-Bench scenario." 
) example = examples.get(calculator_id, {}) diff --git a/src/helm/benchmark/scenarios/med_calc_bench_scenario.py b/src/helm/benchmark/scenarios/med_calc_bench_scenario.py deleted file mode 100644 index 7221319f20d..00000000000 --- a/src/helm/benchmark/scenarios/med_calc_bench_scenario.py +++ /dev/null @@ -1,72 +0,0 @@ -import os -from typing import List, Dict -import json -from helm.common.general import ensure_file_downloaded -from helm.common.constants import TRAIN_SPLIT, TEST_SPLIT, CORRECT_TAG -from helm.benchmark.scenarios.scenario import Scenario, Instance, Reference, Input, Output - -class MedCalcBenchScenario(Scenario): - """ - MedCalcBench scenario: Processes a medical calculation dataset with explanations. - - Each record in the dataset has: - - Row Number - - Calculator ID - - Calculator Name - - Category - - Output Type - - Note ID - - Note Type - - Question - - Ground Truth Explanation - - Patient Note - - Relevant Entities - - Lower Limit - - Upper Limit - - Ground Truth Answer - - The output is formatted as: - "The answer is . Steps: " - """ - - # TODO: Add a base url - DATASET_DOWNLOAD_BASE_URL: str = "" - - name = "medcalcbench" - description = "Medical calculation questions with step-by-step explanations." - tags = ["reasoning", "medicine", "calculation"] - - def get_instances(self, output_path: str) -> List[Instance]: - splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT} - instances: List[Instance] = [] - - for split, split_tag in splits.items(): # Iterate over the splits - source_url: str = f"{self.DATASET_DOWNLOAD_BASE_URL}/{split}.jsonl" - data_path: str = os.path.join(output_path, f"med_calc_bench_{split}") - ensure_file_downloaded(source_url=source_url, target_path=data_path) - - with open(data_path, "r", encoding="utf-8") as f: - for line in f: - example: Dict = json.loads(line.strip()) - question = example["Question"] - patient_note = example["Patient_Note"] - - input_text = f"Patient Note:\n\n{patient_note}\n\nQuestion:\n\n{question}" - - # Format the final answer with explanation - instances.append( - Instance( - input=Input(text=input_text), - references=[Reference(Output(text=example["Ground_Truth_Answer"]), tags=[CORRECT_TAG])], - split=split_tag, - extra_data={ - "relevant_entities": example["Relevant_Entities"], - "lower_limit": example["Lower_Limit"], - "upper_limit": example["Upper_Limit"], - "calculator_id": example["Calculator ID"], - "ground_truth": example["Ground_Truth_Answer"], - } - ) - ) - - return instances \ No newline at end of file diff --git a/src/helm/benchmark/scenarios/medcalc_bench_scenario.py b/src/helm/benchmark/scenarios/medcalc_bench_scenario.py new file mode 100644 index 00000000000..0d28cdc0c86 --- /dev/null +++ b/src/helm/benchmark/scenarios/medcalc_bench_scenario.py @@ -0,0 +1,96 @@ +import json +import os +from typing import Dict, List + +from datasets import DatasetDict, load_dataset + +from helm.benchmark.scenarios.scenario import ( + CORRECT_TAG, + TEST_SPLIT, + TRAIN_SPLIT, + Input, + Instance, + Output, + Reference, + Scenario, +) +from helm.common.general import ensure_directory_exists + + +class MedCalcBenchScenario(Scenario): + """ + MedCalcBench scenario: Processes a medical calculation dataset with explanations. 
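+
+    Instances are loaded from the Hugging Face dataset ncbi/MedCalc-Bench-v1.0 (train and
+    test splits); the calculator ID and the answer limits needed by the metric are carried
+    in each instance's extra_data.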
+ + Each record in the dataset has: + - Row Number + - Calculator ID + - Calculator Name + - Category + - Output Type + - Note ID + - Note Type + - Question + - Ground Truth Explanation + - Patient Note + - Relevant Entities + - Lower Limit + - Upper Limit + - Ground Truth Answer + + The output is formatted as: + "The answer is . Steps: " + """ + + HUGGING_FACE_DATASET_PATH: str = "ncbi/MedCalc-Bench-v1.0" + + # TODO: Add a base url + DATASET_DOWNLOAD_BASE_URL: str = "" + + name = "medcalcbench" + description = "Medical calculation questions with step-by-step explanations." + tags = ["reasoning", "medicine", "calculation"] + + def get_instances(self, output_path: str) -> List[Instance]: + data_path: str = os.path.join(output_path, "data") + ensure_directory_exists(data_path) + dataset: DatasetDict = load_dataset(self.HUGGING_FACE_DATASET_PATH) + + splits = {TRAIN_SPLIT: "train", TEST_SPLIT: "test"} + instances: List[Instance] = [] + for ( + helm_split_name, + dataset_split_name, + ) in splits.items(): # Iterate over the splits + split_data = dataset[dataset_split_name] + + for example in split_data: + question = example["Question"] + patient_note = example["Patient Note"] + + input_text = ( + f"Patient Note:\n\n{patient_note}\n\nQuestion:\n\n{question}" + ) + + # Format the final answer with explanation + instances.append( + Instance( + input=Input(text=input_text), + references=[ + Reference( + Output(text=example["Ground Truth Answer"]), + tags=[CORRECT_TAG], + ) + ], + split=helm_split_name, + extra_data={ + "id": example["Row Number"], + "relevant_entities": example["Relevant Entities"], + "lower_limit": example["Lower Limit"], + "upper_limit": example["Upper Limit"], + "calculator_id": example["Calculator ID"], + "ground_truth": example["Ground Truth Answer"], + }, + ) + ) + + return instances
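A note on the scoring rule introduced in the patches above: for decimal-valued calculators, MedCalcBenchMetric keeps only the text after "Calculated Value:" and counts the prediction as correct when it parses to a number inside [Lower Limit, Upper Limit]. The standalone sketch below illustrates that rule outside of HELM; the function name and the hard-coded examples are illustrative only, and the real metric additionally handles the date, week/day-pair, and integer output types shown in medcalc_bench_metrics.py.

def score_decimal(completion: str, lower_limit: str, upper_limit: str) -> int:
    # Keep only the text after the "Calculated Value:" marker, as the metric does.
    final_answer = completion.strip().lower().split("calculated value:")[-1].strip()
    try:
        value = float(final_answer)
    except ValueError:
        # Unparseable answers score 0 instead of propagating the error.
        return 0
    return int(float(lower_limit) <= value <= float(upper_limit))


if __name__ == "__main__":
    assert score_decimal("step-by-step reasoning...\n\nCalculated Value: 93.5", "90.0", "95.0") == 1
    assert score_decimal("Calculated Value: about ninety", "90.0", "95.0") == 0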