From 141588ebda40ecf7690a97e453478949c3b11505 Mon Sep 17 00:00:00 2001 From: Yifan Mai Date: Fri, 22 Nov 2024 21:14:15 -0800 Subject: [PATCH 01/14] Add Upstage Solar Pro Preview model (#3181) --- src/helm/config/model_deployments.yaml | 13 ++++++++++++- src/helm/config/model_metadata.yaml | 9 +++++++++ src/helm/config/tokenizer_configs.yaml | 9 +++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/src/helm/config/model_deployments.yaml b/src/helm/config/model_deployments.yaml index ff737870cb2..9819638c159 100644 --- a/src/helm/config/model_deployments.yaml +++ b/src/helm/config/model_deployments.yaml @@ -1271,6 +1271,17 @@ model_deployments: client_spec: class_name: "helm.clients.huggingface_client.HuggingFaceClient" + # Upstage + - name: huggingface/solar-pro-preview-instruct + model_name: upstage/solar-pro-preview-instruct + tokenizer_name: upstage/solar-pro-preview-instruct + max_sequence_length: 4096 + client_spec: + class_name: "helm.clients.huggingface_client.HuggingFaceClient" + args: + torch_dtype: auto + trust_remote_code: true + ## Text-to-Image Diffusion Models - name: huggingface/dreamlike-diffusion-v1-0 @@ -2818,4 +2829,4 @@ model_deployments: client_spec: class_name: "helm.clients.huggingface_client.HuggingFaceClient" args: - pretrained_model_name_or_path: ibm-granite/granite-3.0-1b-a400m-base \ No newline at end of file + pretrained_model_name_or_path: ibm-granite/granite-3.0-1b-a400m-base diff --git a/src/helm/config/model_metadata.yaml b/src/helm/config/model_metadata.yaml index 65afaa1b895..db75d81266d 100644 --- a/src/helm/config/model_metadata.yaml +++ b/src/helm/config/model_metadata.yaml @@ -3126,6 +3126,15 @@ models: release_date: 2022-09-19 tags: [] # TODO: add tags + # Upstage + - name: upstage/solar-pro-preview-instruct + display_name: Solar Pro Preview (22B) + description: Solar Pro Preview (22B) is open-weights model for single GPU inference that is a preview of the upcoming Solar Pro model ([blog](https://www.upstage.ai/products/solar-pro-preview)). 
+ creator_organization_name: Upstage + access: open + num_parameters: 22000000000 + release_date: 2024-09-11 + tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG] # Writer diff --git a/src/helm/config/tokenizer_configs.yaml b/src/helm/config/tokenizer_configs.yaml index 9f92970bc73..2741c25aaab 100644 --- a/src/helm/config/tokenizer_configs.yaml +++ b/src/helm/config/tokenizer_configs.yaml @@ -612,6 +612,15 @@ tokenizer_configs: end_of_text_token: "" prefix_token: "" + # Upstage + - name: upstage/solar-pro-preview-instruct + tokenizer_spec: + class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" + args: + trust_remote_code: true + end_of_text_token: "<|im_end|>" + prefix_token: "<|startoftext|>" + # Writer - name: writer/gpt2 tokenizer_spec: From ee10b8fd5a0f46949c98cd7a0a79cb7e1b163073 Mon Sep 17 00:00:00 2001 From: Yifan Mai Date: Fri, 22 Nov 2024 21:14:20 -0800 Subject: [PATCH 02/14] Add Llama 3.1 Nemotron Instruct (70B) model on Together AI (#3172) --- src/helm/config/model_deployments.yaml | 10 ++++++++++ src/helm/config/model_metadata.yaml | 10 ++++++++++ src/helm/config/tokenizer_configs.yaml | 8 ++++++++ 3 files changed, 28 insertions(+) diff --git a/src/helm/config/model_deployments.yaml b/src/helm/config/model_deployments.yaml index 9819638c159..24ea01bafcd 100644 --- a/src/helm/config/model_deployments.yaml +++ b/src/helm/config/model_deployments.yaml @@ -2164,6 +2164,16 @@ model_deployments: args: together_model: meta-llama/Meta-Llama-Guard-3-8B + # NVIDIA + - name: together/llama-3.1-nemotron-70b-instruct + model_name: nvidia/llama-3.1-nemotron-70b-instruct + tokenizer_name: nvidia/llama-3.1-nemotron-70b-instruct + max_sequence_length: 32768 + client_spec: + class_name: "helm.clients.together_client.TogetherClient" + args: + together_model: nvidia/Llama-3.1-Nemotron-70B-Instruct-HF + # 01.AI - name: together/yi-6b model_name: 01-ai/yi-6b diff --git a/src/helm/config/model_metadata.yaml b/src/helm/config/model_metadata.yaml index db75d81266d..eb611e446f9 100644 --- a/src/helm/config/model_metadata.yaml +++ b/src/helm/config/model_metadata.yaml @@ -2173,6 +2173,16 @@ models: release_date: 2024-06-17 tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + - name: nvidia/llama-3.1-nemotron-70b-instruct + display_name: Llama 3.1 Nemotron Instruct (70B) + description: Llama-3.1-Nemotron-70B-Instruct is a large language model customized by NVIDIA to improve the helpfulness of LLM generated responses to user queries. It was trained using RLHF (specifically, REINFORCE), Llama-3.1-Nemotron-70B-Reward and HelpSteer2-Preference prompts on a Llama-3.1-70B-Instruct model. 
([paper](https://arxiv.org/abs/2410.01257)) + creator_organization_name: NVIDIA + access: open + num_parameters: 70000000000 + release_date: 2024-10-02 + tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + + # OpenAI ## GPT 2 Models diff --git a/src/helm/config/tokenizer_configs.yaml b/src/helm/config/tokenizer_configs.yaml index 2741c25aaab..e2293562610 100644 --- a/src/helm/config/tokenizer_configs.yaml +++ b/src/helm/config/tokenizer_configs.yaml @@ -473,6 +473,14 @@ tokenizer_configs: end_of_text_token: "<|endoftext|>" prefix_token: "<|endoftext|>" + - name: nvidia/llama-3.1-nemotron-70b-instruct + tokenizer_spec: + class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" + args: + pretrained_model_name_or_path: nvidia/Llama-3.1-Nemotron-70B-Instruct-HF + end_of_text_token: "<|eot_id|>" + prefix_token: "<|begin_of_text|>" + # OpenAI - name: openai/cl100k_base tokenizer_spec: From c0b29010bf0f6a1f5598eafd3a9aa13fcefca0af Mon Sep 17 00:00:00 2001 From: Haoqin Tu Date: Tue, 3 Dec 2024 23:50:20 -0800 Subject: [PATCH 03/14] Add Air-Bench chat audio scenario (#3189) Co-authored-by: Yifan Mai --- .../presentation/run_entries_speech.conf | 4 + .../benchmark/run_specs/audio_run_specs.py | 20 +++ .../audio_language/air_bench_chat_scenario.py | 117 ++++++++++++++++++ src/helm/benchmark/static/schema_speech.yaml | 34 +++-- 4 files changed, 165 insertions(+), 10 deletions(-) create mode 100644 src/helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py diff --git a/src/helm/benchmark/presentation/run_entries_speech.conf b/src/helm/benchmark/presentation/run_entries_speech.conf index e89a61208d8..cb13962c654 100644 --- a/src/helm/benchmark/presentation/run_entries_speech.conf +++ b/src/helm/benchmark/presentation/run_entries_speech.conf @@ -6,6 +6,10 @@ entries: [ {description: "vocal_sound:model=audiolm", priority: 1} {description: "audiocaps:model=audiolm", priority: 1} {description: "voxceleb2:model=audiolm", priority: 1} + {description: "air_bench_chat:subject=speech,model=audiolm", priority: 1} + {description: "air_bench_chat:subject=sound,model=audiolm", priority: 1} + {description: "air_bench_chat:subject=music,model=audiolm", priority: 1} + {description: "air_bench_chat:subject=mix,model=audiolm", priority: 1} #################################################################################################################### # Fairness diff --git a/src/helm/benchmark/run_specs/audio_run_specs.py b/src/helm/benchmark/run_specs/audio_run_specs.py index 97fcddd55fc..ea8e32c225d 100644 --- a/src/helm/benchmark/run_specs/audio_run_specs.py +++ b/src/helm/benchmark/run_specs/audio_run_specs.py @@ -373,3 +373,23 @@ def get_casual_conversations2_run_spec(subject: str) -> RunSpec: metric_specs=metric_specs, groups=[run_spec_name], ) + + +@run_spec_function("air_bench_chat") +def get_air_bench_chat_run_spec(subject: str) -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.audio_language.air_bench_chat_scenario." 
"AirBenchChatScenario", + args={"subject": subject}, + ) + adapter_spec = _get_generation_adapter_spec( + max_tokens=50, + ) + metric_specs: List[MetricSpec] = _get_open_ended_generation_metric_specs() + run_spec_name: str = "air_bench_chat" + return RunSpec( + name=f"{run_spec_name}:subject={subject}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=metric_specs, + groups=[run_spec_name], + ) diff --git a/src/helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py b/src/helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py new file mode 100644 index 00000000000..89de1a93f88 --- /dev/null +++ b/src/helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py @@ -0,0 +1,117 @@ +from typing import List +import os + +from helm.benchmark.scenarios.scenario import ( + Scenario, + Instance, + Reference, + TEST_SPLIT, + CORRECT_TAG, + Input, + Output, +) +from tqdm import tqdm +from helm.common.media_object import MediaObject, MultimediaObject +from helm.common.general import ensure_file_downloaded +import json + + +class AirBenchChatScenario(Scenario): + """Air-Bench Chat + + Air-Bench AIR-Bench (Audio InstRuction Benchmark) is a benchmark designed to evaluate the ability of audio language + models to understand various types of audio signals (including human speech, natural sounds and music), and + furthermore, to interact with humans in textual format. AIR-Bench encompasses two dimensions: foundation + and chat benchmarks. The former consists of 19 tasks with approximately 19k single-choice questions. The + latter one contains 2k instances of open-ended question-and-answer data. We consider the chat benchmark + in this scenario. + + Paper: https://aclanthology.org/2024.acl-long.109.pdf + Code: https://github.com/OFA-Sys/AIR-Bench + + Citation: + @inproceedings{yang-etal-2024-air, + title = "{AIR}-Bench: Benchmarking Large Audio-Language Models via Generative Comprehension", + author = "Yang, Qian and + Xu, Jin and + Liu, Wenrui and + Chu, Yunfei and + Jiang, Ziyue and + Zhou, Xiaohuan and + Leng, Yichong and + Lv, Yuanjun and + Zhao, Zhou and + Zhou, Chang and + Zhou, Jingren", + booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational + Linguistics (Volume 1: Long Papers)", + year = "2024",} + """ + + HF_DATA_PATH_PREFIX = "https://huggingface.co/datasets/qyang1021/AIR-Bench-Dataset/resolve/main/Chat" + META_DATA_FILE_PATH = "https://huggingface.co/datasets/qyang1021/AIR-Bench-Dataset/resolve/main/Chat/Chat_meta.json" + SUJECTS = ["music", "sound", "speech", "mix"] + + name = "air_bench_chat" + description = "A large-scale dataset of about 46K audio clips to human-written text pairs \ + ([Yang et al, 2024](https://aclanthology.org/2024.acl-long.109.pdf))." + tags: List[str] = ["audio", "reasoning"] + + def __init__(self, subject: str) -> None: + super().__init__() + + if subject not in AirBenchChatScenario.SUJECTS: + raise ValueError(f"Invalid subject. 
Valid subjects are: {AirBenchChatScenario.SUJECTS}") + + self._subject: str = subject + + def _get_subject_indices(self, meta_data) -> List[int]: + subject_indices = [] + for idx, line in enumerate(meta_data): + if self._subject == "mix": + if "_".join(line["task_name"].split("_")[:2]) == "speech_and": + subject_indices.append(idx) + else: + if line["task_name"].split("_")[0] == self._subject and line["task_name"].split("_")[1] != "and": + subject_indices.append(idx) + return subject_indices + + def _get_content_type(self, audio_file_name) -> str: + if audio_file_name.endswith(".wav"): + return "audio/wav" + elif audio_file_name.endswith(".mp3"): + return "audio/mp3" + else: + raise ValueError(f"Unsupported audio file format: {audio_file_name}") + + def get_instances(self, output_path: str) -> List[Instance]: + instances: List[Instance] = [] + data_dir: str = os.path.join(output_path, "wav_files") + meta_data_path: str = os.path.join(output_path, "Chat_meta.json") + ensure_file_downloaded(source_url=AirBenchChatScenario.META_DATA_FILE_PATH, target_path=meta_data_path) + meta_data = json.load(open(meta_data_path)) + subject_indices = self._get_subject_indices(meta_data) + for _, row in enumerate(tqdm(subject_indices)): + audio_meda_data = meta_data[row] + hf_audio_file_path = os.path.join( + self.HF_DATA_PATH_PREFIX, + f'{audio_meda_data["task_name"]}_{audio_meda_data["dataset_name"]}/{audio_meda_data["path"]}', + ) + local_audio_file_path = os.path.join( + data_dir, f'{audio_meda_data["task_name"]}_{audio_meda_data["dataset_name"]}_{audio_meda_data["path"]}' + ) + ensure_file_downloaded(source_url=hf_audio_file_path, target_path=local_audio_file_path) + input = Input( + multimedia_content=MultimediaObject( + [ + MediaObject( + content_type=self._get_content_type(audio_meda_data["path"]), + location=local_audio_file_path, + ), + MediaObject(content_type="text/plain", text=audio_meda_data["question"]), + ] + ) + ) + references = [Reference(Output(text=audio_meda_data["answer_gt"]), tags=[CORRECT_TAG])] + instances.append(Instance(input=input, references=references, split=TEST_SPLIT)) + return instances diff --git a/src/helm/benchmark/static/schema_speech.yaml b/src/helm/benchmark/static/schema_speech.yaml index 79c6f99163c..13ce45228c7 100644 --- a/src/helm/benchmark/static/schema_speech.yaml +++ b/src/helm/benchmark/static/schema_speech.yaml @@ -195,7 +195,6 @@ run_groups: audio sample ([Becker et al, 2023](https://arxiv.org/abs/1807.03418)). metric_groups: - accuracy - - efficiency - general_information environment: main_name: exact_match @@ -219,7 +218,6 @@ run_groups: ([Wang et al, 2020](https://arxiv.org/abs/2007.10310)). metric_groups: - accuracy - - efficiency - general_information environment: main_name: bleu @@ -241,7 +239,6 @@ run_groups: age, gender, native language, country, and health condition ([Gong et al, 2022](https://arxiv.org/abs/2205.03433)). metric_groups: - accuracy - - efficiency - general_information environment: main_name: exact_match @@ -263,7 +260,6 @@ run_groups: Dutch, German, French, Spanish, Italian, Portuguese", Polish ([Pratap et al, 2022](https://arxiv.org/abs/2012.03411)). metric_groups: - accuracy - - efficiency - general_information environment: main_name: f1_score @@ -288,7 +284,6 @@ run_groups: South Asian, South East Asian, Chinese Japanase Korean ([Conneau et al, 2022](https://arxiv.org/abs/2205.12446)). 
metric_groups: - accuracy - - efficiency - general_information environment: main_name: exact_match @@ -353,7 +348,6 @@ run_groups: ([Ardila et al, 2020](https://arxiv.org/abs/1912.06670)). metric_groups: - accuracy - - efficiency - general_information environment: main_name: word_accuracy @@ -378,7 +372,6 @@ run_groups: ([Shah et al, 2024](https://arxiv.org/abs/2403.07937)). metric_groups: - accuracy - - efficiency - general_information environment: main_name: word_accuracy @@ -401,7 +394,6 @@ run_groups: The dataset contains the audio and question for three subsets: occupation, status, and potential_crime. metric_groups: - accuracy - - efficiency - general_information environment: main_name: exact_match @@ -427,7 +419,6 @@ run_groups: questions answering task. metric_groups: - accuracy - - efficiency - general_information environment: main_name: exact_match @@ -437,4 +428,27 @@ run_groups: what: audio, spoken language, speaker's gender, age information of audio samples who: real speakers when: "2023" - language: 10 languages \ No newline at end of file + language: 10 languages + + - name: air_bench_chat + display_name: Air-Bench Chat + description: > + Air-Bench (Yang et al, 2024) encompasses two dimensions: foundation and chat benchmarks. The former consists of 19 tasks with + approximately 19k single-choice questions. The latter one contains 2k instances of open-ended question-and-answer data. + We consider the chat benchmark in this scenario. + + The dataset contains the audio question answering task in four subjects: sound, speech, music, and mixed. + ([Yang et al, 2024](https://aclanthology.org/2024.acl-long.109.pdf)). + metric_groups: + - accuracy + - general_information + - reasoning + environment: + main_name: f1_score + main_split: test + taxonomy: + task: audio question answering + what: adio, question, and answer of audio samples + who: real speakers + when: "2024" + language: English \ No newline at end of file From 2e16cf2aaed1cc2e50674e7c4582178b18f4e835 Mon Sep 17 00:00:00 2001 From: Yifan Mai Date: Wed, 4 Dec 2024 16:36:38 -0800 Subject: [PATCH 04/14] Add Solar Pro model (#3198) --- src/helm/clients/upstage_client.py | 23 +++++++++++++++++++++++ src/helm/config/model_deployments.yaml | 8 ++++++++ src/helm/config/model_metadata.yaml | 9 +++++++++ 3 files changed, 40 insertions(+) create mode 100644 src/helm/clients/upstage_client.py diff --git a/src/helm/clients/upstage_client.py b/src/helm/clients/upstage_client.py new file mode 100644 index 00000000000..734acc4c3b2 --- /dev/null +++ b/src/helm/clients/upstage_client.py @@ -0,0 +1,23 @@ +from helm.clients.openai_client import OpenAIClient +from helm.common.cache import CacheConfig +from helm.tokenizers.tokenizer import Tokenizer + + +class UpstageChatClient(OpenAIClient): + """Sends request to a Upstage model using a OpenAI-compatible Chat API.""" + + def __init__( + self, + tokenizer: Tokenizer, + tokenizer_name: str, + cache_config: CacheConfig, + api_key: str, + ): + super().__init__( + tokenizer=tokenizer, + tokenizer_name=tokenizer_name, + cache_config=cache_config, + api_key=api_key, + org_id=None, + base_url="https://api.upstage.ai/v1/solar", + ) diff --git a/src/helm/config/model_deployments.yaml b/src/helm/config/model_deployments.yaml index 24ea01bafcd..20cd77afcb7 100644 --- a/src/helm/config/model_deployments.yaml +++ b/src/helm/config/model_deployments.yaml @@ -2751,6 +2751,14 @@ model_deployments: client_spec: class_name: "helm.clients.reka_client.RekaClient" + # Upstage + - name: upstage/solar-pro-241126 + 
model_name: upstage/solar-pro-241126 + tokenizer_name: upstage/solar-pro-preview-instruct + max_sequence_length: 32768 + client_spec: + class_name: "helm.clients.upstage_client.UpstageChatClient" + # Diva Llama - name: huggingface/diva-llama model_name: stanford/diva-llama diff --git a/src/helm/config/model_metadata.yaml b/src/helm/config/model_metadata.yaml index eb611e446f9..da037391813 100644 --- a/src/helm/config/model_metadata.yaml +++ b/src/helm/config/model_metadata.yaml @@ -3146,6 +3146,15 @@ models: release_date: 2024-09-11 tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG] + - name: upstage/solar-pro-241126 + display_name: Solar Pro + display_name: Solar Pro + description: Solar Pro is a LLM designed for instruction-following and processing structured formats like HTML and Markdown. It supports English, Korean, and Japanese and has domain expertise in Finance, Healthcare, and Legal. ([blog](https://www.upstage.ai/blog/press/solar-pro-aws)). + creator_organization_name: Upstage + access: limited + num_parameters: 22000000000 + release_date: 2024-11-26 + tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG] # Writer - name: writer/palmyra-base From 6c358c6d9aaa5580aed8e0674c01bf211b797307 Mon Sep 17 00:00:00 2001 From: JESSADA PRANEE <89401708+JackJessada@users.noreply.github.com> Date: Fri, 6 Dec 2024 01:06:03 +0700 Subject: [PATCH 05/14] Add NECTEC (#3197) --- src/helm/config/model_deployments.yaml | 15 +++++++++++++++ src/helm/config/model_metadata.yaml | 23 ++++++++++++++++++++++- src/helm/config/tokenizer_configs.yaml | 19 ++++++++++++++++--- 3 files changed, 53 insertions(+), 4 deletions(-) diff --git a/src/helm/config/model_deployments.yaml b/src/helm/config/model_deployments.yaml index 20cd77afcb7..95214d068bc 100644 --- a/src/helm/config/model_deployments.yaml +++ b/src/helm/config/model_deployments.yaml @@ -980,7 +980,22 @@ model_deployments: max_sequence_length: 2048 client_spec: class_name: "helm.clients.vision_language.huggingface_vlm_client.HuggingFaceVLMClient" + + ## NECTEC + - name: huggingface/Pathumma-llm-text-1.0.0 + model_name: nectec/Pathumma-llm-text-1.0.0 + tokenizer_name: nectec/Pathumma-llm-text-1.0.0 + max_sequence_length: 8192 + client_spec: + class_name: "helm.clients.huggingface_client.HuggingFaceClient" + - name: huggingface/OpenThaiLLM-Prebuilt-7B + model_name: nectec/OpenThaiLLM-Prebuilt-7B + tokenizer_name: nectec/OpenThaiLLM-Prebuilt-7B + max_sequence_length: 4096 + client_spec: + class_name: "helm.clients.huggingface_client.HuggingFaceClient" + ## KAIST AI - name: huggingface/prometheus-vision-13b-v1.0-hf model_name: kaistai/prometheus-vision-13b-v1.0-hf diff --git a/src/helm/config/model_metadata.yaml b/src/helm/config/model_metadata.yaml index da037391813..c3bb0f54b8d 100644 --- a/src/helm/config/model_metadata.yaml +++ b/src/helm/config/model_metadata.yaml @@ -2144,6 +2144,27 @@ models: tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + # NECTEC + - name: nectec/Pathumma-llm-text-1.0.0 + display_name: Pathumma-llm-text-1.0.0 (7B) + description: Pathumma-llm-text-1.0.0 (7B) is a instruction model from OpenThaiLLM-Prebuilt-7B ([blog](https://medium.com/nectec/pathummallm-v-1-0-0-release-6a098ddfe276)) + creator_organization_name: nectec + access: open + num_parameters: 7620000000 + release_date: 2024-10-28 + tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + + - name: nectec/OpenThaiLLM-Prebuilt-7B + display_name: OpenThaiLLM-Prebuilt-7B (7B) + description: 
OpenThaiLLM-Prebuilt-7B (7B) is a pretrained Thai large language model with 7 billion parameters based on Qwen2.5-7B. + creator_organization_name: nectec + access: open + num_parameters: 7620000000 + release_date: 2024-10-28 + tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG] + + # Neurips - name: neurips/local @@ -3468,4 +3489,4 @@ models: access: open num_parameters: 1380000000 release: 2024-10-21 - tags: [TEXT_MODEL_TAG] \ No newline at end of file + tags: [TEXT_MODEL_TAG] diff --git a/src/helm/config/tokenizer_configs.yaml b/src/helm/config/tokenizer_configs.yaml index e2293562610..ef7dda6d765 100644 --- a/src/helm/config/tokenizer_configs.yaml +++ b/src/helm/config/tokenizer_configs.yaml @@ -456,6 +456,19 @@ tokenizer_configs: end_of_text_token: "" prefix_token: "" + # Nectec + - name: nectec/OpenThaiLLM-Prebuilt-7B + tokenizer_spec: + class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" + end_of_text_token: "<|im_end|>" + prefix_token: "" + + - name: nectec/Pathumma-llm-text-1.0.0 + tokenizer_spec: + class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" + end_of_text_token: "<|im_end|>" + prefix_token: "<|im_start|>" + # Neurips - name: neurips/local tokenizer_spec: @@ -530,7 +543,7 @@ tokenizer_configs: class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" args: pretrained_model_name_or_path: Qwen/Qwen2-72B-Instruct - end_of_text_token: <|im_end|>" + end_of_text_token: "<|im_end|>" prefix_token: "<|im_start|>'" - name: qwen/qwen2.5-7b-instruct @@ -538,7 +551,7 @@ tokenizer_configs: class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" args: pretrained_model_name_or_path: Qwen/Qwen2.5-7B-Instruct - end_of_text_token: <|im_end|>" + end_of_text_token: "<|im_end|>" prefix_token: "<|im_start|>'" - name: qwen/qwen-vl @@ -728,4 +741,4 @@ tokenizer_configs: args: pretrained_model_name_or_path: ibm-granite/granite-3.0-1b-a400m-base end_of_text_token: "" - prefix_token: "" \ No newline at end of file + prefix_token: "" From 416601c6e640e122bc1a1efe3007a646d7c6c536 Mon Sep 17 00:00:00 2001 From: Yifan Mai Date: Fri, 6 Dec 2024 14:11:05 -0800 Subject: [PATCH 06/14] Add Llama 3.3 model (#3202) --- src/helm/config/model_deployments.yaml | 9 +++++++++ src/helm/config/model_metadata.yaml | 9 +++++++++ src/helm/config/tokenizer_configs.yaml | 8 ++++++++ 3 files changed, 26 insertions(+) diff --git a/src/helm/config/model_deployments.yaml b/src/helm/config/model_deployments.yaml index 95214d068bc..10f688461ae 100644 --- a/src/helm/config/model_deployments.yaml +++ b/src/helm/config/model_deployments.yaml @@ -2152,6 +2152,15 @@ model_deployments: args: together_model: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo + - name: together/llama-3.3-70b-instruct-turbo + model_name: meta/llama-3.3-70b-instruct-turbo + tokenizer_name: meta/llama-3.3-70b-instruct + max_sequence_length: 128000 + client_spec: + class_name: "helm.clients.together_client.TogetherChatClient" + args: + together_model: meta-llama/Llama-3.3-70B-Instruct-Turbo + - name: together/llama-guard-7b model_name: meta/llama-guard-7b tokenizer_name: meta-llama/Llama-2-7b-hf diff --git a/src/helm/config/model_metadata.yaml b/src/helm/config/model_metadata.yaml index c3bb0f54b8d..7fc8457beba 100644 --- a/src/helm/config/model_metadata.yaml +++ b/src/helm/config/model_metadata.yaml @@ -1656,6 +1656,15 @@ models: release_date: 2024-09-25 tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + - 
name: meta/llama-3.3-70b-instruct-turbo + display_name: Llama 3.3 Instruct Turbo (70B) + description: Llama 3.3 (70B) is part of the Llama 3 family of dense Transformer models that that natively support multilinguality, coding, reasoning, and tool usage. ([paper](https://ai.meta.com/research/publications/the-llama-3-herd-of-models/)) Turbo is Together's implementation, providing a near negligible difference in quality from the reference implementation with faster performance and lower cost, currently using FP8 quantization. ([blog](https://www.together.ai/blog/llama-31-quality)) + creator_organization_name: Meta + access: open + num_parameters: 70000000000 + release_date: 2024-12-06 + tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + - name: meta/llama-3-8b-chat display_name: Llama 3 Instruct (8B) description: Llama 3 is a family of language models that have been trained on more than 15 trillion tokens, and use Grouped-Query Attention (GQA) for improved inference scalability. It used SFT, rejection sampling, PPO and DPO for post-training. ([paper](https://ai.meta.com/research/publications/the-llama-3-herd-of-models/) diff --git a/src/helm/config/tokenizer_configs.yaml b/src/helm/config/tokenizer_configs.yaml index ef7dda6d765..cbf96457f18 100644 --- a/src/helm/config/tokenizer_configs.yaml +++ b/src/helm/config/tokenizer_configs.yaml @@ -349,6 +349,14 @@ tokenizer_configs: prefix_token: "<|begin_of_text|>" end_of_text_token: "<|eot_id|>" + - name: meta/llama-3.3-70b-instruct + tokenizer_spec: + class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" + args: + pretrained_model_name_or_path: meta-llama/Llama-3.3-70B-Instruct + prefix_token: "<|begin_of_text|>" + end_of_text_token: "<|eot_id|>" + # 01-ai - name: 01-ai/Yi-6B tokenizer_spec: From ff9c7c9a7dd5af8a62d6de2b1a36c70c8f60b690 Mon Sep 17 00:00:00 2001 From: Siya Goel <139517142+siyagoel@users.noreply.github.com> Date: Fri, 6 Dec 2024 15:25:13 -0800 Subject: [PATCH 07/14] Changes for MMLU PRO with COT (#3200) --- .../metrics/chain_of_thought_metric.py | 93 +++++++++ .../benchmark/run_specs/lite_run_specs.py | 188 ++++++++++-------- .../{mmlu_pro.py => mmlu_pro_scenario.py} | 32 ++- .../scenarios/test_mmlu_pro_scenario.py | 2 +- src/helm/benchmark/static/schema_lite_v2.yaml | 27 +-- 5 files changed, 235 insertions(+), 107 deletions(-) create mode 100644 src/helm/benchmark/metrics/chain_of_thought_metric.py rename src/helm/benchmark/scenarios/{mmlu_pro.py => mmlu_pro_scenario.py} (76%) diff --git a/src/helm/benchmark/metrics/chain_of_thought_metric.py b/src/helm/benchmark/metrics/chain_of_thought_metric.py new file mode 100644 index 00000000000..0925fb38527 --- /dev/null +++ b/src/helm/benchmark/metrics/chain_of_thought_metric.py @@ -0,0 +1,93 @@ +import re +from typing import List, Optional + +from helm.benchmark.adaptation.adapter_spec import AdapterSpec +from helm.benchmark.adaptation.request_state import RequestState +from helm.benchmark.metrics.metric import Metric +from helm.benchmark.metrics.metric_name import MetricName +from helm.benchmark.metrics.metric_service import MetricService +from helm.benchmark.metrics.statistic import Stat + + +def extract_answer(output_text: str) -> Optional[str]: + """ + Extracts the answer from the output text using two exact regex patterns. + Returns None if no valid answer is found. + + Args: + output_text (str): The text from which to extract the answer. 
+ + Returns: + Optional[str]: The extracted answer (A-J) if found, otherwise None. + """ + # First regex: Matches "answer is (A-J)" with optional parentheses + match = re.search(r"answer is \(?([A-J])\)?", output_text) + if match: + return match.group(1) + + # Second regex: Matches "[answer: (A-J)]" with optional leading characters like "." + match = re.search(r"\.*\[aA\]nswer:\s*\(?([A-J])\)?", output_text) + if match: + return match.group(1) + + # If neither regex matches, return None + return None + + +class ChainOfThoughtMetric(Metric): + """ + This metric focuses on structured reasoning and the accuracy of extracted answers. + It compares model outputs against correct answers provided in a multiple-choice + format and returns a score indicating the correctness of the generated response. + """ + + def evaluate_generation( + self, + adapter_spec: AdapterSpec, + request_state: RequestState, + metric_service: MetricService, + eval_cache_path: str, + ) -> List[Stat]: + """ + Evaluate the generated output for chain-of-thought reasoning accuracy. + + The method extracts the model's output, determines the correct answer + from the provided references, and compares the two to compute a binary score. + + Args: + adapter_spec (AdapterSpec): Specification of the adapter used for the evaluation. + request_state (RequestState): The state of the current request, including + the input instance, output results, and references. + metric_service (MetricService): A service used to compute metrics if needed. + eval_cache_path (str): Path to the evaluation cache for storing or retrieving data. + + Returns: + List[Stat]: A list containing a single `Stat` object with the correctness + score (1 for correct, 0 for incorrect) under the metric + name "chain_of_thought_correct". + """ + # Assert that completions exist if the result is not None + assert ( + request_state.result is not None and request_state.result.completions + ), "Request state result must have completions." + + # Set output_text if the assertion passes + output_text = request_state.result.completions[0].text + + # Extract the answer using the updated logic + extracted_answer = extract_answer(output_text) + + # Find the correct answer from references by translating index to letter + correct_answer = None + for index, option in enumerate(request_state.instance.references): + if option.is_correct: + correct_answer = chr(65 + index) # Translate index (0 -> A, 1 -> B, etc.) 
+ break + + # Raise an exception if no correct answer is found + if correct_answer is None: + raise ValueError(f"No correct answer found for instance ID {request_state.instance.id}") + + # Compare extracted answer with the correct answer and compute the score + score = 1 if extracted_answer == correct_answer else 0 + return [Stat(MetricName("chain_of_thought_correctness")).add(score)] diff --git a/src/helm/benchmark/run_specs/lite_run_specs.py b/src/helm/benchmark/run_specs/lite_run_specs.py index 38fbd4ecaa1..c995aa3e36c 100644 --- a/src/helm/benchmark/run_specs/lite_run_specs.py +++ b/src/helm/benchmark/run_specs/lite_run_specs.py @@ -21,11 +21,11 @@ get_generative_harms_metric_specs, get_generic_metric_specs, get_open_ended_generation_metric_specs, - MetricSpec, ) from helm.benchmark.run_spec import RunSpec, run_spec_function from helm.benchmark.runner import get_benchmark_output_path from helm.benchmark.scenarios.scenario import ScenarioSpec, get_scenario_cache_path +from helm.benchmark.metrics.metric import MetricSpec @run_spec_function("narrative_qa") @@ -137,25 +137,59 @@ def get_mmlu_spec(subject: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> Ru @run_spec_function("mmlu_pro") -def get_mmlu_pro_spec(subject: str) -> RunSpec: - scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.mmlu_pro.MMLUProScenario", args={"subject": subject} - ) +def get_mmlu_pro_spec(subject: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec: + # Convert to bools and remove the str versions + use_chain_of_thought_bool: bool = use_chain_of_thought == "True" + use_few_shot_bool: bool = use_few_shot == "True" + del use_chain_of_thought + del use_few_shot - adapter_spec = get_multiple_choice_adapter_spec( - method=ADAPT_MULTIPLE_CHOICE_JOINT, - instructions=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}.", - input_noun="Question", - output_noun="Answer", + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.mmlu_pro_scenario.MMLUProScenario", args={"subject": subject} ) + max_train_instance_num = 5 if use_few_shot_bool else 0 - return RunSpec( - name=f"mmlu_pro:subject={subject}", - scenario_spec=scenario_spec, - adapter_spec=adapter_spec, - metric_specs=get_exact_match_metric_specs(), - groups=["mmlu_pro"], - ) + if use_chain_of_thought_bool: + adapter_spec = AdapterSpec( + method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT, + max_train_instances=max_train_instance_num, + max_tokens=1000, + input_prefix="What is the correct answer to this question: ", + input_suffix="\nChoices:\n", + output_prefix="", + global_suffix=( + "Let’s think step by step. Based on your reasoning, what is the single, " + "most likely answer choice? Format your response as follows: " + '"The correct answer is (insert answer here)".' 
+ ), + ) + return RunSpec( + name=f"mmlu_pro:subset={subject},use_chain_of_thought={use_chain_of_thought_bool}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_exact_match_metric_specs() + + [ + MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}), + ], + groups=["mmlu_pro"], + ) + else: + adapter_spec = AdapterSpec( + method=ADAPT_MULTIPLE_CHOICE_JOINT, + max_train_instances=max_train_instance_num, + max_tokens=1000, + input_prefix="What is the correct answer to this question: ", + input_suffix="\nChoices:\n", + output_prefix="", + global_suffix=("Format your response as follows: " '"The correct answer is (insert answer here)".'), + ) + return RunSpec( + name=f"mmlu_pro:subset={subject},use_chain_of_thought={use_chain_of_thought_bool}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_exact_match_metric_specs(), + groups=["mmlu_pro"], + ) @run_spec_function("gsm") @@ -344,79 +378,57 @@ def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot ) max_train_instance_num = 5 if use_few_shot_bool else 0 - if use_few_shot_bool: - if use_chain_of_thought_bool: - adapter_spec = get_multiple_choice_adapter_spec( - method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT, - max_tokens=1000, # following original repo - max_train_instances=max_train_instance_num, - instructions=( - "Here are some example questions from experts. " - "An explanation is given before the final answer. " - "Answer the final question yourself, giving your reasoning beforehand." - ), - input_noun="Question", - input_suffix="\nChoices: \n", - reference_prefix="(A) ", - chain_of_thought_prefix="Let's think step by step: ", - chain_of_thought_suffix="The correct answer is ", - output_noun="", # will be overwritten with output_prefix - output_prefix="", - global_suffix=( - "Give step by step reasoning before you answer, and when you’re ready to answer, " - 'please use the format "The correct answer is (insert answer here)":' - ), - ) - else: - adapter_spec = get_multiple_choice_adapter_spec( - method=ADAPT_MULTIPLE_CHOICE_JOINT, - max_train_instances=max_train_instance_num, - instructions=( - "Here are some example questions from experts. " - "An explanation is given before the final answer. " - "Answer the final question yourself, giving your reasoning beforehand." - ), - input_noun="Question", - input_suffix="\nChoices: \n", - reference_prefix="(A) ", - output_noun="", # will be overwritten with output_prefix - output_prefix="The correct answer is ", - ) + if use_chain_of_thought_bool: + adapter_spec = get_multiple_choice_adapter_spec( + method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT, + max_tokens=1000, # following original repo + max_train_instances=max_train_instance_num, + instructions=( + "Here are some example questions from experts. " + "An explanation is given before the final answer. " + "Answer the final question yourself, giving your reasoning beforehand." 
+ ), + input_noun="Question", + input_suffix="\nChoices: \n", + reference_prefix="(A) ", + chain_of_thought_prefix="Let's think step by step: ", + chain_of_thought_suffix="The correct answer is ", + output_noun="", # will be overwritten with output_prefix + output_prefix="", + ) + return RunSpec( + name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_exact_match_metric_specs() + + [ + MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}), + ], + groups=["gpqa"], + ) else: - if use_chain_of_thought_bool: - adapter_spec = AdapterSpec( - method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT, - max_train_instances=max_train_instance_num, - max_tokens=1000, - input_prefix="What is the correct answer to this question: ", - input_suffix="\nChoices:\n", - output_prefix="", - reference_prefix="(A) ", - global_suffix=( - "Let’s think step by step. Based on your reasoning, what is the single, " - "most likely answer choice? Format your response as follows: " - '"The correct answer is (insert answer here)".' - ), - ) - else: - adapter_spec = AdapterSpec( - method=ADAPT_MULTIPLE_CHOICE_JOINT, - max_train_instances=max_train_instance_num, - max_tokens=1000, - input_prefix="What is the correct answer to this question: ", - input_suffix="\nChoices:\n", - output_prefix="", - reference_prefix="(A) ", - global_suffix=("Format your response as follows: " '"The correct answer is (insert answer here)".'), - ) + adapter_spec = get_multiple_choice_adapter_spec( + method=ADAPT_MULTIPLE_CHOICE_JOINT, + max_train_instances=max_train_instance_num, + instructions=( + "Here are some example questions from experts. " + "An explanation is given before the final answer. " + "Answer the final question yourself, giving your reasoning beforehand." 
+ ), + input_noun="Question", + input_suffix="\nChoices: \n", + reference_prefix="(A) ", + output_noun="", # will be overwritten with output_prefix + output_prefix="The correct answer is ", + ) - return RunSpec( - name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}", - scenario_spec=scenario_spec, - adapter_spec=adapter_spec, - metric_specs=get_exact_match_metric_specs(), # TODO: update this after cot metric is ready - groups=["gpqa"], - ) + return RunSpec( + name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_exact_match_metric_specs(), + groups=["gpqa"], + ) @run_spec_function("ifeval") diff --git a/src/helm/benchmark/scenarios/mmlu_pro.py b/src/helm/benchmark/scenarios/mmlu_pro_scenario.py similarity index 76% rename from src/helm/benchmark/scenarios/mmlu_pro.py rename to src/helm/benchmark/scenarios/mmlu_pro_scenario.py index a091387dc22..5d08d4f9d16 100644 --- a/src/helm/benchmark/scenarios/mmlu_pro.py +++ b/src/helm/benchmark/scenarios/mmlu_pro_scenario.py @@ -1,8 +1,17 @@ from typing import Dict, List -from datasets import load_dataset +from datasets import Dataset, load_dataset from helm.common.hierarchical_logger import hlog -from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output +from helm.benchmark.scenarios.scenario import ( + Scenario, + Instance, + Reference, + TRAIN_SPLIT, + TEST_SPLIT, + CORRECT_TAG, + Input, + Output, +) class MMLUProScenario(Scenario): @@ -33,7 +42,14 @@ def __init__(self, subject: str): super().__init__() self.subject: str = subject - def process_csv(self, data, split: str) -> List[Instance]: + def process_dataset(self, data: Dataset, split: str) -> List[Instance]: + """ + Process the dataset to create instances. + + :param data: Hugging Face `Dataset` containing the data for a specific split. + :param split: The data split (e.g., "train", "test"). + :return: A list of processed `Instance` objects. + """ instances: List[Instance] = [] hlog(f"Processing data for {split} split") for row in data: @@ -55,8 +71,14 @@ def answer_to_reference(answer: str) -> Reference: return instances def get_instances(self, output_path: str) -> List[Instance]: + """ + Load and process the MMLU-Pro dataset to create instances. + + :param output_path: Path to save or output the processed instances. + :return: A list of all processed `Instance` objects. 
+ """ # Load the MMLU-Pro dataset from Hugging Face - dataset = load_dataset("TIGER-Lab/MMLU-Pro") + dataset = load_dataset("TIGER-Lab/MMLU-Pro", revision="3373e0b") # Process all the instances instances: List[Instance] = [] @@ -66,6 +88,6 @@ def get_instances(self, output_path: str) -> List[Instance]: } for hf_split, split in splits.items(): data = dataset[hf_split].filter(lambda x: x["category"] == self.subject) - instances.extend(self.process_csv(data, split)) + instances.extend(self.process_dataset(data, split)) return instances diff --git a/src/helm/benchmark/scenarios/test_mmlu_pro_scenario.py b/src/helm/benchmark/scenarios/test_mmlu_pro_scenario.py index 8c1dc87a8c0..12ac71f0e01 100644 --- a/src/helm/benchmark/scenarios/test_mmlu_pro_scenario.py +++ b/src/helm/benchmark/scenarios/test_mmlu_pro_scenario.py @@ -1,7 +1,7 @@ import pytest from tempfile import TemporaryDirectory -from helm.benchmark.scenarios.mmlu_pro import MMLUProScenario +from helm.benchmark.scenarios.mmlu_pro_scenario import MMLUProScenario from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input, Output, Reference diff --git a/src/helm/benchmark/static/schema_lite_v2.yaml b/src/helm/benchmark/static/schema_lite_v2.yaml index 17d6fd58346..0d0026e3a30 100644 --- a/src/helm/benchmark/static/schema_lite_v2.yaml +++ b/src/helm/benchmark/static/schema_lite_v2.yaml @@ -93,6 +93,11 @@ metrics: short_display_name: IFEval Strict Acc description: Fraction of instructions in the instance that are correctly followed. lower_is_better: false + - name: chain_of_thought_correctness + display_name: COT correct + short_display_name: COT correct + description: Allows to do evaluation using chain of thought for mmlu pro and gpqa. + lower_is_better: false ############################################################ perturbations: [] @@ -154,32 +159,28 @@ run_groups: when: "?" language: English - - name: gpqa - display_name: GPQA - description: GPQA + - name: ifeval + display_name: IFEval + description: IFEval metric_groups: - accuracy - efficiency - general_information environment: - main_name: exact_match # non-CoT + main_name: ifeval_strict_accuracy main_split: test taxonomy: task: "?" - what: "?" - who: "?" - when: "?" - language: English - - - name: ifeval - display_name: IFEval - description: IFEval + + - name: gpqa + display_name: GPQA + description: GPQA metric_groups: - accuracy - efficiency - general_information environment: - main_name: ifeval_strict_accuracy + main_name: chain_of_thought_correct # non-CoT main_split: test taxonomy: task: "?" 
From b8a140f865c01510e4091c404c5024512aaa17ab Mon Sep 17 00:00:00 2001 From: Thallyson Alves Date: Fri, 6 Dec 2024 21:43:46 -0300 Subject: [PATCH 08/14] =?UTF-8?q?Adding=20ENEM=20Challenge=20Scenario=20&?= =?UTF-8?q?=20Maritaca=20AI=20model=20(Sabi=C3=A1=207B)=20(#3185)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Yifan Mai --- .../run_specs/enem_challenge_specs.py | 31 ++++ .../scenarios/enem_challenge_scenario.py | 58 +++++++ .../scenarios/test_enem_challenge_scenario.py | 53 +++++++ .../static/schema_enem_challenge.yaml | 146 ++++++++++++++++++ src/helm/config/model_deployments.yaml | 9 ++ src/helm/config/model_metadata.yaml | 10 ++ src/helm/config/tokenizer_configs.yaml | 8 + 7 files changed, 315 insertions(+) create mode 100644 src/helm/benchmark/run_specs/enem_challenge_specs.py create mode 100644 src/helm/benchmark/scenarios/enem_challenge_scenario.py create mode 100644 src/helm/benchmark/scenarios/test_enem_challenge_scenario.py create mode 100644 src/helm/benchmark/static/schema_enem_challenge.yaml diff --git a/src/helm/benchmark/run_specs/enem_challenge_specs.py b/src/helm/benchmark/run_specs/enem_challenge_specs.py new file mode 100644 index 00000000000..a06cf2ecee4 --- /dev/null +++ b/src/helm/benchmark/run_specs/enem_challenge_specs.py @@ -0,0 +1,31 @@ +from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT +from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec +from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs +from helm.benchmark.run_spec import RunSpec, run_spec_function +from helm.benchmark.scenarios.scenario import ScenarioSpec + + +@run_spec_function("enem_challenge") +def get_enem_spec() -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.enem_challenge_scenario.ENEMChallengeScenario", args={} + ) + + adapter_spec = get_multiple_choice_adapter_spec( + method=ADAPT_MULTIPLE_CHOICE_JOINT, + instructions="Dê uma resposta selecionando uma letra entre as opções fornecidas. " + "Se as opções forem A, B, C, D e E, " + "sua resposta deve consistir em uma única letra que corresponde a resposta correta.\n" + "Exemplo: Qual é a capital da França?\nA. Londres\nB. Paris\nC. Roma\nD. Berlim\nE. Sydney\n" + "Resposta: B", + input_noun="Pergunta", + output_noun="Resposta", + ) + + return RunSpec( + name="enem_challenge", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_exact_match_metric_specs(), + groups=["enem_challenge"], + ) diff --git a/src/helm/benchmark/scenarios/enem_challenge_scenario.py b/src/helm/benchmark/scenarios/enem_challenge_scenario.py new file mode 100644 index 00000000000..d05b2951868 --- /dev/null +++ b/src/helm/benchmark/scenarios/enem_challenge_scenario.py @@ -0,0 +1,58 @@ +from typing import List, Any +from pathlib import Path +from datasets import load_dataset + +from helm.benchmark.scenarios.scenario import ( + Scenario, + Instance, + Reference, + CORRECT_TAG, + TEST_SPLIT, + Input, + Output, +) + + +class ENEMChallengeScenario(Scenario): + """ + The Exame Nacional do Ensino Médio (ENEM) is an advanced High-School level exam widely applied + every year by the Brazilian government to students that wish to undertake a University degree. + + The questions are about all types of intelectual fields and they are divided into four groups + that are named as: Humanities, Languages, Sciences and Mathematics. 
+ + This scenario is based on the exams that were applied throughout the years of 2009 and 2023. + + The dataset can be found in this link: https://huggingface.co/datasets/eduagarcia/enem_challenge + """ + + name = "enem_challenge" + description = "ENEM Challenge dataset" + tags = ["knowledge", "multiple_choice", "pt-br"] + + def get_instances(self, output_path: str) -> List[Instance]: + # Download the raw data and read all the dialogues + dataset: Any + # Read all the instances + instances: List[Instance] = [] + cache_dir = str(Path(output_path) / "data") + + dataset = load_dataset("eduagarcia/enem_challenge", cache_dir=cache_dir) + for example in dataset["train"]: + question = example["question"] + choices = example["choices"] + answer = example["answerKey"] + # Skipping every canceled question! + if answer == "ANULADO": + continue + answers_dict = dict(zip(choices["label"], choices["text"])) + correct_answer = answers_dict[answer] + + def answer_to_reference(answer: str) -> Reference: + return Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else []) + + instance = Instance( + input=Input(text=question), split=TEST_SPLIT, references=list(map(answer_to_reference, choices["text"])) + ) + instances.append(instance) + return instances diff --git a/src/helm/benchmark/scenarios/test_enem_challenge_scenario.py b/src/helm/benchmark/scenarios/test_enem_challenge_scenario.py new file mode 100644 index 00000000000..db2fc0cff8f --- /dev/null +++ b/src/helm/benchmark/scenarios/test_enem_challenge_scenario.py @@ -0,0 +1,53 @@ +import pytest +from tempfile import TemporaryDirectory + +from helm.benchmark.scenarios.enem_challenge_scenario import ENEMChallengeScenario +from helm.benchmark.scenarios.scenario import TEST_SPLIT, CORRECT_TAG, Output, Reference + + +@pytest.mark.scenarios +def test_enem_challenge_scenario(): + enem_scenario = ENEMChallengeScenario() + with TemporaryDirectory() as tmpdir: + instances = enem_scenario.get_instances(tmpdir) + assert len(instances) == 1431 + assert instances[0].split == TEST_SPLIT + + assert instances[0].input.text.startswith( + "A atmosfera terrestre é composta pelos gases nitrogênio (N2) e oxigênio (O2)" + ) + assert len(instances[0].input.text) == 1163 + + assert instances[0].references == [ + Reference( + output=Output( + text="reduzir o calor irradiado pela Terra mediante a substituição da produção primária pela industrialização refrigerada. " # noqa: E501 + ), + tags=[], + ), + Reference( + output=Output( + text="promover a queima da biomassa vegetal, responsável pelo aumento do efeito estufa devido à produção de CH4. " # noqa: E501 + ), + tags=[], + ), + Reference( + output=Output( + text="reduzir o desmatamento, mantendo-se, assim, o potencial da vegetação em absorver o CO2 da atmosfera. " # noqa: E501 + ), + tags=[CORRECT_TAG], + ), + Reference( + output=Output( + text="aumentar a concentração atmosférica de H2O, molécula capaz de absorver grande quantidade de calor. " # noqa: E501 + ), + tags=[], + ), + Reference( + output=Output( + text="remover moléculas orgânicas polares da atmosfera, diminuindo a capacidade delas de reter calor. 
" # noqa: E501 + ), + tags=[], + ), + ] + assert instances[0].references[2].is_correct diff --git a/src/helm/benchmark/static/schema_enem_challenge.yaml b/src/helm/benchmark/static/schema_enem_challenge.yaml new file mode 100644 index 00000000000..f329a2d104d --- /dev/null +++ b/src/helm/benchmark/static/schema_enem_challenge.yaml @@ -0,0 +1,146 @@ +############################################################ +metrics: + # Infrastructure metrics: + - name: num_perplexity_tokens + display_name: '# tokens' + description: Average number of tokens in the predicted output (for language modeling, the input too). + - name: num_bytes + display_name: '# bytes' + description: Average number of bytes in the predicted output (for language modeling, the input too). + + - name: num_references + display_name: '# ref' + description: Number of references. + - name: num_train_trials + display_name: '# trials' + description: Number of trials, where in each trial we choose an independent, random set of training instances. + - name: estimated_num_tokens_cost + display_name: 'cost' + description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request. + - name: num_prompt_tokens + display_name: '# prompt tokens' + description: Number of tokens in the prompt. + - name: num_prompt_characters + display_name: '# prompt chars' + description: Number of characters in the prompt. + - name: num_completion_tokens + display_name: '# completion tokens' + description: Actual number of completion tokens (over all completions). + - name: num_output_tokens + display_name: '# output tokens' + description: Actual number of output tokens. + - name: max_num_output_tokens + display_name: 'Max output tokens' + description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences). + - name: num_requests + display_name: '# requests' + description: Number of distinct API requests. + - name: num_instances + display_name: '# eval' + description: Number of evaluation instances. + - name: num_train_instances + display_name: '# train' + description: Number of training instances (e.g., in-context examples). + - name: prompt_truncated + display_name: truncated + description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples). + - name: finish_reason_length + display_name: finish b/c length + description: Fraction of instances where the the output was terminated because of the max tokens limit. + - name: finish_reason_stop + display_name: finish b/c stop + description: Fraction of instances where the the output was terminated because of the stop sequences. + - name: finish_reason_endoftext + display_name: finish b/c endoftext + description: Fraction of instances where the the output was terminated because the end of text token was generated. + - name: finish_reason_unknown + display_name: finish b/c unknown + description: Fraction of instances where the the output was terminated for unknown reasons. + - name: num_completions + display_name: '# completions' + description: Number of completions. + - name: predicted_index + display_name: Predicted index + description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice). + + # Accuracy metrics: + - name: exact_match + display_name: Exact match + short_display_name: EM + description: Fraction of instances that the predicted output matches a correct reference exactly. 
+ lower_is_better: false + - name: quasi_exact_match + display_name: Quasi-exact match + short_display_name: EM + description: Fraction of instances that the predicted output matches a correct reference up to light processing. + lower_is_better: false + - name: prefix_exact_match + display_name: Prefix exact match + short_display_name: PEM + description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly. + lower_is_better: false + - name: quasi_prefix_exact_match + # TODO: should call this prefix_quasi_exact_match + display_name: Prefix quasi-exact match + short_display_name: PEM + description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing. + lower_is_better: false + + +############################################################ +perturbations: [] + +############################################################ +metric_groups: + - name: accuracy + display_name: Accuracy + metrics: + - name: ${main_name} + split: ${main_split} + + # - name: efficiency + # display_name: Efficiency + # metrics: + # - name: inference_runtime + # split: ${main_split} + + - name: general_information + display_name: General information + hide_win_rates: true + metrics: + - name: num_instances + split: ${main_split} + - name: num_train_instances + split: ${main_split} + - name: prompt_truncated + split: ${main_split} + - name: num_prompt_tokens + split: ${main_split} + - name: num_output_tokens + split: ${main_split} + +############################################################ +run_groups: + - name: core_scenarios + display_name: Core Scenarios + description: Core Scenarios + category: All scenarios + subgroups: + - enem_challenge + + - name: enem_challenge + display_name: ENEM Challenge + description: ENEM Challenge + metric_groups: + - accuracy + # - efficiency + - general_information + environment: + main_name: exact_match + main_split: test + taxonomy: + task: "multiple-choice question answering" + what: "general academic subjects" + who: "brazilian ministry of education" + when: "between 2009 and 2023" + language: Portuguese diff --git a/src/helm/config/model_deployments.yaml b/src/helm/config/model_deployments.yaml index 10f688461ae..bb9aed655f5 100644 --- a/src/helm/config/model_deployments.yaml +++ b/src/helm/config/model_deployments.yaml @@ -2872,3 +2872,12 @@ model_deployments: class_name: "helm.clients.huggingface_client.HuggingFaceClient" args: pretrained_model_name_or_path: ibm-granite/granite-3.0-1b-a400m-base + + - name: huggingface/sabia-7b + model_name: maritaca-ai/sabia-7b + tokenizer_name: maritaca-ai/sabia-7b + max_sequence_length: 2048 + client_spec: + class_name: "helm.clients.huggingface_client.HuggingFaceClient" + args: + pretrained_model_name_or_path: maritaca-ai/sabia-7b diff --git a/src/helm/config/model_metadata.yaml b/src/helm/config/model_metadata.yaml index 7fc8457beba..84227602ca2 100644 --- a/src/helm/config/model_metadata.yaml +++ b/src/helm/config/model_metadata.yaml @@ -3499,3 +3499,13 @@ models: num_parameters: 1380000000 release: 2024-10-21 tags: [TEXT_MODEL_TAG] + + - name: maritaca-ai/sabia-7b + display_name: Sabia 7B + description: Sabia 7B + creator_organization_name: MARITACA-AI + access: open + num_parameters: 6740000000 + release_date: 2023-11-08 + tags: [TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + diff --git a/src/helm/config/tokenizer_configs.yaml b/src/helm/config/tokenizer_configs.yaml index cbf96457f18..c851d58c5c8 100644 --- 
a/src/helm/config/tokenizer_configs.yaml +++ b/src/helm/config/tokenizer_configs.yaml @@ -750,3 +750,11 @@ tokenizer_configs: pretrained_model_name_or_path: ibm-granite/granite-3.0-1b-a400m-base end_of_text_token: "" prefix_token: "" + + - name: maritaca-ai/sabia-7b + tokenizer_spec: + class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" + args: + pretrained_model_name_or_path: maritaca-ai/sabia-7b + end_of_text_token: "" + prefix_token: "" \ No newline at end of file From 709336e0291afd5f0d38cf97361ca05805ab6e77 Mon Sep 17 00:00:00 2001 From: Yifan Mai Date: Fri, 6 Dec 2024 17:45:15 -0800 Subject: [PATCH 09/14] Release Lite and MMLU v1.11.0 leaderboards (#3204) --- helm-frontend/project_metadata.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/helm-frontend/project_metadata.json b/helm-frontend/project_metadata.json index dfaf297d16f..2bcd9a82f50 100644 --- a/helm-frontend/project_metadata.json +++ b/helm-frontend/project_metadata.json @@ -3,7 +3,7 @@ "title": "Lite", "description": "Lightweight, broad evaluation of the capabilities of language models using in-context learning", "id": "lite", - "releases": ["v1.10.0", "v1.9.0", "v1.8.0", "v1.7.0", "v1.6.0", "v1.5.0", "v1.4.0", "v1.3.0", "v1.2.0", "v1.1.0", "v1.0.0"] + "releases": ["v1.11.0", "v1.10.0", "v1.9.0", "v1.8.0", "v1.7.0", "v1.6.0", "v1.5.0", "v1.4.0", "v1.3.0", "v1.2.0", "v1.1.0", "v1.0.0"] }, { "title": "Classic", @@ -27,7 +27,7 @@ "title": "MMLU", "description": "Massive Multitask Language Understanding (MMLU) evaluations using standardized prompts", "id": "mmlu", - "releases": ["v1.10.0", "v1.9.0", "v1.8.0", "v1.7.0", "v1.6.0", "v1.5.0", "v1.4.0", "v1.3.0", "v1.2.0", "v1.1.0", "v1.0.0"] + "releases": ["v1.11.0", "v1.10.0", "v1.9.0", "v1.8.0", "v1.7.0", "v1.6.0", "v1.5.0", "v1.4.0", "v1.3.0", "v1.2.0", "v1.1.0", "v1.0.0"] }, { "title": "VHELM", From c9065e190c356c1f21ac4111de6df66d04f2e5a4 Mon Sep 17 00:00:00 2001 From: Yifan Mai Date: Fri, 6 Dec 2024 21:33:54 -0800 Subject: [PATCH 10/14] Rename Multimodality section to Papers in the documentation (#3203) --- mkdocs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs.yml b/mkdocs.yml index 167f7536a02..c4074561594 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -51,7 +51,7 @@ nav: - get_helm_rank.md - benchmark.md - huggingface_models.md - - Multimodality: + - Papers: - heim.md - vhelm.md - Reference: From d7a61c603b05da1627e467325a7c10dd4ba810ed Mon Sep 17 00:00:00 2001 From: Yifan Mai Date: Mon, 9 Dec 2024 17:21:17 -0800 Subject: [PATCH 11/14] Shorten run spec names for Unitxt runs (#3205) --- src/helm/benchmark/run_specs/unitxt_run_specs.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/helm/benchmark/run_specs/unitxt_run_specs.py b/src/helm/benchmark/run_specs/unitxt_run_specs.py index a7aebf7b811..d6620c42190 100644 --- a/src/helm/benchmark/run_specs/unitxt_run_specs.py +++ b/src/helm/benchmark/run_specs/unitxt_run_specs.py @@ -1,3 +1,5 @@ +import os + from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec from helm.benchmark.metrics.metric import MetricSpec from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs @@ -10,7 +12,12 @@ def get_unitxt_spec(**kwargs) -> RunSpec: card = kwargs.get("card") if not card: raise Exception("Unitxt card must be specified") - name_suffix = ",".join([f"{key}={value}" for key, value in kwargs.items()]) + if os.environ.get("HELM_UNITXT_SHORTEN_RUN_SPEC_NAMES", "").lower() == "true": + name_suffix 
= ",".join( + [f"{key}={value}" for key, value in kwargs.items() if key not in ["template_card_index", "loader_limit"]] + ) + else: + name_suffix = ",".join([f"{key}={value}" for key, value in kwargs.items()]) name = f"unitxt:{name_suffix}" scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.unitxt_scenario.UnitxtScenario", args=kwargs) adapter_spec = AdapterSpec( From 98d7d0b6c932f42353baddbbd05f2e687ed45344 Mon Sep 17 00:00:00 2001 From: Leonardo Schettini Date: Wed, 11 Dec 2024 17:02:57 +0100 Subject: [PATCH 12/14] feat: implement medcalc bench scenario, metrics and specs --- .../metrics/med_calc_bench_metrics.py | 172 ++++++++++++++++++ .../run_specs/med_calc_bench_specs.py | 54 ++++++ .../scenarios/med_calc_bench_scenario.py | 72 ++++++++ 3 files changed, 298 insertions(+) create mode 100644 src/helm/benchmark/metrics/med_calc_bench_metrics.py create mode 100644 src/helm/benchmark/run_specs/med_calc_bench_specs.py create mode 100644 src/helm/benchmark/scenarios/med_calc_bench_scenario.py diff --git a/src/helm/benchmark/metrics/med_calc_bench_metrics.py b/src/helm/benchmark/metrics/med_calc_bench_metrics.py new file mode 100644 index 00000000000..a44306d2ec7 --- /dev/null +++ b/src/helm/benchmark/metrics/med_calc_bench_metrics.py @@ -0,0 +1,172 @@ +import re +from datetime import datetime +from typing import List + +from helm.benchmark.adaptation.adapter_spec import AdapterSpec +from helm.benchmark.adaptation.request_state import RequestState +from helm.benchmark.metrics.metric import Metric +from helm.benchmark.metrics.metric_name import MetricName +from helm.benchmark.metrics.metric_service import MetricService +from helm.benchmark.metrics.statistic import Stat + + +class MedCalcBenchMetric(Metric): + def evaluate_generation( + self, + adapter_spec: AdapterSpec, + request_state: RequestState, + metric_service: MetricService, + eval_cache_path: str, + ) -> List[Stat]: + """Metric for MedCalc-Bench dataset. + + Original implementation: + https://github.com/ncbi-nlp/MedCalc-Bench/blob/048ba77dbe332e9190935e4a30965bff444b940e/evaluation/evaluate.py#L11 + """ + assert request_state.instance.extra_data, ( + "Could not find `extra_data` in the request state. " + "Both `lower_limit` and `upper_limit` are required for this metric." + ) + + assert len(request_state.result.completions) == 1, ( + f"Found a total of {len(request_state.result.completions)} completions. " + "Only one was expected" + ) + + final_answer = request_state.result.completions[0].text.strip().lower().split("final answer:")[-1].strip() + + try: + correctness = self.medcalc_bench_range_metric_calculation( + answer=final_answer, + ground_truth=request_state.instance.extra_data["ground_truth"], + calid=int(request_state.instance.extra_data["calculator_id"]), + upper_limit=request_state.instance.extra_data["upper_limit"], + lower_limit=request_state.instance.extra_data["lower_limit"], + ) + except ValueError: + raise ValueError( + "Failed to calculate the correctess of the output for a MedCalc-Bench instance." + ) + + stat = Stat(MetricName("medcalc_bench_metric")) + stat.add(int(correctness)) + + return [stat] + + def medcalc_bench_metric_calculation( + self, + answer: str, + ground_truth: str, + calid: int, + upper_limit: str, + lower_limit: str, + ) -> int: + """Calculate the metric for MedCalc-Bench dataset. 
+ + This method is basically a copy of the original implementation of this metric: + https://github.com/ncbi-nlp/MedCalc-Bench/blob/048ba77dbe332e9190935e4a30965bff444b940e/evaluation/evaluate.py#L11 + + Credits to the original authors: https://github.com/ncbi-nlp/MedCalc-Bench. + """ + if calid in [13, 68]: + # Output Type: date + + if datetime.strptime(answer, "%m/%d/%Y").strftime( + "%-m/%-d/%Y" + ) == datetime.strptime(ground_truth, "%m/%d/%Y").strftime("%-m/%-d/%Y"): + correctness = 1 + else: + correctness = 0 + elif calid in [69]: + # Output Type: integer (A, B) + match = re.search( + r"\(?[\"\']?(\d+)\s*(weeks?)?[\"\']?,?\s*[\"\']?(\d+)\s*(days?)?[\"\']?\s*\)?", + ground_truth, + ) + ground_truth = f"({match.group(1)}, {match.group(3)})" + match = re.search( + r"\(?[\"\']?(\d+)\s*(weeks?)?[\"\']?,?\s*[\"\']?(\d+)\s*(days?)?[\"\']?\s*\)?", + answer, + ) + if match: + weeks = match.group(1) + days = match.group(3) + answer = f"({weeks}, {days})" + if eval(answer) == eval(ground_truth): + correctness = 1 + else: + correctness = 0 + else: + correctness = 0 + elif calid in [ + 4, + 15, + 16, + 17, + 18, + 20, + 21, + 25, + 27, + 28, + 29, + 32, + 33, + 36, + 43, + 45, + 48, + 51, + 69, + ]: + # Output Type: integer A + answer = round(eval(answer)) + if answer == eval(ground_truth): + correctness = 1 + else: + correctness = 0 + elif calid in [ + 2, + 3, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 19, + 22, + 23, + 24, + 26, + 30, + 31, + 38, + 39, + 40, + 44, + 46, + 49, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + ]: + # Output Type: decimal + answer = eval(answer) + if answer >= eval(lower_limit) and answer <= eval(upper_limit): + correctness = 1 + else: + correctness = 0 + else: + raise ValueError(f"Unknown calculator ID: {calid}") + return correctness diff --git a/src/helm/benchmark/run_specs/med_calc_bench_specs.py b/src/helm/benchmark/run_specs/med_calc_bench_specs.py new file mode 100644 index 00000000000..fb3f5d58717 --- /dev/null +++ b/src/helm/benchmark/run_specs/med_calc_bench_specs.py @@ -0,0 +1,54 @@ +from typing import List + +from helm.benchmark.adaptation.common_adapter_specs import get_generation_adapter_spec +from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs +from helm.benchmark.metrics.metric import MetricSpec +from helm.benchmark.run_spec import RunSpec, run_spec_function +from helm.benchmark.scenarios.scenario import ScenarioSpec + + +@run_spec_function("med_calc_bench_zero_shot_cot") +def get_med_calc_bench_spec() -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.med_calc_bench_scenario.MedCalcBenchScenario", + args={}, + ) + + adapter_spec = get_generation_adapter_spec( + instructions=_get_zero_shot_cot_instructions(), + input_noun="Patient Note", + output_noun="Calculated Value", + max_tokens=50, + ) + + metric_specs: List[MetricSpec] = [ + MetricSpec( + class_name="helm.benchmark.metrics.med_calc_bench_metrics.MedCalcBenchMetric", + args={}, + ) + ] + get_basic_metric_specs([]) + + return RunSpec( + name="med_calc_bench", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=metric_specs, + groups=["med_calc_bench"], + ) + + +def _get_zero_shot_cot_instructions() -> str: + """Generate instructions for the MedCalcBench scenario. 
+ + This function is inspired on the system prompt definition in the original code: + https://github.com/ncbi-nlp/MedCalc-Bench/blob/048ba77dbe332e9190935e4a30965bff444b940e/evaluation/run.py#L16 + + Credits to the original authors: https://github.com/ncbi-nlp/MedCalc-Bench. + """ + + return ( + "You are a helpful assistant for calculating a score for a given patient note. " + "Please think step-by-step to solve the question and then generate the required score. " + "Your output should contain the step by step thinking and the final answer, which is a short and direct answer to the question. " + 'Before giving the final answer, write "Final Answer: " followed by the answer.' + ) diff --git a/src/helm/benchmark/scenarios/med_calc_bench_scenario.py b/src/helm/benchmark/scenarios/med_calc_bench_scenario.py new file mode 100644 index 00000000000..7221319f20d --- /dev/null +++ b/src/helm/benchmark/scenarios/med_calc_bench_scenario.py @@ -0,0 +1,72 @@ +import os +from typing import List, Dict +import json +from helm.common.general import ensure_file_downloaded +from helm.common.constants import TRAIN_SPLIT, TEST_SPLIT, CORRECT_TAG +from helm.benchmark.scenarios.scenario import Scenario, Instance, Reference, Input, Output + +class MedCalcBenchScenario(Scenario): + """ + MedCalcBench scenario: Processes a medical calculation dataset with explanations. + + Each record in the dataset has: + - Row Number + - Calculator ID + - Calculator Name + - Category + - Output Type + - Note ID + - Note Type + - Question + - Ground Truth Explanation + - Patient Note + - Relevant Entities + - Lower Limit + - Upper Limit + - Ground Truth Answer + + The output is formatted as: + "The answer is . Steps: " + """ + + # TODO: Add a base url + DATASET_DOWNLOAD_BASE_URL: str = "" + + name = "medcalcbench" + description = "Medical calculation questions with step-by-step explanations." 
+ tags = ["reasoning", "medicine", "calculation"] + + def get_instances(self, output_path: str) -> List[Instance]: + splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT} + instances: List[Instance] = [] + + for split, split_tag in splits.items(): # Iterate over the splits + source_url: str = f"{self.DATASET_DOWNLOAD_BASE_URL}/{split}.jsonl" + data_path: str = os.path.join(output_path, f"med_calc_bench_{split}") + ensure_file_downloaded(source_url=source_url, target_path=data_path) + + with open(data_path, "r", encoding="utf-8") as f: + for line in f: + example: Dict = json.loads(line.strip()) + question = example["Question"] + patient_note = example["Patient_Note"] + + input_text = f"Patient Note:\n\n{patient_note}\n\nQuestion:\n\n{question}" + + # Format the final answer with explanation + instances.append( + Instance( + input=Input(text=input_text), + references=[Reference(Output(text=example["Ground_Truth_Answer"]), tags=[CORRECT_TAG])], + split=split_tag, + extra_data={ + "relevant_entities": example["Relevant_Entities"], + "lower_limit": example["Lower_Limit"], + "upper_limit": example["Upper_Limit"], + "calculator_id": example["Calculator ID"], + "ground_truth": example["Ground_Truth_Answer"], + } + ) + ) + + return instances \ No newline at end of file From bb15f352cfa86f7e9294fde424aab91648066a82 Mon Sep 17 00:00:00 2001 From: Leonardo Schettini Date: Thu, 12 Dec 2024 16:05:10 +0100 Subject: [PATCH 13/14] feat: med calc bench one shot spec --- .../run_specs/med_calc_bench_specs.py | 99 ++++++++++++++++++- 1 file changed, 94 insertions(+), 5 deletions(-) diff --git a/src/helm/benchmark/run_specs/med_calc_bench_specs.py b/src/helm/benchmark/run_specs/med_calc_bench_specs.py index fb3f5d58717..216e1b05382 100644 --- a/src/helm/benchmark/run_specs/med_calc_bench_specs.py +++ b/src/helm/benchmark/run_specs/med_calc_bench_specs.py @@ -1,4 +1,5 @@ -from typing import List +import json +from typing import Dict, List from helm.benchmark.adaptation.common_adapter_specs import get_generation_adapter_spec from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs @@ -6,9 +7,11 @@ from helm.benchmark.run_spec import RunSpec, run_spec_function from helm.benchmark.scenarios.scenario import ScenarioSpec +ONE_SHOT_EXAMPLES_URL = "https://raw.githubusercontent.com/ncbi-nlp/MedCalc-Bench/048ba77dbe332e9190935e4a30965bff444b940e/evaluation/one_shot_finalized_explanation.json" + @run_spec_function("med_calc_bench_zero_shot_cot") -def get_med_calc_bench_spec() -> RunSpec: +def get_med_calc_bench_zero_shot_spec() -> RunSpec: scenario_spec = ScenarioSpec( class_name="helm.benchmark.scenarios.med_calc_bench_scenario.MedCalcBenchScenario", args={}, @@ -16,9 +19,52 @@ def get_med_calc_bench_spec() -> RunSpec: adapter_spec = get_generation_adapter_spec( instructions=_get_zero_shot_cot_instructions(), - input_noun="Patient Note", + input_noun=None, # Set directly in the scenario. 
+ output_noun="Calculated Value", + max_tokens=500, + ) + + metric_specs: List[MetricSpec] = [ + MetricSpec( + class_name="helm.benchmark.metrics.med_calc_bench_metrics.MedCalcBenchMetric", + args={}, + ) + ] + get_basic_metric_specs([]) + + return RunSpec( + name="med_calc_bench", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=metric_specs, + groups=["med_calc_bench"], + ) + + +@run_spec_function("med_calc_bench_one_shot_cot") +def get_med_calc_bench_one_shot_spec() -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.med_calc_bench_scenario.MedCalcBenchScenario", + args={}, + ) + + adapter_spec = get_generation_adapter_spec( + instructions=_get_one_shot_cot_instructions( + # TODO: Modify this to retrieve the question and calculator ID from the respective dataset sample. + # For more information see the docstring for the `_get_one_shot_cot_instructions` function. + # One way of doing so is having receiving the calculator ID in this function and passing it to + # the scenario, which can then filter the dataset samples by the calculator ID. + question=( + "What is the patient's Creatinine Clearance using the Cockroft-Gault Equation in terms of mL/min? " + "You should use the patient's adjusted body weight in kg instead of the patient's actual body " + "weight if the patient is overweight or obese based on their BMI. If the patient's BMI's normal, " + "set their adjusted body weight to the minimum of the ideal body and actual weight. If the " + "patient is underweight, please set their adjusted body weight to their actual body weight." + ), + calculator_id="2", + ), + input_noun=None, # Set directly in the scenario. output_noun="Calculated Value", - max_tokens=50, + max_tokens=500, ) metric_specs: List[MetricSpec] = [ @@ -50,5 +96,48 @@ def _get_zero_shot_cot_instructions() -> str: "You are a helpful assistant for calculating a score for a given patient note. " "Please think step-by-step to solve the question and then generate the required score. " "Your output should contain the step by step thinking and the final answer, which is a short and direct answer to the question. " - 'Before giving the final answer, write "Final Answer: " followed by the answer.' + 'Before giving the final answer, write "Calculated Value: " followed by the answer.' + ) + + +def _get_one_shot_cot_instructions(question: str, calculator_id: str) -> str: + """Generate instructions for the MedCalcBench scenario. + + This function is inspired on the system prompt definition in the original code: + https://github.com/ncbi-nlp/MedCalc-Bench/blob/048ba77dbe332e9190935e4a30965bff444b940e/evaluation/run.py#L26 + + Credits to the original authors: https://github.com/ncbi-nlp/MedCalc-Bench. + + In the original code, there's exactly one example response for each calculator ID. + These examples are stored in a JSON file: https://github.com/ncbi-nlp/MedCalc-Bench/blob/048ba77dbe332e9190935e4a30965bff444b940e/evaluation/one_shot_finalized_explanation.json + None of the examples include the actual questions. They only contain the step-by-step thinking and the final answer. + Looking at the dataset samples we can see that all samples with the same calculator ID use the same question. + The original expect that for each sample, we collect the calculator ID and the question for building the one-shot instructions. 
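+
+    For reference, the JSON structure assumed here (inferred from how the example is
+    accessed below) looks roughly like:
+
+        {
+            "<calculator_id>": {
+                "Patient Note": "...",
+                "Response": {"step_by_step_thinking": "...", "answer": "..."}
+            }
+        }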
+ """ + examples: Dict = {} + with open(ONE_SHOT_EXAMPLES_URL, "r") as f: + examples = json.load(f) + + if not examples: + raise ValueError( + "Failed to load one-shot examples for the MedCalcBench scenario." + ) + + example = examples.get(calculator_id, {}) + + if not example: + raise ValueError( + f"Failed to find one-shot example for calculator ID {calculator_id}." + ) + + return ( + "You are a helpful assistant for calculating a score for a given patient note. " + "Please think step-by-step to solve the question and then generate the required score. " + "Your output should contain the step by step thinking and the final answer, which is a short and direct answer to the question. " + "\nBelow is an example:" + # This example follows the formatting of the respective scenario. + f"Patient Note:\n\n{example['Patient Note']}" + f"\n\nQuestion:\n\n{question}" + f"\n\nExplanation:\n\n{example['Response']['step_by_step_thinking']}" + f"\n\nCalculated Value: {example['Response']['answer']}" ) From 792fb4f9e3de928a9165395fbf12664c72f1a96a Mon Sep 17 00:00:00 2001 From: Leonardo Schettini Date: Tue, 17 Dec 2024 16:19:30 +0100 Subject: [PATCH 14/14] fix: dataset loading and standardize naming --- ...ch_metrics.py => medcalc_bench_metrics.py} | 51 ++++++---- ..._bench_specs.py => medcalc_bench_specs.py} | 65 ++++++------- .../scenarios/med_calc_bench_scenario.py | 72 -------------- .../scenarios/medcalc_bench_scenario.py | 96 +++++++++++++++++++ 4 files changed, 153 insertions(+), 131 deletions(-) rename src/helm/benchmark/metrics/{med_calc_bench_metrics.py => medcalc_bench_metrics.py} (75%) rename src/helm/benchmark/run_specs/{med_calc_bench_specs.py => medcalc_bench_specs.py} (77%) delete mode 100644 src/helm/benchmark/scenarios/med_calc_bench_scenario.py create mode 100644 src/helm/benchmark/scenarios/medcalc_bench_scenario.py diff --git a/src/helm/benchmark/metrics/med_calc_bench_metrics.py b/src/helm/benchmark/metrics/medcalc_bench_metrics.py similarity index 75% rename from src/helm/benchmark/metrics/med_calc_bench_metrics.py rename to src/helm/benchmark/metrics/medcalc_bench_metrics.py index a44306d2ec7..346528cf1b2 100644 --- a/src/helm/benchmark/metrics/med_calc_bench_metrics.py +++ b/src/helm/benchmark/metrics/medcalc_bench_metrics.py @@ -8,6 +8,7 @@ from helm.benchmark.metrics.metric_name import MetricName from helm.benchmark.metrics.metric_service import MetricService from helm.benchmark.metrics.statistic import Stat +from helm.common.hierarchical_logger import hlog class MedCalcBenchMetric(Metric): @@ -33,25 +34,33 @@ def evaluate_generation( "Only one was expected" ) - final_answer = request_state.result.completions[0].text.strip().lower().split("final answer:")[-1].strip() - - try: - correctness = self.medcalc_bench_range_metric_calculation( - answer=final_answer, - ground_truth=request_state.instance.extra_data["ground_truth"], - calid=int(request_state.instance.extra_data["calculator_id"]), - upper_limit=request_state.instance.extra_data["upper_limit"], - lower_limit=request_state.instance.extra_data["lower_limit"], - ) - except ValueError: - raise ValueError( - "Failed to calculate the correctess of the output for a MedCalc-Bench instance." 
- ) + final_answer = ( + request_state.result.completions[0] + .text.strip() + .lower() + .split("calculated value:")[-1] + .strip() + ) - stat = Stat(MetricName("medcalc_bench_metric")) - stat.add(int(correctness)) + correctness = 0 + if final_answer: + try: + correctness = self.medcalc_bench_metric_calculation( + answer=final_answer, + ground_truth=request_state.instance.extra_data["ground_truth"], + calid=int(request_state.instance.extra_data["calculator_id"]), + upper_limit=request_state.instance.extra_data["upper_limit"], + lower_limit=request_state.instance.extra_data["lower_limit"], + ) + except ValueError as e: + hlog( + ( + "Failed to calculate the correctess of the output for MedCalc-Bench instance " + f'with id {request_state.instance.extra_data["id"]}: {e}' + ) + ) - return [stat] + return [Stat(MetricName("medcalc_bench_metric")).add(correctness)] def medcalc_bench_metric_calculation( self, @@ -120,8 +129,8 @@ def medcalc_bench_metric_calculation( 69, ]: # Output Type: integer A - answer = round(eval(answer)) - if answer == eval(ground_truth): + answer = round(int(answer)) + if answer == int(ground_truth): correctness = 1 else: correctness = 0 @@ -162,8 +171,8 @@ def medcalc_bench_metric_calculation( 67, ]: # Output Type: decimal - answer = eval(answer) - if answer >= eval(lower_limit) and answer <= eval(upper_limit): + answer = float(answer) + if answer >= float(lower_limit) and answer <= float(upper_limit): correctness = 1 else: correctness = 0 diff --git a/src/helm/benchmark/run_specs/med_calc_bench_specs.py b/src/helm/benchmark/run_specs/medcalc_bench_specs.py similarity index 77% rename from src/helm/benchmark/run_specs/med_calc_bench_specs.py rename to src/helm/benchmark/run_specs/medcalc_bench_specs.py index 216e1b05382..8bdb5374751 100644 --- a/src/helm/benchmark/run_specs/med_calc_bench_specs.py +++ b/src/helm/benchmark/run_specs/medcalc_bench_specs.py @@ -1,6 +1,7 @@ import json from typing import Dict, List +from helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.benchmark.adaptation.common_adapter_specs import get_generation_adapter_spec from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs from helm.benchmark.metrics.metric import MetricSpec @@ -10,44 +11,47 @@ ONE_SHOT_EXAMPLES_URL = "https://raw.githubusercontent.com/ncbi-nlp/MedCalc-Bench/048ba77dbe332e9190935e4a30965bff444b940e/evaluation/one_shot_finalized_explanation.json" -@run_spec_function("med_calc_bench_zero_shot_cot") -def get_med_calc_bench_zero_shot_spec() -> RunSpec: +@run_spec_function("medcalc_bench") +def get_medcalc_bench_spec(method: str) -> RunSpec: scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.med_calc_bench_scenario.MedCalcBenchScenario", + class_name="helm.benchmark.scenarios.medcalc_bench_scenario.MedCalcBenchScenario", args={}, ) - adapter_spec = get_generation_adapter_spec( - instructions=_get_zero_shot_cot_instructions(), - input_noun=None, # Set directly in the scenario. 
- output_noun="Calculated Value", - max_tokens=500, - ) + if method == "zero_shot": + adapter_spec = get_medcalc_bench_zero_shot_adapter() + elif method == "one_shot": + adapter_spec = get_medcalc_bench_one_shot_adapter() + else: + raise ValueError(f"Invalid method for MedCalc-Bench: {method}") metric_specs: List[MetricSpec] = [ MetricSpec( - class_name="helm.benchmark.metrics.med_calc_bench_metrics.MedCalcBenchMetric", + class_name="helm.benchmark.metrics.medcalc_bench_metrics.MedCalcBenchMetric", args={}, ) - ] + get_basic_metric_specs([]) + ] # + get_basic_metric_specs([]) return RunSpec( - name="med_calc_bench", + name=f"medcalc_bench:method{method}", scenario_spec=scenario_spec, adapter_spec=adapter_spec, metric_specs=metric_specs, - groups=["med_calc_bench"], + groups=["medcalc_bench"], ) -@run_spec_function("med_calc_bench_one_shot_cot") -def get_med_calc_bench_one_shot_spec() -> RunSpec: - scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.med_calc_bench_scenario.MedCalcBenchScenario", - args={}, +def get_medcalc_bench_zero_shot_adapter() -> AdapterSpec: + return get_generation_adapter_spec( + instructions=_get_zero_shot_cot_instructions(), + input_noun=None, # Set directly in the scenario. + output_noun="\n\nCalculated Value", + max_tokens=500, ) - adapter_spec = get_generation_adapter_spec( + +def get_medcalc_bench_one_shot_adapter() -> AdapterSpec: + return get_generation_adapter_spec( instructions=_get_one_shot_cot_instructions( # TODO: Modify this to retrieve the question and calculator ID from the respective dataset sample. # For more information see the docstring for the `_get_one_shot_cot_instructions` function. @@ -63,28 +67,13 @@ def get_med_calc_bench_one_shot_spec() -> RunSpec: calculator_id="2", ), input_noun=None, # Set directly in the scenario. - output_noun="Calculated Value", + output_noun="\n\nCalculated Value", max_tokens=500, ) - metric_specs: List[MetricSpec] = [ - MetricSpec( - class_name="helm.benchmark.metrics.med_calc_bench_metrics.MedCalcBenchMetric", - args={}, - ) - ] + get_basic_metric_specs([]) - - return RunSpec( - name="med_calc_bench", - scenario_spec=scenario_spec, - adapter_spec=adapter_spec, - metric_specs=metric_specs, - groups=["med_calc_bench"], - ) - def _get_zero_shot_cot_instructions() -> str: - """Generate instructions for the MedCalcBench scenario. + """Generate instructions for the MedCalc-Bench scenario. This function is inspired on the system prompt definition in the original code: https://github.com/ncbi-nlp/MedCalc-Bench/blob/048ba77dbe332e9190935e4a30965bff444b940e/evaluation/run.py#L16 @@ -101,7 +90,7 @@ def _get_zero_shot_cot_instructions() -> str: def _get_one_shot_cot_instructions(question: str, calculator_id: str) -> str: - """Generate instructions for the MedCalcBench scenario. + """Generate instructions for the MedCalc-Bench scenario. This function is inspired on the system prompt definition in the original code: https://github.com/ncbi-nlp/MedCalc-Bench/blob/048ba77dbe332e9190935e4a30965bff444b940e/evaluation/run.py#L26 @@ -120,7 +109,7 @@ def _get_one_shot_cot_instructions(question: str, calculator_id: str) -> str: if not examples: raise ValueError( - "Failed to load one-shot examples for the MedCalcBench scenario." + "Failed to load one-shot examples for the MedCalc-Bench scenario." 
) example = examples.get(calculator_id, {}) diff --git a/src/helm/benchmark/scenarios/med_calc_bench_scenario.py b/src/helm/benchmark/scenarios/med_calc_bench_scenario.py deleted file mode 100644 index 7221319f20d..00000000000 --- a/src/helm/benchmark/scenarios/med_calc_bench_scenario.py +++ /dev/null @@ -1,72 +0,0 @@ -import os -from typing import List, Dict -import json -from helm.common.general import ensure_file_downloaded -from helm.common.constants import TRAIN_SPLIT, TEST_SPLIT, CORRECT_TAG -from helm.benchmark.scenarios.scenario import Scenario, Instance, Reference, Input, Output - -class MedCalcBenchScenario(Scenario): - """ - MedCalcBench scenario: Processes a medical calculation dataset with explanations. - - Each record in the dataset has: - - Row Number - - Calculator ID - - Calculator Name - - Category - - Output Type - - Note ID - - Note Type - - Question - - Ground Truth Explanation - - Patient Note - - Relevant Entities - - Lower Limit - - Upper Limit - - Ground Truth Answer - - The output is formatted as: - "The answer is . Steps: " - """ - - # TODO: Add a base url - DATASET_DOWNLOAD_BASE_URL: str = "" - - name = "medcalcbench" - description = "Medical calculation questions with step-by-step explanations." - tags = ["reasoning", "medicine", "calculation"] - - def get_instances(self, output_path: str) -> List[Instance]: - splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT} - instances: List[Instance] = [] - - for split, split_tag in splits.items(): # Iterate over the splits - source_url: str = f"{self.DATASET_DOWNLOAD_BASE_URL}/{split}.jsonl" - data_path: str = os.path.join(output_path, f"med_calc_bench_{split}") - ensure_file_downloaded(source_url=source_url, target_path=data_path) - - with open(data_path, "r", encoding="utf-8") as f: - for line in f: - example: Dict = json.loads(line.strip()) - question = example["Question"] - patient_note = example["Patient_Note"] - - input_text = f"Patient Note:\n\n{patient_note}\n\nQuestion:\n\n{question}" - - # Format the final answer with explanation - instances.append( - Instance( - input=Input(text=input_text), - references=[Reference(Output(text=example["Ground_Truth_Answer"]), tags=[CORRECT_TAG])], - split=split_tag, - extra_data={ - "relevant_entities": example["Relevant_Entities"], - "lower_limit": example["Lower_Limit"], - "upper_limit": example["Upper_Limit"], - "calculator_id": example["Calculator ID"], - "ground_truth": example["Ground_Truth_Answer"], - } - ) - ) - - return instances \ No newline at end of file diff --git a/src/helm/benchmark/scenarios/medcalc_bench_scenario.py b/src/helm/benchmark/scenarios/medcalc_bench_scenario.py new file mode 100644 index 00000000000..0d28cdc0c86 --- /dev/null +++ b/src/helm/benchmark/scenarios/medcalc_bench_scenario.py @@ -0,0 +1,96 @@ +import json +import os +from typing import Dict, List + +from datasets import DatasetDict, load_dataset + +from helm.benchmark.scenarios.scenario import ( + CORRECT_TAG, + TEST_SPLIT, + TRAIN_SPLIT, + Input, + Instance, + Output, + Reference, + Scenario, +) +from helm.common.general import ensure_directory_exists + + +class MedCalcBenchScenario(Scenario): + """ + MedCalcBench scenario: Processes a medical calculation dataset with explanations. 
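+
+    Instances are loaded from the Hugging Face dataset ncbi/MedCalc-Bench-v1.0 (train and
+    test splits); the calculator ID and the answer limits needed by the metric are carried
+    in each instance's extra_data.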
+ + Each record in the dataset has: + - Row Number + - Calculator ID + - Calculator Name + - Category + - Output Type + - Note ID + - Note Type + - Question + - Ground Truth Explanation + - Patient Note + - Relevant Entities + - Lower Limit + - Upper Limit + - Ground Truth Answer + + The output is formatted as: + "The answer is . Steps: " + """ + + HUGGING_FACE_DATASET_PATH: str = "ncbi/MedCalc-Bench-v1.0" + + # TODO: Add a base url + DATASET_DOWNLOAD_BASE_URL: str = "" + + name = "medcalcbench" + description = "Medical calculation questions with step-by-step explanations." + tags = ["reasoning", "medicine", "calculation"] + + def get_instances(self, output_path: str) -> List[Instance]: + data_path: str = os.path.join(output_path, "data") + ensure_directory_exists(data_path) + dataset: DatasetDict = load_dataset(self.HUGGING_FACE_DATASET_PATH) + + splits = {TRAIN_SPLIT: "train", TEST_SPLIT: "test"} + instances: List[Instance] = [] + for ( + helm_split_name, + dataset_split_name, + ) in splits.items(): # Iterate over the splits + split_data = dataset[dataset_split_name] + + for example in split_data: + question = example["Question"] + patient_note = example["Patient Note"] + + input_text = ( + f"Patient Note:\n\n{patient_note}\n\nQuestion:\n\n{question}" + ) + + # Format the final answer with explanation + instances.append( + Instance( + input=Input(text=input_text), + references=[ + Reference( + Output(text=example["Ground Truth Answer"]), + tags=[CORRECT_TAG], + ) + ], + split=helm_split_name, + extra_data={ + "id": example["Row Number"], + "relevant_entities": example["Relevant Entities"], + "lower_limit": example["Lower Limit"], + "upper_limit": example["Upper Limit"], + "calculator_id": example["Calculator ID"], + "ground_truth": example["Ground Truth Answer"], + }, + ) + ) + + return instances
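A note on the scoring rule introduced in the patches above: for decimal-valued calculators, MedCalcBenchMetric keeps only the text after "Calculated Value:" and counts the prediction as correct when it parses to a number inside [Lower Limit, Upper Limit]. The standalone sketch below illustrates that rule outside of HELM; the function name and the hard-coded examples are illustrative only, and the real metric additionally handles the date, week/day-pair, and integer output types shown in medcalc_bench_metrics.py.

def score_decimal(completion: str, lower_limit: str, upper_limit: str) -> int:
    # Keep only the text after the "Calculated Value:" marker, as the metric does.
    final_answer = completion.strip().lower().split("calculated value:")[-1].strip()
    try:
        value = float(final_answer)
    except ValueError:
        # Unparseable answers score 0 instead of propagating the error.
        return 0
    return int(float(lower_limit) <= value <= float(upper_limit))


if __name__ == "__main__":
    assert score_decimal("step-by-step reasoning...\n\nCalculated Value: 93.5", "90.0", "95.0") == 1
    assert score_decimal("Calculated Value: about ninety", "90.0", "95.0") == 0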