diff --git a/.github/workflows/pytest-check.yml b/.github/workflows/pytest-check.yml
index 7b01c573..46a96f31 100644
--- a/.github/workflows/pytest-check.yml
+++ b/.github/workflows/pytest-check.yml
@@ -45,7 +45,7 @@ jobs:
           GITHUB_ACTION: 1
       - name: Surface failing tests
         if: always()
-        uses: pmeier/pytest-results-action@multi-testsuites
+        uses: pmeier/pytest-results-action@v0.7.1
        with:
          # A list of JUnit XML files, directories containing the former, and wildcard
          # patterns to process.
diff --git a/README.md b/README.md
index af340433..34deea90 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ Training Utilization
 
 - **Blazingly Fast:** By managing the KV Cache of prefixes, we can speed up local inference by up to 6x 🚀.
-- **Comprehensive Evaluation:** 56+ commonly used [datasets](https://github.com/RUCAIBox/LLMBox/blob/main/docs/utilization/supported-datasets.md) and benchmarks in evaluating LLMs.
+- **Comprehensive Evaluation:** 59+ commonly used [datasets](https://github.com/RUCAIBox/LLMBox/blob/main/docs/utilization/supported-datasets.md) and benchmarks in evaluating LLMs.
 - **Evaluation Methods:** Accurately reproduce results from original papers of OpenAI, LLaMA, Mistral, and other models.
 - **In-Context Learning:** We support various ICL strategies, including [`KATE`](https://aclanthology.org/2022.deelio-1.10/), [`GlobalE`](https://aclanthology.org/2022.acl-long.556/), and [`APE`](https://arxiv.org/abs/2211.01910).
 - **Chain-of-Thought:** For some datasets, we support three types of CoT evaluation: `base`, [`least-to-most`](https://arxiv.org/abs/2205.10625), and [`pal`](https://arxiv.org/abs/2211.10435).
@@ -140,7 +140,7 @@ For more details, view the [training](https://github.com/RUCAIBox/LLMBox/tree/ma
 
 We provide a broad support on Huggingface models (e.g. `LLaMA-3`, `Mistral`, or the model you are building on), OpenAI, Anthropic, QWen and other OpenAI-compatible models for further utilization. Full list of model backends: [here](https://github.com/RUCAIBox/LLMBox/tree/main/utilization#supported-models).
 
-Currently a total of 56+ commonly used datasets are supported, including: `HellaSwag`, `MMLU`, `GSM8K`, `GPQA`, `AGIEval`, `CEval`, and `CMMLU`. Full list of datasets: [here](https://github.com/RUCAIBox/LLMBox/blob/main/docs/utilization/supported-datasets.md).
+Currently a total of 59+ commonly used datasets are supported, including: `HellaSwag`, `MMLU`, `GSM8K`, `GPQA`, `AGIEval`, `CEval`, and `CMMLU`. Full list of datasets: [here](https://github.com/RUCAIBox/LLMBox/blob/main/docs/utilization/supported-datasets.md).
 
 ```bash
 CUDA_VISIBLE_DEVICES=0 python inference.py \
@@ -151,7 +151,9 @@ CUDA_VISIBLE_DEVICES=0 python inference.py \
   --ranking_type ppl_no_option
 ```
 
-See [benchmarking LLaMA3](https://github.com/RUCAIBox/LLMBox/blob/main/docs/utilization/benchmarking_llama3.md) for more examples.
+- 🔥 Recently supported datasets: `imbue_code`, `imbue_public`, and `imbue_private`.
+
+- 🔥 See [benchmarking LLaMA3](https://github.com/RUCAIBox/LLMBox/blob/main/docs/utilization/benchmarking_llama3.md) for more examples.
diff --git a/docs/utilization/supported-datasets.md b/docs/utilization/supported-datasets.md
--- a/docs/utilization/supported-datasets.md
+++ b/docs/utilization/supported-datasets.md
@@ -177,7 +185,7 @@ You can find the supported datasets in the following table.
   <tr>
       <td>Google-Proof Q&A (<code>GPQA</code>)</td>
       <td><code>gpqa_main</code> (default), <code>gpqa_extended</code>, ...</td>
-      <td>MultipleChoiceDataset</td>
+      <td>MultipleChoice</td>
       <td>✅</td>
       <td></td>
   </tr>
+  <tr>
+      <td>Imbue Code Comprehension (<code>imbue_code</code>)</td>
+      <td>/</td>
+      <td>MultipleChoice</td>
+      <td></td>
+      <td></td>
+  </tr>
+  <tr>
+      <td>Imbue High Quality Private Evaluations (<code>imbue_private</code>)</td>
+      <td>/</td>
+      <td>MultipleChoice</td>
+      <td></td>
+      <td></td>
+  </tr>
+  <tr>
+      <td>Imbue High Quality Public Evaluations (<code>imbue_public</code>)</td>
+      <td>/</td>
+      <td>MultipleChoice</td>
+      <td></td>
+      <td></td>
+  </tr>
   <tr>
       <td>LAnguage Modeling Broadened to Account for Discourse Aspects (<code>lambada</code>)</td>
       <td><code>default</code> (default), <code>de</code>, ... (source: EleutherAI/lambada_openai)</td>
diff --git a/tests/dry_test/test_datasets.py b/tests/dry_test/test_datasets.py
index ab01b11e..e3b021a2 100644
--- a/tests/dry_test/test_datasets.py
+++ b/tests/dry_test/test_datasets.py
@@ -1,11 +1,11 @@
import nltk
import pytest
-
from utilization.utils.logging import list_datasets
from .fixtures import *
nltk.download('punkt')
+nltk.download('punkt_tab')
datasets = {
"agieval": [],
diff --git a/tests/dry_test/test_models.py b/tests/dry_test/test_models.py
index d0203a1c..e7c9ec2e 100644
--- a/tests/dry_test/test_models.py
+++ b/tests/dry_test/test_models.py
@@ -26,3 +26,5 @@ def test_models_dry_run(run_evaluate, model, dataset, extra_args):
run_evaluate(["-m", model, "-d", dataset, "-b", "10", "--dry_run"] + extra_args, cuda=0)
except torch.cuda.OutOfMemoryError:
pytest.skip(f"Out of memory error on {model} {dataset}")
+ except FileNotFoundError:
+ pytest.skip(f"File not found error on {model} {dataset}")
diff --git a/utilization/dataset/imbue_code.py b/utilization/dataset/imbue_code.py
new file mode 100644
index 00000000..76451129
--- /dev/null
+++ b/utilization/dataset/imbue_code.py
@@ -0,0 +1,37 @@
+from functools import cached_property
+from logging import getLogger
+
+from .multiple_choice_dataset import MultipleChoiceDataset
+
+logger = getLogger(__name__)
+
+
+class ImbueCode(MultipleChoiceDataset):
+ """The dataset of Imbue code understanding questions.
+
+ These examples fall into 2 categories:
+ - "cloze": fill in the hole to produce the specified outcome;
+ - "eval": given a snippet of python code, determine the outcome.
+    Some questions are very easy, while others are much more challenging. Most (if not all) of these questions should be relatively straightforward for an experienced programmer, even without pencil and paper. Released as part of Imbue's 70b evals post.
+
+ Link: https://huggingface.co/datasets/imbue/code-comprehension?row=0
+
+ Example (To avoid data contamination, some fields are omitted):
+ 'question': 'If we execute the code below, what will `result` be equal to? ```python ... ```'
+ 'choices': [ "'66-66-66-foo'", "'foo-66-66-66'", "'66--66--66--foo'", "''" ]
+ 'correct_answer': '66- ... -foo'
+ """
+
+ instruction = "{{question}}{{'\n' + options if options}}\nAnswer:"
+ evaluation_set = "train"
+ example_set = None
+ load_args = ("imbue/code-comprehension",)
+
+ def format_instance(self, instance):
+ instance["target_idx"] = instance["choices"].index(instance["correct_answer"])
+ instance["options"] = instance["choices"]
+ return instance
+
+ @cached_property
+ def references(self):
+ return [instance["target_idx"] for instance in self.evaluation_data]
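For context on the new dataset class above: `format_instance` only derives the gold option index from the dataset's `correct_answer` field. A minimal standalone sketch of that mapping, using the docstring example with an assumed (non-elided) answer string:

```python
# Standalone sketch of the mapping performed by ImbueCode.format_instance.
# The concrete `correct_answer` value is an assumption for illustration;
# the docstring above elides it to avoid data contamination.
instance = {
    "question": "If we execute the code below, what will `result` be equal to? ...",
    "choices": ["'66-66-66-foo'", "'foo-66-66-66'", "'66--66--66--foo'", "''"],
    "correct_answer": "'66-66-66-foo'",  # assumed to equal choices[0]
}

instance["target_idx"] = instance["choices"].index(instance["correct_answer"])
instance["options"] = instance["choices"]

print(instance["target_idx"])  # -> 0; later collected by the `references` property
```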
diff --git a/utilization/dataset/imbue_private.py b/utilization/dataset/imbue_private.py
new file mode 100644
index 00000000..5a5bb86f
--- /dev/null
+++ b/utilization/dataset/imbue_private.py
@@ -0,0 +1,43 @@
+from functools import cached_property
+from logging import getLogger
+
+from .multiple_choice_dataset import MultipleChoiceDataset
+
+logger = getLogger(__name__)
+
+
+class ImbuePrivate(MultipleChoiceDataset):
+ """The dataset of Imbue private evaluations.
+
+ High-quality question-answer pairs, from private versions of datasets designed to mimic ANLI, ARC, BoolQ, ETHICS, GSM8K, HellaSwag, OpenBookQA, MultiRC, RACE, Social IQa, and WinoGrande. For details, see https://imbue.com/research/70b-evals/. Format: each row contains a question, candidate answers, the correct answer (or multiple correct answers in the case of MultiRC questions), and a question quality score.
+
+ Link: https://huggingface.co/datasets/imbue/high_quality_private_evaluations
+
+ Example (To avoid data contamination, some fields are omitted):
+ 'question': 'For this question, first read the passage below. "The artist ..." Based on the passage above, answer the following question. Which wealth ...?'
+ 'correct_choices': [ "A ... ire" ]
+ 'choices': [ "A billionaire", "A centimillionaire", "A trillionaire", "A decamillionaire" ]
+ 'quality': 0.245109
+ 'original_dataset': race
+ """
+
+ instruction = "{{question}}{{'\n' + options if options}}\nAnswer:"
+ evaluation_set = "train"
+ example_set = None
+ load_args = ("imbue/high_quality_private_evaluations",)
+ category_column = "original_dataset"
+
+ def format_instance(self, instance):
+ if len(instance["correct_choices"]) > 1:
+ logger.warning(
+ f"Multiple correct choices found: {len(instance['correct_choices'])}. Only the first one is used. Multiple correct choices may be supported in the future."
+ )
+
+ correct_choice = instance["correct_choices"][0]
+ instance["target_idx"] = instance["choices"].index(correct_choice)
+ instance["options"] = instance["choices"]
+ return instance
+
+ @cached_property
+ def references(self):
+ return [instance["target_idx"] for instance in self.evaluation_data]
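As with the other multiple-choice datasets, `references` above yields one gold option index per row; the evaluator then compares these against the option each model ranks highest. The snippet below is only an illustrative stand-in for that comparison, with invented index values, not LLMBox's actual metric code:

```python
# Illustrative accuracy over gold indices (what `references` returns) and
# model-chosen option indices. Stand-in sketch only, with invented values;
# the real scoring lives in the MultipleChoiceDataset / evaluator pipeline.
references = [1, 0, 3, 2]   # gold target_idx per instance
predictions = [1, 0, 2, 2]  # option index ranked highest by the model

accuracy = sum(p == r for p, r in zip(predictions, references)) / len(references)
print(f"accuracy = {accuracy:.2f}")  # 0.75
```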
diff --git a/utilization/dataset/imbue_public.py b/utilization/dataset/imbue_public.py
new file mode 100644
index 00000000..4100e90c
--- /dev/null
+++ b/utilization/dataset/imbue_public.py
@@ -0,0 +1,43 @@
+from functools import cached_property
+from logging import getLogger
+
+from .multiple_choice_dataset import MultipleChoiceDataset
+
+logger = getLogger(__name__)
+
+
+class ImbuePublic(MultipleChoiceDataset):
+ """The dataset of Imbue public evaluations.
+
+ High-quality question-answer pairs, originally from ANLI, ARC, BoolQ, ETHICS, GSM8K, HellaSwag, OpenBookQA, MultiRC, RACE, Social IQa, and WinoGrande. For details, see https://imbue.com/research/70b-evals/. Format: each row contains a question, candidate answers, the correct answer (or multiple correct answers in the case of MultiRC questions), and a question quality score.
+
+ Link: https://huggingface.co/datasets/imbue/high_quality_public_evaluations
+
+ Example:
+ 'question': 'The man was released from jail. What is the cause of this?'
+ 'correct_choices': [ "His family paid his bail." ]
+ 'choices': [ "His family paid his bail.", "He attacked a fellow inmate." ]
+ 'quality': 0.348698
+ 'original_dataset': copa
+ """
+
+ instruction = "{{question}}{{'\n' + options if options}}\nAnswer:"
+ evaluation_set = "train"
+ example_set = None
+ load_args = ("imbue/high_quality_public_evaluations",)
+ category_column = "original_dataset"
+
+ def format_instance(self, instance):
+ if len(instance["correct_choices"]) > 1:
+ logger.warning(
+ f"Multiple correct choices found: {len(instance['correct_choices'])}. Only the first one is used. Multiple correct choices may be supported in the future."
+ )
+
+ correct_choice = instance["correct_choices"][0]
+ instance["target_idx"] = instance["choices"].index(correct_choice)
+ instance["options"] = instance["choices"]
+ return instance
+
+ @cached_property
+ def references(self):
+ return [instance["target_idx"] for instance in self.evaluation_data]
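Both Imbue evaluation classes set `category_column = "original_dataset"`, which presumably lets results be broken down by the source benchmark each row mimics. A rough sketch of that kind of bucketing (not LLMBox's implementation; rows invented for illustration):

```python
from collections import defaultdict

# Rough sketch of grouping rows by the column named in `category_column`.
# Not LLMBox's implementation; the rows below are invented for illustration.
rows = [
    {"original_dataset": "race", "target_idx": 1},
    {"original_dataset": "copa", "target_idx": 0},
    {"original_dataset": "race", "target_idx": 3},
]

by_category = defaultdict(list)
for row in rows:
    by_category[row["original_dataset"]].append(row)

print({name: len(items) for name, items in by_category.items()})  # {'race': 2, 'copa': 1}
```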
diff --git a/utilization/dataset/tldr.py b/utilization/dataset/tldr.py
index 28da1d1b..2d8e1969 100644
--- a/utilization/dataset/tldr.py
+++ b/utilization/dataset/tldr.py
@@ -16,7 +16,7 @@ class TLDR(GenerationDataset):
"""
instruction = "{source}"
- evaluation_set = "train"
+ evaluation_set = "test"
example_set = None
metrics = [Rouge()]
load_args = ("CarperAI/openai_summarize_tldr",)
diff --git a/utilization/model/huggingface_model.py b/utilization/model/huggingface_model.py
index b6a97a1c..34ed3428 100644
--- a/utilization/model/huggingface_model.py
+++ b/utilization/model/huggingface_model.py
@@ -37,6 +37,7 @@ def load_tokenizer(tokenizer_name_or_path: str, use_fast: bool, max_length: int
truncation_side="left",
add_eos_token=False,
add_bos_token=False, # add in chat_template
+ trust_remote_code=True,
)
# TODO: [Important]!!! check for each tokenizer
@@ -86,23 +87,8 @@ def get_model_max_length(
def load_hf_model(args: ModelArguments) -> Tuple[PreTrainedModel, Union[PreTrainedTokenizer, PreTrainedTokenizerFast]]:
logger.info(f"Loading {args.model_name_or_path} using Hugging Face Transformers...")
- # https://github.com/meta-llama/llama/issues/380#issuecomment-1656714118
- if args.torch_dtype == "auto":
- try:
- with open(args.model_name_or_path + "/config.json") as f:
- config = json.load(f)
- if "torch_dtype" in config:
- if config["torch_dtype"] == "float32":
- torch_dtype = "float16"
- else:
- torch_dtype = config["torch_dtype"]
- except:
- torch_dtype = "float16"
- else:
- torch_dtype = args.torch_dtype
-
model_kwargs = dict(
- torch_dtype=getattr(torch, torch_dtype),
+ torch_dtype=getattr(torch, args.torch_dtype),
device_map=args.device_map,
load_in_4bit=args.load_in_4bit,
load_in_8bit=args.load_in_8bit,
@@ -153,16 +139,31 @@ class HuggingFaceModel(Model):
def __init__(self, args: ModelArguments):
super().__init__(args)
+ # https://github.com/meta-llama/llama/issues/380#issuecomment-1656714118
+ if args.torch_dtype == "auto":
+ torch_dtype = "float16"
+ try:
+ with open(args.model_name_or_path + "/config.json") as f:
+ config = json.load(f)
+ if "torch_dtype" in config and config["torch_dtype"] != "float32":
+ torch_dtype = config["torch_dtype"]
+ except:
+ pass
+ else:
+ torch_dtype = args.torch_dtype
+ args.torch_dtype = torch_dtype
+
if getattr(args, "load_hf_model", None) is not None:
_load_hf_model = args.load_hf_model
else:
_load_hf_model = load_hf_model
+
self.model, self._tokenizer = _load_hf_model(args)
if self._tokenizer.model_max_length is None:
logger.warning(f"`model_max_length` is not set for {self.name}. Set to default {DEFAULT_MODEL_MAX_LENGTH}.")
self._tokenizer.model_max_length = DEFAULT_MODEL_MAX_LENGTH
- self.device = "cuda" if torch.cuda.is_available() else "cpu"
+ self.device = self.model.device if torch.cuda.is_available() else "cpu"
self.model_max_input_and_output = self.tokenizer.model_max_length
# model tests
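The `torch_dtype` resolution moved into `HuggingFaceModel.__init__` above. Extracted as a free function purely for illustration (narrowing the bare `except` is an assumption, not part of the patch), the "auto" fallback behaves roughly like this:

```python
import json

def resolve_torch_dtype(model_name_or_path: str, requested: str = "auto") -> str:
    """Mirror of the 'auto' dtype fallback added above; illustration only."""
    if requested != "auto":
        return requested
    torch_dtype = "float16"  # default when config.json is missing or unreadable
    try:
        with open(model_name_or_path + "/config.json") as f:
            config = json.load(f)
        # keep the checkpoint's dtype unless it is float32 (see the linked llama issue)
        if "torch_dtype" in config and config["torch_dtype"] != "float32":
            torch_dtype = config["torch_dtype"]
    except (OSError, json.JSONDecodeError):
        pass
    return torch_dtype
```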
@@ -443,8 +444,7 @@ def get_ppl_with_cache(
last_logits = torch.cat(prefix_cache.next_logits, dim=0).to(logits.device)
shift_logits = torch.cat([last_logits, logits[:, :-1]], dim=-2)
labels[labels == self.tokenizer.pad_token_id] = -100
- probs = self.loss_fct(shift_logits.view(-1, vocab_size),
- labels.view(-1)).view(labels.size(0), -1)
+ probs = self.loss_fct(shift_logits.view(-1, vocab_size), labels.view(-1)).view(labels.size(0), -1)
if exact_match:
greedy_tokens = torch.argmax(shift_logits, dim=-1)