From 310b3bd21d13a85285ae677e666709d25f94ccb8 Mon Sep 17 00:00:00 2001
From: huyiwen <1020030101@qq.com>
Date: Fri, 26 Jul 2024 22:25:33 +0800
Subject: [PATCH 1/3] [fix] set vllm default temperature to 0

---
 utilization/model_enum.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utilization/model_enum.py b/utilization/model_enum.py
index 19e075be..ecb2c7b9 100644
--- a/utilization/model_enum.py
+++ b/utilization/model_enum.py
@@ -4,7 +4,7 @@ from .utils.generation_args import generation_arg


 VLLM_ARGS = {
-    "temperature": generation_arg(),
+    "temperature": generation_arg(default=0),
     "top_p": generation_arg(),
     "top_k": generation_arg(),
     "max_tokens": generation_arg(default=1024),

From c0a398fca3571030f8d4d9316b5d49947ebe1c34 Mon Sep 17 00:00:00 2001
From: huyiwen <1020030101@qq.com>
Date: Fri, 26 Jul 2024 23:33:23 +0800
Subject: [PATCH 2/3] [fix] pass@k and self-consistency

---
 utilization/dataset/dataset.py | 76 ++++++++++++++++++++++++----------
 utilization/evaluator.py       | 20 +--------
 utilization/load_dataset.py    |  2 +-
 3 files changed, 58 insertions(+), 40 deletions(-)

diff --git a/utilization/dataset/dataset.py b/utilization/dataset/dataset.py
index 6df9fdce..35864fcd 100644
--- a/utilization/dataset/dataset.py
+++ b/utilization/dataset/dataset.py
@@ -220,10 +220,10 @@ def __len__(self):
         return len(self.evaluation_instances)

     def __getitem__(self, idx):
-        return self.evaluation_instances[idx]
+        return deepcopy(self.evaluation_instances[idx])

     def __iter__(self):
-        yield from self.evaluation_instances
+        yield from deepcopy(self.evaluation_instances)

     def format_instance(self, instance: dict) -> dict:
         r"""Format the dataset instance into task format. See [docs](https://github.com/RUCAIBox/LLMBox/blob/main/docs/utilization/how-to-customize-dataset.md#formating-the-instances) for more details.
@@ -832,6 +832,17 @@ def __repr__(self):


 class DatasetCollection(torch.utils.data.Dataset):
+    r"""The dataset collection class that combines multiple datasets into one.
+
+    Args:
+        - datasets: A dictionary of dataset instances. The keys are the dataset names and the values are the dataset instances.
+
+    Examples:
+        Assume a DatasetCollection composed of two datasets: `sub1` and `sub2`. Each dataset has a different number of evaluation instances.
+        - Two subsets: `[sub1, sub2]`
+        - Two subsets with self-consistency = 3: `[sub1, sub1, sub1, sub2, sub2, sub2]`
+        - Two subsets with normalization: `[sub1, sub1-norm, sub2, sub2-norm]`
+    """

     def __init__(self, datasets: Dict[str, Dataset]):
         super().__init__()
@@ -951,9 +962,6 @@ def set_subset(l: dict):
         except Exception as e:
             logger.warning(f"Failed to log predictions: {e}")

-    def post_processing(self, predictions: List[Union[str, float]]):
-        return sum((d.post_processing(p) for d, p in zip(self._datasets, self._split_by_subset(predictions))), [])
-
     def __getitem__(self, idx):
         if self.args.continue_from:
             idx += self.args.continue_from
@@ -975,14 +983,39 @@ def __iter__(self):
     def __getattr__(self, attr):
         return getattr(self._datasets[self._cur_idx], attr)

-    def calculate_metric(self, predictions) -> Tuple[Dict[str, Dict[str, float]], List[Dict[str, List[float]]]]:
-        results = OrderedDict()
+    def calculate_metric(self, raw_predictions: List[Union[str, float]]) -> Dict[str, Dict[str, float]]:
+        r"""Post-process predictions and calculate the metric scores."""
+
+        metric_results = OrderedDict()
+        predictions = []
+        agg_predictions = []
         score_lists = []
-        splitted = self._split_by_subset(predictions, option_num=False, normalization=False, sample_num=False)
-        grouped_display_names = defaultdict(list)  # group by dataset
-        for n, d, p in zip(self.display_names, self._datasets, splitted):
-            subset_results, score_list = d.calculate_metric(p)
-            results.update(subset_results)
+        grouped_display_names = defaultdict(list)
+
+        for n, d, p in zip(self.display_names, self._datasets, self._split_by_subset(raw_predictions)):
+            # post process
+            preds = d.post_processing(p)
+
+            # aggregate self-consistency or pass@k
+            step = d.len(option_num=False, sample_num=False, normalization=False)
+            if self.args.pass_at_k:
+                # [inst1, inst2, inst1, inst2] -> [[inst1, inst1], [inst2, inst2]]
+                agg_preds = [preds[i::step] for i in range(step)]
+            elif len(preds) // step > 1:
+                from statistics import mode
+
+                # [inst1, inst2, inst1, inst2] -> [mode([inst1, inst1]), mode([inst2, inst2])]
+                agg_preds = [mode(preds[i::step]) for i in range(step)]
+            else:
+                # [inst1, inst2]
+                agg_preds = preds
+
+            predictions.extend(preds)
+            agg_predictions.extend(agg_preds)
+
+            # calculate metric
+            subset_results, score_list = d.calculate_metric(agg_preds)
+            metric_results.update(subset_results)
             score_lists.append(score_list)
             grouped_display_names[d.dataset_name].append(n)

@@ -995,19 +1028,20 @@ def calculate_metric(self, predictions) -> Tuple[Dict[str, Dict[str, float]], Li
                     # skip if not all subsets of a category are available
                     continue
                 fstr = f"{name}[{cat.title().replace('_', ' ')} Macro Average]"
-                results[fstr] = avg_metrics([results[n] for n in c])
+                metric_results[fstr] = avg_metrics([metric_results[n] for n in c])

             if name == "gaokao":
-                r, f = zip(*[(results[name + ":" + n], f) for n, f in GAOKAO_CHINESE_TASKS_SCORE.items()])
-                results[name + "[Chinese Weighted Average]"] = avg_metrics(r, f, average_method="weighted")
-                r, f = zip(*[(results[name + ":" + n], f) for n, f in GAOKAO_ENGLISH_TASKS_SCORE.items()])
-                results[name + "[English Weighted Average]"] = avg_metrics(r, f, average_method="weighted")
-                r, f = zip(*[(results[name + ":" + n], f) for n, f in GAOKAO_TASKS_SCORE.items()])
-                results[name + "[Weighted Average]"] = avg_metrics(r, f, average_method="weighted")
+                r, f = zip(*[(metric_results[name + ":" + n], f) for n, f in GAOKAO_CHINESE_TASKS_SCORE.items()])
+                metric_results[name + "[Chinese Weighted Average]"] = avg_metrics(r, f, average_method="weighted")
+                r, f = zip(*[(metric_results[name + ":" + n], f) for n, f in GAOKAO_ENGLISH_TASKS_SCORE.items()])
+                metric_results[name + "[English Weighted Average]"] = avg_metrics(r, f, average_method="weighted")
+                r, f = zip(*[(metric_results[name + ":" + n], f) for n, f in GAOKAO_TASKS_SCORE.items()])
+                metric_results[name + "[Weighted Average]"] = avg_metrics(r, f, average_method="weighted")

-            results[name + "[Marco Average]"] = avg_metrics([r for k, r in results.items() if k.startswith(name + ":")])
+            metric_results[name + "[Marco Average]"] = avg_metrics([r for k, r in metric_results.items() if k.startswith(name + ":")])

-        return results, score_lists
+        self.log_final_results(raw_predictions, predictions, score_lists)
+        return metric_results

     def get_batch_sampler(self, reload_tokenizer: bool = False):
         if reload_tokenizer:

diff --git a/utilization/evaluator.py b/utilization/evaluator.py
index 3638ad46..f50ec45a 100644
--- a/utilization/evaluator.py
+++ b/utilization/evaluator.py
@@ -1,5 +1,4 @@
 from logging import getLogger
-from statistics import mode
 from typing import Any, Callable, Dict, List, Optional

 from .load_dataset import load_datasets
@@ -129,24 +128,9 @@ def evaluate(self) -> Dict[str, Dict[str, float]]:
                 f"The number of results {len(raw_predictions)} should be equal to the number of samples in the dataset {self.dataset.len()}."
             )

-        # post processing and self-consistency
-        predictions = self.dataset.post_processing(raw_predictions)
-        if len(predictions) != self.dataset.len(option_num=False, normalization=False):
-            raise RuntimeError(
-                f"The number of results {len(predictions)} should be equal to the number of samples in the dataset {self.dataset.len(option_num=False, normalization=False)}."
-            )
-
-        step = self.dataset.len(option_num=False, sample_num=False, normalization=False)
-        if self.dataset_args.pass_at_k:
-            mode_predictions = [predictions[i::step] for i in range(step)]
-        elif len(predictions) // step > 1:
-            mode_predictions = [mode(predictions[i::step]) for i in range(step)]
-        else:
-            mode_predictions = predictions
-
         # calculate metric
-        metric_results, last_score_lists = self.dataset.calculate_metric(mode_predictions)
-        self.dataset.log_final_results(raw_predictions, predictions, last_score_lists)
+        metric_results = self.dataset.calculate_metric(raw_predictions)
+
         msg = f"Evaluation finished successfully:\nevaluation results: {self.dataset_args.evaluation_results_path}"
         for display_name, result in metric_results.items():
             if result is None:

diff --git a/utilization/load_dataset.py b/utilization/load_dataset.py
index 76ed19bf..78f8da37 100644
--- a/utilization/load_dataset.py
+++ b/utilization/load_dataset.py
@@ -121,7 +121,7 @@ def get_subsets(
             found_config = True
             break
         except Exception as e:
-            logger.info(f"Failed when trying to get_dataset_config_names({path}): {e}")
+            logger.info(f"Failed when trying to get_dataset_config_names({path}): {e}. Trying another method...")

     logger.debug(f"get_dataset_config_names({path}): {s}")

From 848fa135ec6e107dc284b0ce7d5827f3701d860e Mon Sep 17 00:00:00 2001
From: huyiwen <1020030101@qq.com>
Date: Fri, 26 Jul 2024 23:55:59 +0800
Subject: [PATCH 3/3] [doc] add acknowledgments

---
 README.md                      | 6 ++++++
 utilization/dataset/dataset.py | 4 +++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 862167dc..af340433 100644
--- a/README.md
+++ b/README.md
@@ -288,6 +288,12 @@ We expect all contributions discussed in the issue tracker and going through PRs

 For more details, view the [CONTRIBUTING](https://github.com/RUCAIBox/LLMBox/tree/main/docs/CONTRIBUTING.md) documentation.

+---
+
+We thank the following contributors for their contributions to LLMBox:
+
+- [@xansar](https://github.com/xansar) for fixing multiple complex issues such as the batch sampler and self-consistency.
+
 ## The Team

diff --git a/utilization/dataset/dataset.py b/utilization/dataset/dataset.py
index 35864fcd..86e7598a 100644
--- a/utilization/dataset/dataset.py
+++ b/utilization/dataset/dataset.py
@@ -1038,7 +1038,9 @@ def calculate_metric(self, raw_predictions: List[Union[str, float]]) -> Dict[str
                 r, f = zip(*[(metric_results[name + ":" + n], f) for n, f in GAOKAO_TASKS_SCORE.items()])
                 metric_results[name + "[Weighted Average]"] = avg_metrics(r, f, average_method="weighted")

-            metric_results[name + "[Marco Average]"] = avg_metrics([r for k, r in metric_results.items() if k.startswith(name + ":")])
+            metric_results[name + "[Marco Average]"] = avg_metrics([
+                r for k, r in metric_results.items() if k.startswith(name + ":")
+            ])

         self.log_final_results(raw_predictions, predictions, score_lists)
         return metric_results