diff --git a/scripts/benchmarks/benchmark.py b/scripts/benchmarks/benchmark.py
index 74f6b3b8..8b95e8fd 100644
--- a/scripts/benchmarks/benchmark.py
+++ b/scripts/benchmarks/benchmark.py
@@ -11,13 +11,13 @@
 # Third Party
 from tqdm import tqdm
-from transformers import AutoConfig, HfArgumentParser, TrainingArguments
-from transformers import AutoTokenizer
+from transformers import AutoConfig, AutoTokenizer, HfArgumentParser, TrainingArguments
 import datasets
 import pandas as pd
 import torch
 import yaml
 
+# First Party
 from scripts.benchmarks.data_processing import build_data_formatting_func
 
 """
@@ -84,8 +84,9 @@
 RESULT_FIELD_PEAK_ALLOCATED_GPU_MEM = "mem_peak_torch_mem_alloc_in_bytes"
 ERROR_MESSAGES = "error_messages"
 
-SCENARIOS_STANZA_SCN = 'scenarios'
-SCENARIOS_STANZA_DATA = 'data_processing' # optional
+SCENARIOS_STANZA_SCN = "scenarios"
+SCENARIOS_STANZA_DATA = "data_processing"  # optional
+
 
 def extract_gpu_memory_metrics(output_metrics) -> Tuple[float]:
     """
@@ -155,36 +156,36 @@ class BenchmarkDataset:
     def __init__(
         self,
         data_save_path: str,
-        dataset_name: str = 'yahma/alpaca-cleaned',
+        dataset_name: str = "yahma/alpaca-cleaned",
         dataset_split: str = "train",
-        formatting: str = 'instruct',
+        formatting: str = "instruct",
         tokenize: bool = False,
-        input_field: str = 'input',
-        dataset_text_field: str = 'output',
+        input_field: str = "input",
+        dataset_text_field: str = "output",
         chat_template: str = None,
     ) -> None:
-        self.dataset_split = datasets.load_dataset(
-            dataset_name, split=dataset_split
-        )
+        self.dataset_split = datasets.load_dataset(dataset_name, split=dataset_split)
 
         self.kwargs = {
-            'formatting': formatting,
-            'tokenize': tokenize,
-            'input_field': input_field,
-            'dataset_text_field': dataset_text_field,
-            'chat_template' : chat_template
+            "formatting": formatting,
+            "tokenize": tokenize,
+            "input_field": input_field,
+            "dataset_text_field": dataset_text_field,
+            "chat_template": chat_template,
         }
-        self.training_paths = {} # cache to store the training paths
+        self.training_paths = {}  # cache to store the training paths
        self.data_save_path = data_save_path
 
     def prepare_dataset(
-        self, model_name: str, response_template: str = None,
+        self,
+        model_name: str,
+        response_template: str = None,
     ):
         if model_name in self.training_paths:
             return self.training_paths[model_name]
 
-        if self.kwargs['tokenize']:
+        if self.kwargs["tokenize"]:
             tokenizer = AutoTokenizer.from_pretrained(model_name)
 
             # for now, if pad_token_id is None, will just do a replacement
@@ -193,27 +194,28 @@ def prepare_dataset(
 
             # replace some special characters in the model name
             save_path = DATA_JSON_NAME.format(
-                re.sub(r'[/-]', '_', model_name),
+                re.sub(r"[/-]", "_", model_name),
             )
         else:
             tokenizer = None
-            save_path = DATA_JSON_NAME.format('all')
+            save_path = DATA_JSON_NAME.format("all")
 
         # get the full path
         save_path = os.path.join(self.data_save_path, save_path)
 
         # build the formatting func
         format_fn, kwargs = build_data_formatting_func(
-            tokenizer, **self.kwargs,
+            tokenizer,
+            **self.kwargs,
             features=set(self.dataset_split.features),
             response_template=response_template,
         )
 
-        if 'chat_template' in self.kwargs:
-            print ('*** CHAT TEMPLATE *****')
-            print (self.kwargs['chat_template'])
+        if self.kwargs["chat_template"] is not None:
+            print("*** CHAT TEMPLATE *****")
+            print(self.kwargs["chat_template"])
 
-        print (f"Preparing dataset '{save_path}'")
+        print(f"Preparing dataset '{save_path}'")
 
         # call the map
         ds = self.dataset_split.map(format_fn, **kwargs)
@@ -225,6 +227,7 @@ def prepare_dataset(
         self.training_paths[model_name] = save_path
         return save_path
 
+
 def convert_keypairs_to_map(keypairs: List):
     return {key: val for key, val in zip(keypairs[::2], keypairs[1::2])}
 
@@ -673,12 +676,12 @@ def prepare_arguments(args, benchmark_dataset: BenchmarkDataset):
     for x in products:
         # prepare the dataset
         training_path = benchmark_dataset.prepare_dataset(
-            x['model_name_or_path'],
+            x["model_name_or_path"],
             (
-                x[HF_ARG_RESPONSE_TEMPLATE] 
+                x[HF_ARG_RESPONSE_TEMPLATE]
                 if HF_ARG_RESPONSE_TEMPLATE in x
                 else constants.get(HF_ARG_RESPONSE_TEMPLATE)
-            )
+            ),
         )
         # update
         x[HF_ARG_TRAINING_DATA_PATH] = training_path
@@ -838,7 +841,7 @@ def main(args):
 
     # 1. Prepares a standard BenchmarkDataset
     #    - the preparation of the dataset is deferred to when 'prepare_dataset' is called
-    #    - try to read the data_processing stanza of 
+    #    - try to read the data_processing stanza of the scenarios config
     dataset_processing_args = ConfigUtils.read_yaml(args.scenarios_config_path).get(
         SCENARIOS_STANZA_DATA, {}
     )
diff --git a/scripts/benchmarks/compare_with_reference.py b/scripts/benchmarks/compare_with_reference.py
index 6a66cebd..46b76b80 100644
--- a/scripts/benchmarks/compare_with_reference.py
+++ b/scripts/benchmarks/compare_with_reference.py
@@ -36,6 +36,7 @@
 BENCHMARK_FILENAME = "benchmarks.csv"
 OUTLIERS_FILENAME = "outliers.csv"
 
+
 def plot_chart(ax, x, y, title, xlabel, ylabel):
     ax.scatter(x, y, s=10)
     ax.set_title(title, fontsize=8)
diff --git a/scripts/benchmarks/data_processing.py b/scripts/benchmarks/data_processing.py
index 6d32c3ab..1a860bbe 100644
--- a/scripts/benchmarks/data_processing.py
+++ b/scripts/benchmarks/data_processing.py
@@ -1,34 +1,43 @@
-from trl import DataCollatorForCompletionOnlyLM
+# Standard
+from typing import Callable, Dict, List
+
+# Third Party
 from transformers import PreTrainedTokenizer
-from typing import Dict, Callable, List
+from trl import DataCollatorForCompletionOnlyLM
+
+DEFAULT_FIELDS = ["input_ids", "attention_mask", "labels"]
 
-DEFAULT_FIELDS = [
-    'input_ids',
-    'attention_mask',
-    'labels'
-]
 
 def build_data_formatting_func(
     tokenizer: PreTrainedTokenizer = None,
-    formatting: str = 'instruct',
+    formatting: str = "instruct",
     tokenize: bool = False,
-    input_field: str = 'input',
-    dataset_text_field: str = 'output',
-    features: List = None, 
+    input_field: str = "input",
+    dataset_text_field: str = "output",
+    features: List = None,
     response_template: str = None,
     chat_template: str = None,
 ):
     if tokenizer is None or chat_template is None:
         return _build_data_formatting_func_without_chat_template(
-            tokenizer, formatting, tokenize, input_field, dataset_text_field,
-            features, response_template
+            tokenizer,
+            formatting,
+            tokenize,
+            input_field,
+            dataset_text_field,
+            features,
+            response_template,
         )
 
     return _build_data_formatting_func(
-        tokenizer, tokenize, chat_template,
-        dataset_text_field, features, response_template
+        tokenizer,
+        tokenize,
+        chat_template,
+        dataset_text_field,
+        features,
+        response_template,
     )
-    
+
 
 # this one uses the chat template and tokenizer
 def _build_data_formatting_func(
@@ -36,7 +45,7 @@ def _build_data_formatting_func(
     tokenize: bool = False,
     chat_template: str = None,
     dataset_text_field: str = "output",
-    features: List = None, 
+    features: List = None,
     response_template: str = None,
 ):
 
@@ -47,19 +56,18 @@ def _build_data_formatting_func(
         loss_masking = instruction_mask_loss(tokenizer, response_template)
 
     def _format(example):
-        formatted_and_maybe_tokenized = tokenizer.apply_chat_template([example], tokenize=tokenize)
-        key = 'input_ids' if tokenize else dataset_text_field
+        formatted_and_maybe_tokenized = tokenizer.apply_chat_template(
+            [example], tokenize=tokenize
+        )
+        key = "input_ids" if tokenize else dataset_text_field
         if not loss_masking:
             return {key: formatted_and_maybe_tokenized}
         return loss_masking(formatted_and_maybe_tokenized)
 
-    return _format, {
-        'remove_columns': features.difference(
-            set(DEFAULT_FIELDS)
-        )
-    }
+    return _format, {"remove_columns": features.difference(set(DEFAULT_FIELDS))}
+
 
-# ---- NOTE: remove this eventually and move to check templates ---- 
+# ---- NOTE: remove this eventually and move to check templates ----
 PROMPT_DICT = {
     "prompt_input": (
         "Below is an instruction that describes a task, paired with an input that provides further context. "
@@ -76,7 +84,9 @@
 # combine functions
 # c = combine(a, b) then c(i) = b(a(i))
 FUNC = Callable[[Dict], Dict]
-def combine_functions(*funcs : FUNC) -> FUNC:
+
+
+def combine_functions(*funcs: FUNC) -> FUNC:
     def _combine(x):
         for f in funcs:
             x = f(x)
@@ -84,13 +94,14 @@ def _combine(x):
     return _combine
 
+
 def _build_data_formatting_func_without_chat_template(
     tokenizer: PreTrainedTokenizer = None,
-    formatting: str = 'instruct',
+    formatting: str = "instruct",
     tokenize: bool = False,
-    input_field: str = 'input',
-    dataset_text_field: str = 'output',
-    features: List = None, 
+    input_field: str = "input",
+    dataset_text_field: str = "output",
+    features: List = None,
     response_template: str = None,
 ):
 
     # FIFO
@@ -99,43 +110,31 @@ def _build_data_formatting_func_without_chat_template(
 
     if features is None:
         features = set()
 
-    if formatting == 'instruct':
+    if formatting == "instruct":
         funcs.append(
             instruction_formatter(
-                input_field=input_field,
-                dataset_text_field=dataset_text_field
+                input_field=input_field, dataset_text_field=dataset_text_field
             )
         )
 
     if tokenize:
-        funcs.append(
-            tokenization(
-                tokenizer,
-                dataset_text_field=dataset_text_field
-            )
-        )
+        funcs.append(tokenization(tokenizer, dataset_text_field=dataset_text_field))
 
-    if formatting == 'instruct' and response_template:
-        funcs.append(
-            instruction_mask_loss(tokenizer, response_template)
-        )
+    if formatting == "instruct" and response_template:
+        funcs.append(instruction_mask_loss(tokenizer, response_template))
 
     if len(funcs) == 0:
-        raise ValueError(
-            "Unable to build a data formatting recipe"
-        )
+        raise ValueError("Unable to build a data formatting recipe")
 
     return combine_functions(*funcs), {
-        'remove_columns': features.union(
+        "remove_columns": features.union(
             set([input_field, dataset_text_field])
-        ).difference(
-            set(DEFAULT_FIELDS)
-        )
+        ).difference(set(DEFAULT_FIELDS))
     }
 
+
 def instruction_formatter(
-    input_field: str = "input",
-    dataset_text_field: str = "output"
+    input_field: str = "input", dataset_text_field: str = "output"
 ):
     def format_fn(example: Dict):
         prompt_input, prompt_no_input = (
@@ -152,20 +151,20 @@ def format_fn(example: Dict):
     return format_fn
 
-def tokenization(
-    tokenizer: PreTrainedTokenizer,
-    dataset_text_field: str = "output"
-):
+
+def tokenization(tokenizer: PreTrainedTokenizer, dataset_text_field: str = "output"):
     def _tokenize(example):
         text_field = example[dataset_text_field] + tokenizer.eos_token
         return tokenizer(text_field)
 
     return _tokenize
 
-# ---- NOTE: remove this eventually and move to check templates ---- 
+
+# ---- NOTE: remove this eventually and move to check templates ----
+
 def instruction_mask_loss(
-    tokenizer: PreTrainedTokenizer, 
+    tokenizer: PreTrainedTokenizer,
     response_template: str,
     take_from_index: int = 2,
 ):
@@ -177,17 +176,21 @@ def instruction_mask_loss(
         response_template, add_special_tokens=False
     )
 
-    # this ignores the first 
+    # this ignores the first take_from_index tokens of the response template
     if len(response_template_ids) > take_from_index:
         response_template_ids = response_template_ids[take_from_index:]
-        print (f"Taking response_ids[{take_from_index}:] from '{len(response_template_ids)}' response tokens")
-    
-    collator = DataCollatorForCompletionOnlyLM(response_template_ids, tokenizer=tokenizer, ignore_index=-100)
+        print(
+            f"Taking response_ids[{take_from_index}:] from '{len(response_template_ids)}' response tokens"
+        )
+
+    collator = DataCollatorForCompletionOnlyLM(
+        response_template_ids, tokenizer=tokenizer, ignore_index=-100
+    )
 
     def collate_example(example):
         # single example
-        collated_example = collator([example], return_tensors = "pt")
+        collated_example = collator([example], return_tensors="pt")
         # flatten the additional dim
-        return {k: v.view(-1) for k,v in collated_example.items()}
+        return {k: v.view(-1) for k, v in collated_example.items()}
 
     return collate_example
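
Usage note: below is a minimal sketch of how the refactored pieces fit together end to end, mirroring what `BenchmarkDataset.prepare_dataset` does internally minus the caching and model-specific save paths. The model name, output filename, and the `"### Response:"` marker are illustrative assumptions (the marker is assumed to match the alpaca-style `PROMPT_DICT`), not values taken from this diff:

```python
# Hedged sketch of the no-chat-template path; placeholder names are
# marked in the comments and should be swapped for real values.
import datasets
from transformers import AutoTokenizer

from scripts.benchmarks.data_processing import build_data_formatting_func

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")  # placeholder model
ds = datasets.load_dataset("yahma/alpaca-cleaned", split="train")  # the default dataset above

# build_data_formatting_func returns (format_fn, map_kwargs); map_kwargs
# carries the 'remove_columns' set, so after the map only the tokenized
# DEFAULT_FIELDS (input_ids, attention_mask, labels) remain.
format_fn, map_kwargs = build_data_formatting_func(
    tokenizer,
    formatting="instruct",
    tokenize=True,
    input_field="input",
    dataset_text_field="output",
    features=set(ds.features),
    # with tokenize=True this also attaches completion-only loss masking via
    # DataCollatorForCompletionOnlyLM (prompt tokens get label -100);
    # "### Response:" is an assumed alpaca-style marker
    response_template="### Response:",
)

ds = ds.map(format_fn, **map_kwargs)
ds.to_json("benchmark_data.json")  # illustrative save path
```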