From c623f88ad1738e7a9fc6fb8f5368ab3dc4af05fd Mon Sep 17 00:00:00 2001
From: Yu Chin Fabian Lim
Date: Thu, 1 Aug 2024 06:36:39 +0000
Subject: [PATCH] added chat template support

Signed-off-by: Yu Chin Fabian Lim
---
 scripts/benchmarks/benchmark.py         | 26 +++++---
 scripts/benchmarks/data_processing.py   | 79 +++++++++++++++++++++----
 scripts/benchmarks/scenarios-other.yaml | 28 ++++++++-
 scripts/run_benchmarks.sh               |  4 +-
 4 files changed, 115 insertions(+), 22 deletions(-)

diff --git a/scripts/benchmarks/benchmark.py b/scripts/benchmarks/benchmark.py
index 708801ce..ac6c0b77 100644
--- a/scripts/benchmarks/benchmark.py
+++ b/scripts/benchmarks/benchmark.py
@@ -157,7 +157,9 @@ def __init__(
         tokenize: bool = True,
         input_field: str = 'input',
         dataset_text_field: str = 'output',
+        chat_template: str = None,
     ) -> None:
+
         self.dataset_split = datasets.load_dataset(
             dataset_name, split=dataset_split
         )
@@ -167,12 +169,13 @@ def __init__(
             'tokenize': tokenize,
             'input_field': input_field,
             'dataset_text_field': dataset_text_field,
+            'chat_template': chat_template,
         }
         self.training_paths = {} # cache to store the training paths
         self.data_save_path = data_save_path
 
     def prepare_dataset(
-        self, model_name: str,
+        self, model_name: str, response_template: str = None,
     ):
         if model_name in self.training_paths:
             return self.training_paths[model_name]
@@ -198,9 +201,14 @@ def prepare_dataset(
         # build the formatting func
         format_fn, kwargs = build_data_formatting_func(
             tokenizer, **self.kwargs,
-            features=set(self.dataset_split.features)
+            features=set(self.dataset_split.features),
+            response_template=response_template,
         )
 
+        if self.kwargs.get('chat_template') is not None:
+            print('*** CHAT TEMPLATE ***')
+            print(self.kwargs['chat_template'])
+
         print (f"Preparing dataset '{save_path}'")
 
         # call the map
@@ -337,11 +345,6 @@ def preload_models(self):
             # just preload the config
             AutoConfig.from_pretrained(model_name)
 
-    def prepare_datasets(self, benchmark_datasets: BenchmarkDataset):
-        for model_name in self.arguments["model_name_or_path"]:
-            print(f"Scenario '{self.name}' preparing dataset for model '{model_name}'")
-            benchmark_datasets.prepare_dataset(model_name)
-
     def get_scenario_matrices_and_defaults(self):
         scenario_defaults = {}
         matrices = {}
@@ -665,7 +668,14 @@ def prepare_arguments(args, benchmark_dataset: BenchmarkDataset):
     # handle the dataset
     for x in products:
         # prepare the dataset
-        training_path = benchmark_dataset.prepare_dataset(x['model_name_or_path'])
+        training_path = benchmark_dataset.prepare_dataset(
+            x['model_name_or_path'],
+            (
+                x['response_template']
+                if 'response_template' in x
+                else constants.get('response_template')
+            )
+        )
         # update
         x['training_data_path'] = training_path
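
Note on the `prepare_dataset` call above: the conditional expression resolves the response template per scenario, falling back to the defaults file. A minimal sketch of the same resolution (the product and defaults values below are hypothetical, not from the repo):

    # Per-product resolution: the scenario's own 'response_template' wins,
    # otherwise the value from the defaults config is used.
    x = {"model_name_or_path": "mistralai/Mistral-7B-v0.1"}  # hypothetical product
    constants = {"response_template": "### Response:"}       # hypothetical defaults
    resolved = x.get("response_template", constants.get("response_template"))
    assert resolved == "### Response:"  # no per-scenario override, so fall back
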
diff --git a/scripts/benchmarks/data_processing.py b/scripts/benchmarks/data_processing.py
index fd34d893..d5e53b24 100644
--- a/scripts/benchmarks/data_processing.py
+++ b/scripts/benchmarks/data_processing.py
@@ -2,6 +2,64 @@
 from transformers import PreTrainedTokenizer
 from typing import Dict, Callable, List
 
+DEFAULT_FIELDS = [
+    'input_ids',
+    'attention_mask',
+    'labels'
+]
+
+def build_data_formatting_func(
+    tokenizer: PreTrainedTokenizer = None,
+    formatting: str = 'instruct',
+    tokenize: bool = True,
+    input_field: str = 'input',
+    dataset_text_field: str = 'output',
+    features: List = None,
+    response_template: str = None,
+    chat_template: str = None,
+):
+    if tokenizer is None or chat_template is None:
+        return _build_data_formatting_func_without_chat_template(
+            tokenizer, formatting, tokenize, input_field,
+            dataset_text_field, features, response_template
+        )
+
+    return _build_data_formatting_func(
+        tokenizer, tokenize, chat_template,
+        dataset_text_field, features, response_template
+    )
+
+
+# this one uses the chat template and tokenizer
+def _build_data_formatting_func(
+    tokenizer: PreTrainedTokenizer,
+    tokenize: bool = True,
+    chat_template: str = None,
+    dataset_text_field: str = "output",
+    features: List = None,
+    response_template: str = None,
+):
+
+    tokenizer.chat_template = chat_template
+
+    loss_masking = None
+    if tokenize and response_template is not None:
+        loss_masking = instruction_mask_loss(tokenizer, response_template)
+
+    def _format(example):
+        formatted_and_maybe_tokenized = tokenizer.apply_chat_template([example], tokenize=tokenize)
+        key = 'input_ids' if tokenize else dataset_text_field
+        if not loss_masking:
+            return {key: formatted_and_maybe_tokenized}
+        return loss_masking(formatted_and_maybe_tokenized)
+
+    return _format, {
+        'remove_columns': features.difference(
+            set(DEFAULT_FIELDS)
+        )
+    }
+
+# ---- NOTE: remove this eventually and move to check templates ----
 PROMPT_DICT = {
     "prompt_input": (
         "Below is an instruction that describes a task, paired with an input that provides further context. "
@@ -15,13 +73,6 @@
     ),
 }
 
-RESPONSE_TEMPLATE = '### Response:'
-DEFAULT_FIELDS = [
-    'input_ids',
-    'attention_mask',
-    'labels'
-]
-
 # combine functions
 # c = combine(a, b) then c(i) = b(a(i))
 FUNC = Callable[[Dict], Dict]
@@ -33,13 +84,14 @@ def _combine(x):
     return _combine
 
 
-def build_data_formatting_func(
+def _build_data_formatting_func_without_chat_template(
     tokenizer: PreTrainedTokenizer = None,
     formatting: str = 'instruct',
     tokenize: bool = True,
     input_field: str = 'input',
     dataset_text_field: str = 'output',
     features: List = None,
+    response_template: str = None,
 ):
     # FIFO
     funcs = []
@@ -63,9 +115,9 @@ def build_data_formatting_func(
             )
         )
 
-    if formatting == 'instruct':
+    if formatting == 'instruct' and response_template:
         funcs.append(
-            instruction_mask_loss(tokenizer)
+            instruction_mask_loss(tokenizer, response_template)
         )
 
     if len(funcs) == 0:
@@ -110,10 +162,15 @@ def _tokenize(example):
 
     return _tokenize
 
+# ---- NOTE: remove this eventually and move to check templates ----
+
 def instruction_mask_loss(
     tokenizer: PreTrainedTokenizer,
-    response_template: str = RESPONSE_TEMPLATE,
+    response_template: str,
 ):
+
+    print(f"Applying loss masking to response template '{response_template}'")
+
     # cheat, use the data collator to mask the loss tokens
     response_template_ids = tokenizer.encode(
         response_template, add_special_tokens=False
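
The `_format` closure above is a thin wrapper around Hugging Face's `apply_chat_template`. A self-contained sketch of the path it exercises (the gpt2 tokenizer and one-field template here are stand-ins for illustration, not the benchmark's actual configuration):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # stand-in tokenizer
    tokenizer.chat_template = (
        "{%- for message in messages %}"
        "### Instruction:\n{{ message['instruction'] }}\n\n"
        "### Response:\n{{ message['output'] + eos_token }}"
        "{%- endfor %}"
    )

    example = {"instruction": "Say hello.", "output": "Hello!"}
    # tokenize=False yields the rendered string; tokenize=True yields input_ids,
    # mirroring the `tokenize` flag threaded through _build_data_formatting_func.
    print(tokenizer.apply_chat_template([example], tokenize=False))
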
diff --git a/scripts/benchmarks/scenarios-other.yaml b/scripts/benchmarks/scenarios-other.yaml
index 1654f19e..66784213 100644
--- a/scripts/benchmarks/scenarios-other.yaml
+++ b/scripts/benchmarks/scenarios-other.yaml
@@ -1,9 +1,33 @@
 # This file holds a list of scenarios to may be run.
 
+# data_processing:
+#   dataset_name: yahma/alpaca-cleaned
+#   formatting: "instruct"
+#   tokenize: True
+#   input_field: input
+
 data_processing:
   dataset_name: yahma/alpaca-cleaned
-  formatting: "instruct"
+  chat_template: |
+    {%- for message in messages %}
+    {% if message['input'] != '' %}
+    Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+
+    {% else %}
+    Below is an instruction that describes a task. Write a response that appropriately completes the request.
+
+    {% endif %}
+    ### Instruction:
+    {{ message['instruction'] }}
+
+    {% if message['input'] != '' %}
+    ### Input:
+    {{ message['input'] }}
+
+    {% endif %}
+    ### Response:
+    {{ message['output'] + eos_token }}
+    {% endfor %}
   tokenize: True
-  input_field: input
 
 scenarios:
   - name: full-finetuning
diff --git a/scripts/run_benchmarks.sh b/scripts/run_benchmarks.sh
index 6e63f530..57ad59c5 100644
--- a/scripts/run_benchmarks.sh
+++ b/scripts/run_benchmarks.sh
@@ -25,7 +25,7 @@ SCNTAG_PEFT_AUTOGPTQ=accelerated-peft-gptq
 # ------------- OTHER CONFIGS -----------------
 
 # data will be cached in here
-DATA_CACHE=data/cache.json
+DATA_CACHE=data
 
 # final result placed here
 BENCH_RESULT_FILE=benchmarks.csv
@@ -45,11 +45,13 @@
 NUM_GPUS_MATRIX=${1-"1 2"}
 RESULT_DIR=${2:-"benchmark_outputs"}
 SCENARIOS_CONFIG=${3:-$SCENARIOS_CONFIG}
 SCENARIOS_FILTER=${4-$SCNTAG_PEFT_AUTOGPTQ}
+DEFAULTS_CONFIG=${5:-$DEFAULTS_CONFIG}
 
 echo "NUM_GPUS_MATRIX: $NUM_GPUS_MATRIX"
 echo "RESULT_DIR: $RESULT_DIR"
 echo "SCENARIOS_CONFIG: $SCENARIOS_CONFIG"
 echo "SCENARIOS_FILTER: $SCENARIOS_FILTER"
+echo "DEFAULTS_CONFIG: $DEFAULTS_CONFIG"
 echo "MEMORY_LOGGING: $MEMORY_LOGGING"
 
 if [ -n "$RESULT_DIR" ]; then
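
For context on the `instruction_mask_loss` helper (its body is truncated in the diff above): it encodes the response template once and hands the token ids to a completion-only data collator, which sets labels to -100 for everything before the completion. A rough, self-contained equivalent, assuming TRL's `DataCollatorForCompletionOnlyLM` is the collator in use (gpt2 again as a stand-in):

    from transformers import AutoTokenizer
    from trl import DataCollatorForCompletionOnlyLM

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token  # gpt2 defines no pad token

    # Encode the response template once, as instruction_mask_loss does,
    # then let the collator mask everything before the completion.
    response_ids = tokenizer.encode("### Response:", add_special_tokens=False)
    collator = DataCollatorForCompletionOnlyLM(response_ids, tokenizer=tokenizer)

    text = "### Instruction:\nSay hello.\n\n### Response:\nHello!"
    batch = collator([tokenizer(text)])
    print(batch["labels"])  # -100 up to and including the '### Response:' tokens
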