From c623f88ad1738e7a9fc6fb8f5368ab3dc4af05fd Mon Sep 17 00:00:00 2001
From: Yu Chin Fabian Lim
Date: Thu, 1 Aug 2024 06:36:39 +0000
Subject: [PATCH] added chat template support

Signed-off-by: Yu Chin Fabian Lim
---
 scripts/benchmarks/benchmark.py         | 26 +++++---
 scripts/benchmarks/data_processing.py   | 79 +++++++++++++++++++++----
 scripts/benchmarks/scenarios-other.yaml | 28 ++++++++-
 scripts/run_benchmarks.sh               |  4 +-
 4 files changed, 115 insertions(+), 22 deletions(-)

diff --git a/scripts/benchmarks/benchmark.py b/scripts/benchmarks/benchmark.py
index 708801ce..ac6c0b77 100644
--- a/scripts/benchmarks/benchmark.py
+++ b/scripts/benchmarks/benchmark.py
@@ -157,7 +157,9 @@ def __init__(
         tokenize: bool = True,
         input_field: str = 'input',
         dataset_text_field: str = 'output',
+        chat_template: str = None,
     ) -> None:
+
         self.dataset_split = datasets.load_dataset(
             dataset_name, split=dataset_split
         )
@@ -167,12 +169,13 @@ def __init__(
             'tokenize': tokenize,
             'input_field': input_field,
             'dataset_text_field': dataset_text_field,
+            'chat_template': chat_template,
         }
         self.training_paths = {} # cache to store the training paths
         self.data_save_path = data_save_path
 
     def prepare_dataset(
-        self, model_name: str,
+        self, model_name: str, response_template: str = None,
     ):
         if model_name in self.training_paths:
             return self.training_paths[model_name]
@@ -198,9 +201,14 @@ def prepare_dataset(
         # build the formatting func
         format_fn, kwargs = build_data_formatting_func(
             tokenizer, **self.kwargs,
-            features=set(self.dataset_split.features)
+            features=set(self.dataset_split.features),
+            response_template=response_template,
         )
 
+        if self.kwargs.get('chat_template') is not None:
+            print('*** CHAT TEMPLATE ***')
+            print(self.kwargs['chat_template'])
+
         print (f"Preparing dataset '{save_path}'")
 
         # call the map
@@ -337,11 +345,6 @@ def preload_models(self):
             # just preload the config
             AutoConfig.from_pretrained(model_name)
 
-    def prepare_datasets(self, benchmark_datasets: BenchmarkDataset):
-        for model_name in self.arguments["model_name_or_path"]:
-            print(f"Scenario '{self.name}' preparing dataset for model '{model_name}'")
-            benchmark_datasets.prepare_dataset(model_name)
-
     def get_scenario_matrices_and_defaults(self):
         scenario_defaults = {}
         matrices = {}
@@ -665,7 +668,14 @@ def prepare_arguments(args, benchmark_dataset: BenchmarkDataset):
     # handle the dataset
     for x in products:
         # prepare the dataset
-        training_path = benchmark_dataset.prepare_dataset(x['model_name_or_path'])
+        training_path = benchmark_dataset.prepare_dataset(
+            x['model_name_or_path'],
+            (
+                x['response_template']
+                if 'response_template' in x
+                else constants.get('response_template')
+            )
+        )
         # update
         x['training_data_path'] = training_path
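
Note on the `prepare_dataset` call above: the conditional expression resolves the response template per scenario, falling back to the defaults file. A minimal sketch of the same resolution (the product and defaults values below are hypothetical, not from the repo):

    # Per-product resolution: the scenario's own 'response_template' wins,
    # otherwise the value from the defaults config is used.
    x = {"model_name_or_path": "mistralai/Mistral-7B-v0.1"}  # hypothetical product
    constants = {"response_template": "### Response:"}       # hypothetical defaults
    resolved = x.get("response_template", constants.get("response_template"))
    assert resolved == "### Response:"  # no per-scenario override, so fall back
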
diff --git a/scripts/benchmarks/data_processing.py b/scripts/benchmarks/data_processing.py
index fd34d893..d5e53b24 100644
--- a/scripts/benchmarks/data_processing.py
+++ b/scripts/benchmarks/data_processing.py
@@ -2,6 +2,64 @@
 from transformers import PreTrainedTokenizer
 from typing import Dict, Callable, List
 
+DEFAULT_FIELDS = [
+    'input_ids',
+    'attention_mask',
+    'labels'
+]
+
+def build_data_formatting_func(
+    tokenizer: PreTrainedTokenizer = None,
+    formatting: str = 'instruct',
+    tokenize: bool = True,
+    input_field: str = 'input',
+    dataset_text_field: str = 'output',
+    features: List = None,
+    response_template: str = None,
+    chat_template: str = None,
+):
+    if tokenizer is None or chat_template is None:
+        return _build_data_formatting_func_without_chat_template(
+            tokenizer, formatting, tokenize, input_field,
+            dataset_text_field, features, response_template
+        )
+
+    return _build_data_formatting_func(
+        tokenizer, tokenize, chat_template,
+        dataset_text_field, features, response_template
+    )
+
+
+# this one uses the chat template and tokenizer
+def _build_data_formatting_func(
+    tokenizer: PreTrainedTokenizer,
+    tokenize: bool = True,
+    chat_template: str = None,
+    dataset_text_field: str = "output",
+    features: List = None,
+    response_template: str = None,
+):
+
+    tokenizer.chat_template = chat_template
+
+    loss_masking = None
+    if tokenize and response_template is not None:
+        loss_masking = instruction_mask_loss(tokenizer, response_template)
+
+    def _format(example):
+        formatted_and_maybe_tokenized = tokenizer.apply_chat_template([example], tokenize=tokenize)
+        key = 'input_ids' if tokenize else dataset_text_field
+        if not loss_masking:
+            return {key: formatted_and_maybe_tokenized}
+        return loss_masking(formatted_and_maybe_tokenized)
+
+    return _format, {
+        'remove_columns': features.difference(
+            set(DEFAULT_FIELDS)
+        )
+    }
+
+# ---- NOTE: remove this eventually and move to check templates ----
 PROMPT_DICT = {
     "prompt_input": (
         "Below is an instruction that describes a task, paired with an input that provides further context. "
@@ -15,13 +73,6 @@
     ),
 }
 
-RESPONSE_TEMPLATE = '### Response:'
-DEFAULT_FIELDS = [
-    'input_ids',
-    'attention_mask',
-    'labels'
-]
-
 # combine functions
 # c = combine(a, b) then c(i) = b(a(i))
 FUNC = Callable[[Dict], Dict]
@@ -33,13 +84,14 @@ def _combine(x):
     return _combine
 
 
-def build_data_formatting_func(
+def _build_data_formatting_func_without_chat_template(
     tokenizer: PreTrainedTokenizer = None,
     formatting: str = 'instruct',
     tokenize: bool = True,
     input_field: str = 'input',
     dataset_text_field: str = 'output',
     features: List = None,
+    response_template: str = None,
 ):
     # FIFO
     funcs = []
@@ -63,9 +115,9 @@ def build_data_formatting_func(
             )
         )
 
-    if formatting == 'instruct':
+    if formatting == 'instruct' and response_template:
         funcs.append(
-            instruction_mask_loss(tokenizer)
+            instruction_mask_loss(tokenizer, response_template)
         )
 
     if len(funcs) == 0:
@@ -110,10 +162,15 @@ def _tokenize(example):
 
     return _tokenize
 
+# ---- NOTE: remove this eventually and move to check templates ----
+
 def instruction_mask_loss(
     tokenizer: PreTrainedTokenizer,
-    response_template: str = RESPONSE_TEMPLATE,
+    response_template: str,
 ):
+
+    print(f"Applying loss masking to response template '{response_template}'")
+
     # cheat, use the data collator to mask the loss tokens
     response_template_ids = tokenizer.encode(
         response_template, add_special_tokens=False
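
The `_format` closure above is a thin wrapper around Hugging Face's `apply_chat_template`. A self-contained sketch of the path it exercises (the gpt2 tokenizer and one-field template here are stand-ins for illustration, not the benchmark's actual configuration):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # stand-in tokenizer
    tokenizer.chat_template = (
        "{%- for message in messages %}"
        "### Instruction:\n{{ message['instruction'] }}\n\n"
        "### Response:\n{{ message['output'] + eos_token }}"
        "{%- endfor %}"
    )

    example = {"instruction": "Say hello.", "output": "Hello!"}
    # tokenize=False yields the rendered string; tokenize=True yields input_ids,
    # mirroring the `tokenize` flag threaded through _build_data_formatting_func.
    print(tokenizer.apply_chat_template([example], tokenize=False))
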
diff --git a/scripts/benchmarks/scenarios-other.yaml b/scripts/benchmarks/scenarios-other.yaml
index 1654f19e..66784213 100644
--- a/scripts/benchmarks/scenarios-other.yaml
+++ b/scripts/benchmarks/scenarios-other.yaml
@@ -1,9 +1,33 @@
 # This file holds a list of scenarios to may be run.
 
+# data_processing:
+#   dataset_name: yahma/alpaca-cleaned
+#   formatting: "instruct"
+#   tokenize: True
+#   input_field: input
+
 data_processing:
   dataset_name: yahma/alpaca-cleaned
-  formatting: "instruct"
+  chat_template: |
+    {%- for message in messages %}
+    {% if message['input'] != '' %}
+    Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+
+    {% else %}
+    Below is an instruction that describes a task. Write a response that appropriately completes the request.
+
+    {% endif %}
+    ### Instruction:
+    {{ message['instruction'] }}
+
+    {% if message['input'] != '' %}
+    ### Input:
+    {{ message['input'] }}
+
+    {% endif %}
+    ### Response:
+    {{ message['output'] + eos_token }}
+    {% endfor %}
   tokenize: True
-  input_field: input
 
 scenarios:
   - name: full-finetuning
diff --git a/scripts/run_benchmarks.sh b/scripts/run_benchmarks.sh
index 6e63f530..57ad59c5 100644
--- a/scripts/run_benchmarks.sh
+++ b/scripts/run_benchmarks.sh
@@ -25,7 +25,7 @@ SCNTAG_PEFT_AUTOGPTQ=accelerated-peft-gptq
 # ------------- OTHER CONFIGS -----------------
 
 # data will be cached in here
-DATA_CACHE=data/cache.json
+DATA_CACHE=data
 
 # final result placed here
 BENCH_RESULT_FILE=benchmarks.csv
@@ -45,11 +45,13 @@
 NUM_GPUS_MATRIX=${1-"1 2"}
 RESULT_DIR=${2:-"benchmark_outputs"}
 SCENARIOS_CONFIG=${3:-$SCENARIOS_CONFIG}
 SCENARIOS_FILTER=${4-$SCNTAG_PEFT_AUTOGPTQ}
+DEFAULTS_CONFIG=${5:-$DEFAULTS_CONFIG}
 
 echo "NUM_GPUS_MATRIX: $NUM_GPUS_MATRIX"
 echo "RESULT_DIR: $RESULT_DIR"
 echo "SCENARIOS_CONFIG: $SCENARIOS_CONFIG"
 echo "SCENARIOS_FILTER: $SCENARIOS_FILTER"
+echo "DEFAULTS_CONFIG: $DEFAULTS_CONFIG"
 echo "MEMORY_LOGGING: $MEMORY_LOGGING"
 
 if [ -n "$RESULT_DIR" ]; then
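
For context on the `instruction_mask_loss` helper (its body is truncated in the diff above): it encodes the response template once and hands the token ids to a completion-only data collator, which sets labels to -100 for everything before the completion. A rough, self-contained equivalent, assuming TRL's `DataCollatorForCompletionOnlyLM` is the collator in use (gpt2 again as a stand-in):

    from transformers import AutoTokenizer
    from trl import DataCollatorForCompletionOnlyLM

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token  # gpt2 defines no pad token

    # Encode the response template once, as instruction_mask_loss does,
    # then let the collator mask everything before the completion.
    response_ids = tokenizer.encode("### Response:", add_special_tokens=False)
    collator = DataCollatorForCompletionOnlyLM(response_ids, tokenizer=tokenizer)

    text = "### Instruction:\nSay hello.\n\n### Response:\nHello!"
    batch = collator([tokenizer(text)])
    print(batch["labels"])  # -100 up to and including the '### Response:' tokens
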