Skip to content

Commit

Permalink
added chat template support
Browse files Browse the repository at this point in the history
Signed-off-by: Yu Chin Fabian Lim <[email protected]>
  • Loading branch information
fabianlim committed Aug 1, 2024
1 parent 5ca8688 commit c623f88
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 22 deletions.
26 changes: 18 additions & 8 deletions scripts/benchmarks/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,9 @@ def __init__(
tokenize: bool = True,
input_field: str = 'input',
dataset_text_field: str = 'output',
chat_template: str = None,
) -> None:

self.dataset_split = datasets.load_dataset(
dataset_name, split=dataset_split
)
Expand All @@ -167,12 +169,13 @@ def __init__(
'tokenize': tokenize,
'input_field': input_field,
'dataset_text_field': dataset_text_field,
'chat_template' : chat_template
}
self.training_paths = {} # cache to store the training paths
self.data_save_path = data_save_path

def prepare_dataset(
self, model_name: str,
self, model_name: str, response_template: str = None,
):
if model_name in self.training_paths:
return self.training_paths[model_name]
Expand All @@ -198,9 +201,14 @@ def prepare_dataset(
# build the formatting func
format_fn, kwargs = build_data_formatting_func(
tokenizer, **self.kwargs,
features=set(self.dataset_split.features)
features=set(self.dataset_split.features),
response_template=response_template,
)

if 'chat_template' in self.kwargs:
print ('*** CHAT TEMPLATE *****')
print (self.kwargs['chat_template'])

print (f"Preparing dataset '{save_path}'")

# call the map
Expand Down Expand Up @@ -337,11 +345,6 @@ def preload_models(self):
# just preload the config
AutoConfig.from_pretrained(model_name)

def prepare_datasets(self, benchmark_datasets: BenchmarkDataset):
for model_name in self.arguments["model_name_or_path"]:
print(f"Scenario '{self.name}' preparing dataset for model '{model_name}'")
benchmark_datasets.prepare_dataset(model_name)

def get_scenario_matrices_and_defaults(self):
scenario_defaults = {}
matrices = {}
Expand Down Expand Up @@ -665,7 +668,14 @@ def prepare_arguments(args, benchmark_dataset: BenchmarkDataset):
# handle the dataset
for x in products:
# prepare the dataset
training_path = benchmark_dataset.prepare_dataset(x['model_name_or_path'])
training_path = benchmark_dataset.prepare_dataset(
x['model_name_or_path'],
(
x['response_template']
if 'response_template' in x
else constants.get('response_template')
)
)
# update
x['training_data_path'] = training_path

Expand Down
79 changes: 68 additions & 11 deletions scripts/benchmarks/data_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,64 @@
from transformers import PreTrainedTokenizer
from typing import Dict, Callable, List

# Column names that are kept in a processed dataset: the chat-template
# formatter removes every dataset feature that is not listed here.
DEFAULT_FIELDS = [
'input_ids',
'attention_mask',
'labels'
]

def build_data_formatting_func(
    tokenizer: PreTrainedTokenizer = None,
    formatting: str = 'instruct',
    tokenize: bool = True,
    input_field: str = 'input',
    dataset_text_field: str = 'output',
    features: List = None,
    response_template: str = None,
    chat_template: str = None,
):
    """Select and build the formatting function for a dataset.

    The chat-template builder is used only when both a ``tokenizer`` and a
    ``chat_template`` are supplied; in every other case the builder that
    works without a chat template is used.  Note that ``formatting`` and
    ``input_field`` are only forwarded to the latter.

    Returns:
        Whatever the selected builder returns.  The chat-template builder
        returns a ``(format_fn, kwargs)`` pair.
    """
    use_chat_template = tokenizer is not None and chat_template is not None

    if use_chat_template:
        return _build_data_formatting_func(
            tokenizer=tokenizer,
            tokenize=tokenize,
            chat_template=chat_template,
            dataset_text_field=dataset_text_field,
            features=features,
            response_template=response_template,
        )

    # no tokenizer or no chat template: fall back to the legacy formatting
    return _build_data_formatting_func_without_chat_template(
        tokenizer=tokenizer,
        formatting=formatting,
        tokenize=tokenize,
        input_field=input_field,
        dataset_text_field=dataset_text_field,
        features=features,
        response_template=response_template,
    )


# this one uses the chat template and tokenizer
def _build_data_formatting_func(
    tokenizer: PreTrainedTokenizer,
    tokenize: bool = True,
    chat_template: str = None,
    dataset_text_field: str = "output",
    features: List = None,
    response_template: str = None,
):
    """Build a per-example formatter that renders via a chat template.

    Args:
        tokenizer: tokenizer used to render each example.  Its
            ``chat_template`` attribute is overwritten in place.
        tokenize: forwarded to ``tokenizer.apply_chat_template``.  Also
            selects the output key: ``'input_ids'`` when true, otherwise
            ``dataset_text_field``.
        chat_template: template assigned to ``tokenizer.chat_template``.
        dataset_text_field: output key used when ``tokenize`` is false.
        features: iterable of the dataset's column names (set or list).
            ``None`` is treated as "no columns".
        response_template: when given together with ``tokenize=True``,
            loss masking is built from it via ``instruction_mask_loss``.

    Returns:
        A ``(format_fn, kwargs)`` pair.  ``format_fn`` maps one example to
        a dict; ``kwargs`` holds ``'remove_columns'``, the set of
        ``features`` that are not in ``DEFAULT_FIELDS``.
    """
    # NOTE: this mutates the tokenizer object passed in by the caller.
    tokenizer.chat_template = chat_template

    # loss masking is only built when tokenizing and a template is given
    loss_masking = None
    if tokenize and response_template is not None:
        loss_masking = instruction_mask_loss(tokenizer, response_template)

    # does not depend on the example, so decide the key once
    key = 'input_ids' if tokenize else dataset_text_field

    def _format(example):
        # the single example is wrapped in a list before being rendered
        formatted_and_maybe_tokenized = tokenizer.apply_chat_template(
            [example], tokenize=tokenize
        )
        if not loss_masking:
            return {key: formatted_and_maybe_tokenized}
        # NOTE(review): the raw apply_chat_template output is handed to the
        # masking callable; confirm this is the input it expects.
        return loss_masking(formatted_and_maybe_tokenized)

    # `features` defaults to None and is annotated as a List, so coerce it
    # to a set instead of assuming it already has a `.difference` method.
    columns = set(features) if features is not None else set()

    return _format, {
        'remove_columns': columns.difference(DEFAULT_FIELDS)
    }

# ---- NOTE: remove this eventually and move to chat templates ----
PROMPT_DICT = {
"prompt_input": (
"Below is an instruction that describes a task, paired with an input that provides further context. "
Expand All @@ -15,13 +73,6 @@
),
}

RESPONSE_TEMPLATE = '### Response:'
DEFAULT_FIELDS = [
'input_ids',
'attention_mask',
'labels'
]

# combine functions
# c = combine(a, b) then c(i) = b(a(i))
FUNC = Callable[[Dict], Dict]
Expand All @@ -33,13 +84,14 @@ def _combine(x):

return _combine

def build_data_formatting_func(
def _build_data_formatting_func_without_chat_template(
tokenizer: PreTrainedTokenizer = None,
formatting: str = 'instruct',
tokenize: bool = True,
input_field: str = 'input',
dataset_text_field: str = 'output',
features: List = None,
response_template: str = None,
):
# FIFO
funcs = []
Expand All @@ -63,9 +115,9 @@ def build_data_formatting_func(
)
)

if formatting == 'instruct':
if formatting == 'instruct' and response_template:
funcs.append(
instruction_mask_loss(tokenizer)
instruction_mask_loss(tokenizer, response_template)
)

if len(funcs) == 0:
Expand Down Expand Up @@ -110,10 +162,15 @@ def _tokenize(example):

return _tokenize

# ---- NOTE: remove this eventually and move to chat templates ----

def instruction_mask_loss(
tokenizer: PreTrainedTokenizer,
response_template: str = RESPONSE_TEMPLATE,
response_template: str,
):

print(f"Applying loss masking to reponse template '{response_template}'")

# cheat, use the data collator to mask the loss tokens
response_template_ids = tokenizer.encode(
response_template, add_special_tokens=False
Expand Down
28 changes: 26 additions & 2 deletions scripts/benchmarks/scenarios-other.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,33 @@
# This file holds a list of scenarios that may be run.
# data_processing:
# dataset_name: yahma/alpaca-cleaned
# formatting: "instruct"
# tokenize: True
# input_field: input

data_processing:
dataset_name: yahma/alpaca-cleaned
formatting: "instruct"
chat_template: |
{%- for message in messages %}
{% if message['input'] != '' %}
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
{% else %}
Below is an instruction that describes a task. Write a response that appropriately completes the request.
{% endif %}
### Instruction:
{{ message['instruction'] }}
{% if message['input'] != '' %}
### Input:
{{ message['input'] }}
{% endif %}
### Response:
{{ message['output'] + eos_token }}
{% endfor %}
tokenize: True
input_field: input

scenarios:
- name: full-finetuning
Expand Down
4 changes: 3 additions & 1 deletion scripts/run_benchmarks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ SCNTAG_PEFT_AUTOGPTQ=accelerated-peft-gptq
# ------------- OTHER CONFIGS -----------------

# data will be cached in here
DATA_CACHE=data/cache.json
DATA_CACHE=data

# final result placed here
BENCH_RESULT_FILE=benchmarks.csv
Expand All @@ -45,11 +45,13 @@ NUM_GPUS_MATRIX=${1-"1 2"}
RESULT_DIR=${2:-"benchmark_outputs"}
SCENARIOS_CONFIG=${3:-$SCENARIOS_CONFIG}
SCENARIOS_FILTER=${4-$SCNTAG_PEFT_AUTOGPTQ}
DEFAULTS_CONFIG=${5:-$DEFAULTS_CONFIG}

echo "NUM_GPUS_MATRIX: $NUM_GPUS_MATRIX"
echo "RESULT_DIR: $RESULT_DIR"
echo "SCENARIOS_CONFIG: $SCENARIOS_CONFIG"
echo "SCENARIOS_FILTER: $SCENARIOS_FILTER"
echo "DEFAULTS_CONFIG: $DEFAULTS_CONFIG"
echo "MEMORY_LOGGING: $MEMORY_LOGGING"

if [ -n "$RESULT_DIR" ]; then
Expand Down

0 comments on commit c623f88

Please sign in to comment.