Commit
allow for data formatting and tokenization during bench
Signed-off-by: Yu Chin Fabian Lim <[email protected]>
Showing 4 changed files with 247 additions and 55 deletions.
@@ -0,0 +1,129 @@
from trl import DataCollatorForCompletionOnlyLM
from transformers import PreTrainedTokenizer
from typing import Dict, Callable, List


PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response:"
    ),
}

RESPONSE_TEMPLATE = '### Response:'
DEFAULT_FIELDS = [
    'input_ids',
    'attention_mask',
    'labels'
]

# combine functions
# c = combine(a, b) then c(i) = b(a(i))
FUNC = Callable[[Dict], Dict]
def combine_functions(*funcs: FUNC) -> FUNC:
    def _combine(x):
        for f in funcs:
            x = f(x)
        return x

    return _combine

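# Illustrative note (not part of the original commit): combine_functions
# applies its arguments left-to-right, so the first function receives the raw
# example and each later function receives the previous one's output, e.g.
#   c = combine_functions(lambda d: {**d, "a": 1}, lambda d: {**d, "b": d["a"] + 1})
#   c({})  # -> {"a": 1, "b": 2}
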
def build_data_formatting_func(
    tokenizer: PreTrainedTokenizer = None,
    formatting: str = 'instruct',
    tokenize: bool = True,
    input_field: str = 'input',
    dataset_text_field: str = 'output',
    features: List = None,
):
    # FIFO: formatting, then tokenization, then loss masking
    funcs = []

    if features is None:
        features = set()
    # ensure the set operations below also work if a plain list of
    # feature names was passed in
    features = set(features)

    if formatting == 'instruct':
        funcs.append(
            instruction_formatter(
                input_field=input_field,
                dataset_text_field=dataset_text_field
            )
        )

    if tokenize:
        funcs.append(
            tokenization(
                tokenizer,
                dataset_text_field=dataset_text_field
            )
        )

        # loss masking needs token ids, so it only applies after tokenization
        if formatting == 'instruct':
            funcs.append(
                instruction_mask_loss(tokenizer)
            )

    if len(funcs) == 0:
        raise ValueError(
            "Unable to build a data formatting recipe"
        )

    # return the composed map function plus kwargs for Dataset.map:
    # drop every consumed column that is not a model input
    return combine_functions(*funcs), {
        'remove_columns': features.union(
            set([input_field, dataset_text_field])
        ).difference(
            set(DEFAULT_FIELDS)
        )
    }

def instruction_formatter(
    input_field: str = "input",
    dataset_text_field: str = "output"
):
    def format_fn(example: Dict):
        prompt_input, prompt_no_input = (
            PROMPT_DICT["prompt_input"],
            PROMPT_DICT["prompt_no_input"],
        )
        output = (
            prompt_input.format_map(example)
            if example.get(input_field, "") != ""
            else prompt_no_input.format_map(example)
        )
        output = f"{output} {example[dataset_text_field]}"
        return {dataset_text_field: output}

    return format_fn


def tokenization(
    tokenizer: PreTrainedTokenizer,
    dataset_text_field: str = "output"
):
    def _tokenize(example):
        text_field = example[dataset_text_field] + tokenizer.eos_token
        return tokenizer(text_field)

    return _tokenize

def instruction_mask_loss(
    tokenizer: PreTrainedTokenizer,
    response_template: str = RESPONSE_TEMPLATE,
):
    # cheat, use the data collator to mask the loss tokens
    # NOTE: the [2:] drops the leading tokens that SentencePiece-style
    # tokenizers produce when the template is encoded on its own, so the
    # ids match how '### Response:' appears inside a full example
    response_template_ids = tokenizer.encode(
        response_template, add_special_tokens=False
    )[2:]
    collator = DataCollatorForCompletionOnlyLM(
        response_template_ids, tokenizer=tokenizer, ignore_index=-100
    )

    def collate_example(example):
        # collate a single example at a time
        collated_example = collator([example], return_tensors="pt")
        # flatten the additional batch dim
        return {k: v.view(-1) for k, v in collated_example.items()}

    return collate_example
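For reference, below is a minimal sketch (not part of this commit) of how the returned map function and kwargs could be applied to the dataset named in the scenarios file further below. The tokenizer setup, the split choice, and the pad-token assignment are assumptions made for the example, not code from this diff.

# illustrative usage only -- this call site is not part of the commit
from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
tokenizer.pad_token = tokenizer.eos_token  # assumption: Mistral ships no pad token

dataset = load_dataset("yahma/alpaca-cleaned", split="train")
format_fn, kwargs = build_data_formatting_func(
    tokenizer=tokenizer,
    formatting="instruct",
    tokenize=True,
    input_field="input",
    dataset_text_field="output",
    features=set(dataset.column_names),
)
# after mapping, each example holds input_ids / attention_mask / labels,
# with the prompt portion of labels already masked to -100
dataset = dataset.map(format_fn, **kwargs)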
@@ -0,0 +1,23 @@
# This file holds two sections:
# - sft_tuning: for non-HF arguments
# - hf: for HF arguments
# TODO: consider combining them to a single list

# Below are custom arguments for sft_trainer.py
use_flash_attn: True

# Below are the transformers.TrainingArguments
include_tokens_per_second: True
num_train_epochs: 1
gradient_accumulation_steps: 1
gradient_checkpointing: True
evaluation_strategy: "no"
save_strategy: "no"
weight_decay: 0.01
warmup_steps: 10
adam_epsilon: 1e-4
lr_scheduler_type: linear
logging_strategy: steps
logging_steps: 10
max_steps: 100
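Since the keys above are flat, one plausible way a defaults file like this gets consumed is by turning every key into a --key value flag for sft_trainer.py (the custom use_flash_attn flag alongside the TrainingArguments ones). The helper below is only a sketch under that assumption; the benchmark's real loader is not part of this diff.

# illustrative only -- the benchmark's actual launcher is not shown here
import yaml

def defaults_to_cli_args(path: str) -> list:
    # turn the flat defaults YAML into ["--key", "value", ...] pairs
    with open(path) as f:
        defaults = yaml.safe_load(f)
    args = []
    for key, value in defaults.items():
        args.extend([f"--{key}", str(value)])
    return args

# e.g. defaults_to_cli_args("defaults.yaml") ->
#   ["--use_flash_attn", "True", "--include_tokens_per_second", "True", ...]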
@@ -0,0 +1,14 @@
# This file holds a list of scenarios that may be run.
data_processing:
  dataset_name: yahma/alpaca-cleaned
  formatting: "instruct"
  tokenize: True
  input_field: input

scenarios:
  - name: full-finetuning
    arguments:
      learning_rate: 2e-5
      model_name_or_path:
        - 'mistralai/Mistral-7B-v0.1'
      torch_dtype: float16
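To make the scenario schema concrete, here is a rough sketch of how such a file might be expanded into individual benchmark runs, with list-valued arguments such as model_name_or_path fanning out into one run per entry. The expansion helper is an assumption for illustration, not code from this commit.

# illustrative only -- the benchmark's real scenario expansion is not in this diff
import itertools
import yaml

def expand_scenarios(path: str):
    with open(path) as f:
        cfg = yaml.safe_load(f)

    for scenario in cfg["scenarios"]:
        args = scenario["arguments"]
        # list-valued arguments (e.g. model_name_or_path) fan out into
        # one run per combination
        list_keys = [k for k, v in args.items() if isinstance(v, list)]
        fixed = {k: v for k, v in args.items() if k not in list_keys}
        for combo in itertools.product(*(args[k] for k in list_keys)):
            yield {
                "name": scenario["name"],
                "data_processing": cfg.get("data_processing", {}),
                "arguments": {**fixed, **dict(zip(list_keys, combo))},
            }

# each yielded run would then be merged with the defaults file shown above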