cleanup
Signed-off-by: Yu Chin Fabian Lim <[email protected]>
fabianlim committed Aug 1, 2024
1 parent c623f88 commit 19a9c5b
Showing 4 changed files with 43 additions and 16 deletions.
25 changes: 16 additions & 9 deletions scripts/benchmarks/benchmark.py
@@ -77,11 +77,15 @@
HF_TRAINER_LOG_GPU_STAGE_TRAIN = "train_mem_gpu"
KEYWORD_PEAKED_DELTA = "peaked_delta"
KEYWORD_ALLOC_DELTA = "alloc_delta"
HF_ARG_SKIP_MEMORY_METRIC = "--skip_memory_metrics"
HF_ARG_TRAINING_DATA_PATH = "training_data_path"
HF_ARG_RESPONSE_TEMPLATE = "response_template"
HF_ARG_SKIP_MEMORY_METRIC = "skip_memory_metrics"
RESULT_FIELD_ALLOCATED_GPU_MEM = "mem_torch_mem_alloc_in_bytes"
RESULT_FIELD_PEAK_ALLOCATED_GPU_MEM = "mem_peak_torch_mem_alloc_in_bytes"
ERROR_MESSAGES = "error_messages"

SCENARIOS_STANZA_SCN = 'scenarios'
SCENARIOS_STANZA_DATA = 'data_processing' # optional

def extract_gpu_memory_metrics(output_metrics) -> Tuple[float]:
"""
@@ -154,7 +158,7 @@ def __init__(
dataset_name: str = 'yahma/alpaca-cleaned',
dataset_split: str = "train",
formatting: str = 'instruct',
tokenize: bool = True,
tokenize: bool = False,
input_field: str = 'input',
dataset_text_field: str = 'output',
chat_template: str = None,
@@ -623,7 +627,7 @@ def get_peak_mem_usage_by_device_id(gpu_logs: pd.DataFrame):

def prepare_arguments(args, benchmark_dataset: BenchmarkDataset):
defaults = ConfigUtils.read_yaml(args.defaults_config_path)
scenarios = ConfigUtils.read_yaml(args.scenarios_config_path)["scenarios"]
scenarios = ConfigUtils.read_yaml(args.scenarios_config_path)[SCENARIOS_STANZA_SCN]
acceleration_config_map = convert_keypairs_to_map(
args.acceleration_framework_config_keypairs
)
@@ -671,13 +675,13 @@ def prepare_arguments(args, benchmark_dataset: BenchmarkDataset):
training_path = benchmark_dataset.prepare_dataset(
x['model_name_or_path'],
(
x['response_template']
if 'response_template' in x
else constants.get('response_template')
x[HF_ARG_RESPONSE_TEMPLATE]
if HF_ARG_RESPONSE_TEMPLATE in x
else constants.get(HF_ARG_RESPONSE_TEMPLATE)
)
)
# update
x['training_data_path'] = training_path
x[HF_ARG_TRAINING_DATA_PATH] = training_path

for (
num_gpus,
@@ -704,7 +708,7 @@ def generate_list_of_experiments(
expr_arg_w_outputdir = exp_arg + [
"--output_dir",
os.path.join(experiment_output_dir, hf_products_dir),
HF_ARG_SKIP_MEMORY_METRIC,
"--" + HF_ARG_SKIP_MEMORY_METRIC,
not log_memory_in_trainer,
]
expr_cls = Experiment if not dry_run else DryRunExperiment
@@ -834,7 +838,10 @@ def main(args):

# 1. Prepares a standard BenchmarkDataset
# - the preparation of the dataset is deferred to when 'prepare_dataset' is called
dataset_processing_args = ConfigUtils.read_yaml(args.scenarios_config_path)['data_processing']
# - try to read the optional data_processing stanza of the scenarios config
dataset_processing_args = ConfigUtils.read_yaml(args.scenarios_config_path).get(
SCENARIOS_STANZA_DATA, {}
)
if not args.no_data_processing:
benchmark_dataset = BenchmarkDataset(
args.dataset_save_path,
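The benchmark.py changes above centralize the scenario-config keys as constants and make the `data_processing` stanza optional by switching from a hard `['data_processing']` lookup to `.get(SCENARIOS_STANZA_DATA, {})`. A minimal sketch of that pattern follows, assuming a PyYAML-based loader and a hypothetical `read_scenarios_config` helper; neither is part of the benchmark code itself.

```python
# Sketch only: yaml loading, the helper names, and the example usage are assumptions.
import yaml

SCENARIOS_STANZA_SCN = "scenarios"
SCENARIOS_STANZA_DATA = "data_processing"  # optional stanza
HF_ARG_RESPONSE_TEMPLATE = "response_template"

def read_scenarios_config(path: str):
    with open(path, "r", encoding="utf-8") as f:
        cfg = yaml.safe_load(f)
    # the scenarios stanza is required, so a missing key should fail loudly
    scenarios = cfg[SCENARIOS_STANZA_SCN]
    # the data_processing stanza is optional: fall back to an empty dict
    data_processing = cfg.get(SCENARIOS_STANZA_DATA, {})
    return scenarios, data_processing

def resolve_response_template(scenario_args: dict, constants: dict):
    # prefer a per-scenario response_template, else fall back to the shared constant
    return scenario_args.get(
        HF_ARG_RESPONSE_TEMPLATE, constants.get(HF_ARG_RESPONSE_TEMPLATE)
    )
```

Keeping the bare argument name in `HF_ARG_SKIP_MEMORY_METRIC` also lets the same constant serve both as a dictionary key and, with a `"--"` prefix, as the CLI flag built in `generate_list_of_experiments`.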
15 changes: 11 additions & 4 deletions scripts/benchmarks/data_processing.py
@@ -11,7 +11,7 @@
def build_data_formatting_func(
tokenizer: PreTrainedTokenizer = None,
formatting: str = 'instruct',
tokenize: bool = True,
tokenize: bool = False,
input_field: str = 'input',
dataset_text_field: str = 'output',
features: List = None,
@@ -33,7 +33,7 @@ def build_data_formatting_func(
# this one uses the chat template and tokenizer
def _build_data_formatting_func(
tokenizer: PreTrainedTokenizer,
tokenize: bool = True,
tokenize: bool = False,
chat_template: str = None,
dataset_text_field: str = "output",
features: List = None,
@@ -87,7 +87,7 @@ def _combine(x):
def _build_data_formatting_func_without_chat_template(
tokenizer: PreTrainedTokenizer = None,
formatting: str = 'instruct',
tokenize: bool = True,
tokenize: bool = False,
input_field: str = 'input',
dataset_text_field: str = 'output',
features: List = None,
@@ -167,14 +167,21 @@ def _tokenize(example):
def instruction_mask_loss(
tokenizer: PreTrainedTokenizer,
response_template: str,
take_from_index: int = 2,
):

print(f"Applying loss masking to reponse template '{response_template}'")

# cheat, use the data collator to mask the loss tokens
response_template_ids = tokenizer.encode(
response_template, add_special_tokens=False
)[2:]
)

# this ignores the first take_from_index tokens of the encoded response template
if len(response_template_ids) > take_from_index:
response_template_ids = response_template_ids[take_from_index:]
print (f"Taking response_ids[{take_from_index}:] from '{len(response_template_ids)}' response tokens")

collator = DataCollatorForCompletionOnlyLM(response_template_ids, tokenizer=tokenizer, ignore_index=-100)

def collate_example(example):
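The new `take_from_index` parameter generalizes the previously hard-coded `[2:]` slice: the encoded response template drops its first few token ids before being handed to `DataCollatorForCompletionOnlyLM`, since a tokenizer can encode the template's leading characters differently at the start of a string than mid-sequence. A hedged sketch of the same flow; the model name and template text are placeholders, and `trl`/`transformers` are assumed to be installed.

```python
# Sketch only: "gpt2" and the template below are illustrative placeholders.
from transformers import AutoTokenizer
from trl import DataCollatorForCompletionOnlyLM

tokenizer = AutoTokenizer.from_pretrained("gpt2")
response_template = "\n### Response:"

# encode the template on its own, without special tokens
response_template_ids = tokenizer.encode(response_template, add_special_tokens=False)

take_from_index = 2  # same default as instruction_mask_loss above
if len(response_template_ids) > take_from_index:
    # drop the leading ids, which may not match how the template is tokenized
    # when it appears in the middle of a full training example
    response_template_ids = response_template_ids[take_from_index:]

# the collator masks everything up to and including the response template in
# the labels, so the loss is computed only on the completion tokens
collator = DataCollatorForCompletionOnlyLM(
    response_template_ids, tokenizer=tokenizer, ignore_index=-100
)
```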
@@ -1,10 +1,24 @@
# This file holds a list of scenarios that may be run.
# This file holds a sample full-finetuning scenario and
# demonstrates various pretokenization scenarios

# the data_processing stanza is optional
# - if it is missing, then the default is to use alpaca
# with instruct formatting and no tokenization

# - this is an older-style method which does not rely on
# chat templates; it will also do instruct formatting
# - but if tokenize = True, this works only if
# sft_trainer accepts a pretokenized dataset
# data_processing:
# dataset_name: yahma/alpaca-cleaned
# formatting: "instruct"
# tokenize: True
# input_field: input

# - this is the new style, which uses chat templates for formatting
# - this is the best approach to keep things flexible and
# allows many different datasets to be configured
# - tokenize can be set to either True or False
data_processing:
dataset_name: yahma/alpaca-cleaned
chat_template: |
Expand All @@ -29,6 +43,7 @@ data_processing:
{% endfor %}
tokenize: True

# scenarios
scenarios:
- name: full-finetuning
arguments:
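The scenarios file now documents two ways to drive the data processing: the older instruct-formatting path and the newer chat-template path, with `tokenize` selectable in either case. A rough illustration of what chat-template formatting does to one alpaca-style record, assuming a Jinja template; the template text, field names, and the `jinja2` dependency are illustrative stand-ins, not the benchmark's actual defaults.

```python
# Sketch only: the template and record below stand in for the YAML stanza above.
from jinja2 import Template

chat_template = (
    "### Instruction:\n{{ instruction }}\n\n"
    "{% if input %}### Input:\n{{ input }}\n\n{% endif %}"
    "### Response:\n{{ output }}"
)

record = {
    "instruction": "Summarize the text.",
    "input": "The quick brown fox jumps over the lazy dog.",
    "output": "A fox jumps over a dog.",
}

# render the formatted training text; with tokenize: True the benchmark would
# additionally pass this string through the tokenizer before training
formatted = Template(chat_template).render(**record)
print(formatted)
```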
2 changes: 0 additions & 2 deletions scripts/run_benchmarks.sh
@@ -45,13 +45,11 @@ NUM_GPUS_MATRIX=${1-"1 2"}
RESULT_DIR=${2:-"benchmark_outputs"}
SCENARIOS_CONFIG=${3:-$SCENARIOS_CONFIG}
SCENARIOS_FILTER=${4-$SCNTAG_PEFT_AUTOGPTQ}
DEFAULTS_CONFIG=${5:-$DEFAULTS_CONFIG}

echo "NUM_GPUS_MATRIX: $NUM_GPUS_MATRIX"
echo "RESULT_DIR: $RESULT_DIR"
echo "SCENARIOS_CONFIG: $SCENARIOS_CONFIG"
echo "SCENARIOS_FILTER: $SCENARIOS_FILTER"
echo "DEFAULTS_CONFIG: $DEFAULTS_CONFIG"
echo "MEMORY_LOGGING: $MEMORY_LOGGING"

if [ -n "$RESULT_DIR" ]; then
