cleanup
Signed-off-by: Yu Chin Fabian Lim <[email protected]>
fabianlim committed Aug 1, 2024
1 parent c623f88 commit 19a9c5b
Showing 4 changed files with 43 additions and 16 deletions.
25 changes: 16 additions & 9 deletions scripts/benchmarks/benchmark.py
@@ -77,11 +77,15 @@
HF_TRAINER_LOG_GPU_STAGE_TRAIN = "train_mem_gpu"
KEYWORD_PEAKED_DELTA = "peaked_delta"
KEYWORD_ALLOC_DELTA = "alloc_delta"
HF_ARG_SKIP_MEMORY_METRIC = "--skip_memory_metrics"
HF_ARG_TRAINING_DATA_PATH = "training_data_path"
HF_ARG_RESPONSE_TEMPLATE = "response_template"
HF_ARG_SKIP_MEMORY_METRIC = "skip_memory_metrics"
RESULT_FIELD_ALLOCATED_GPU_MEM = "mem_torch_mem_alloc_in_bytes"
RESULT_FIELD_PEAK_ALLOCATED_GPU_MEM = "mem_peak_torch_mem_alloc_in_bytes"
ERROR_MESSAGES = "error_messages"

SCENARIOS_STANZA_SCN = 'scenarios'
SCENARIOS_STANZA_DATA = 'data_processing' # optional

def extract_gpu_memory_metrics(output_metrics) -> Tuple[float]:
"""
@@ -154,7 +158,7 @@ def __init__(
dataset_name: str = 'yahma/alpaca-cleaned',
dataset_split: str = "train",
formatting: str = 'instruct',
tokenize: bool = True,
tokenize: bool = False,
input_field: str = 'input',
dataset_text_field: str = 'output',
chat_template: str = None,
@@ -623,7 +627,7 @@ def get_peak_mem_usage_by_device_id(gpu_logs: pd.DataFrame):

def prepare_arguments(args, benchmark_dataset: BenchmarkDataset):
defaults = ConfigUtils.read_yaml(args.defaults_config_path)
scenarios = ConfigUtils.read_yaml(args.scenarios_config_path)["scenarios"]
scenarios = ConfigUtils.read_yaml(args.scenarios_config_path)[SCENARIOS_STANZA_SCN]
acceleration_config_map = convert_keypairs_to_map(
args.acceleration_framework_config_keypairs
)
@@ -671,13 +675,13 @@ def prepare_arguments(args, benchmark_dataset: BenchmarkDataset):
training_path = benchmark_dataset.prepare_dataset(
x['model_name_or_path'],
(
x['response_template']
if 'response_template' in x
else constants.get('response_template')
x[HF_ARG_RESPONSE_TEMPLATE]
if HF_ARG_RESPONSE_TEMPLATE in x
else constants.get(HF_ARG_RESPONSE_TEMPLATE)
)
)
# update
x['training_data_path'] = training_path
x[HF_ARG_TRAINING_DATA_PATH] = training_path

for (
num_gpus,
@@ -704,7 +708,7 @@ def generate_list_of_experiments(
expr_arg_w_outputdir = exp_arg + [
"--output_dir",
os.path.join(experiment_output_dir, hf_products_dir),
HF_ARG_SKIP_MEMORY_METRIC,
"--" + HF_ARG_SKIP_MEMORY_METRIC,
not log_memory_in_trainer,
]
expr_cls = Experiment if not dry_run else DryRunExperiment
@@ -834,7 +838,10 @@ def main(args):

# 1. Prepares a standard BenchmarkDataset
# - the preparation of the dataset is deferred to when 'prepare_dataset' is called
dataset_processing_args = ConfigUtils.read_yaml(args.scenarios_config_path)['data_processing']
# - try to read the optional data_processing stanza of the scenarios config
dataset_processing_args = ConfigUtils.read_yaml(args.scenarios_config_path).get(
SCENARIOS_STANZA_DATA, {}
)
if not args.no_data_processing:
benchmark_dataset = BenchmarkDataset(
args.dataset_save_path,
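The benchmark.py changes above centralize the scenario-config keys as constants and make the `data_processing` stanza optional by switching from a hard `['data_processing']` lookup to `.get(SCENARIOS_STANZA_DATA, {})`. A minimal sketch of that pattern follows, assuming a PyYAML-based loader and a hypothetical `read_scenarios_config` helper; neither is part of the benchmark code itself.

```python
# Sketch only: yaml loading, the helper names, and the example usage are assumptions.
import yaml

SCENARIOS_STANZA_SCN = "scenarios"
SCENARIOS_STANZA_DATA = "data_processing"  # optional stanza
HF_ARG_RESPONSE_TEMPLATE = "response_template"

def read_scenarios_config(path: str):
    with open(path, "r", encoding="utf-8") as f:
        cfg = yaml.safe_load(f)
    # the scenarios stanza is required, so a missing key should fail loudly
    scenarios = cfg[SCENARIOS_STANZA_SCN]
    # the data_processing stanza is optional: fall back to an empty dict
    data_processing = cfg.get(SCENARIOS_STANZA_DATA, {})
    return scenarios, data_processing

def resolve_response_template(scenario_args: dict, constants: dict):
    # prefer a per-scenario response_template, else fall back to the shared constant
    return scenario_args.get(
        HF_ARG_RESPONSE_TEMPLATE, constants.get(HF_ARG_RESPONSE_TEMPLATE)
    )
```

Keeping the bare argument name in `HF_ARG_SKIP_MEMORY_METRIC` also lets the same constant serve both as a dictionary key and, with a `"--"` prefix, as the CLI flag built in `generate_list_of_experiments`.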
15 changes: 11 additions & 4 deletions scripts/benchmarks/data_processing.py
@@ -11,7 +11,7 @@
def build_data_formatting_func(
tokenizer: PreTrainedTokenizer = None,
formatting: str = 'instruct',
tokenize: bool = True,
tokenize: bool = False,
input_field: str = 'input',
dataset_text_field: str = 'output',
features: List = None,
@@ -33,7 +33,7 @@ def build_data_formatting_func(
# this one uses the chat template and tokenizer
def _build_data_formatting_func(
tokenizer: PreTrainedTokenizer,
tokenize: bool = True,
tokenize: bool = False,
chat_template: str = None,
dataset_text_field: str = "output",
features: List = None,
@@ -87,7 +87,7 @@ def _combine(x):
def _build_data_formatting_func_without_chat_template(
tokenizer: PreTrainedTokenizer = None,
formatting: str = 'instruct',
tokenize: bool = True,
tokenize: bool = False,
input_field: str = 'input',
dataset_text_field: str = 'output',
features: List = None,
@@ -167,14 +167,21 @@ def _tokenize(example):
def instruction_mask_loss(
tokenizer: PreTrainedTokenizer,
response_template: str,
take_from_index: int = 2,
):

print(f"Applying loss masking to reponse template '{response_template}'")

# cheat, use the data collator to mask the loss tokens
response_template_ids = tokenizer.encode(
response_template, add_special_tokens=False
)[2:]
)

# this ignores the first take_from_index tokens of the encoded response template
if len(response_template_ids) > take_from_index:
response_template_ids = response_template_ids[take_from_index:]
print (f"Taking response_ids[{take_from_index}:] from '{len(response_template_ids)}' response tokens")

collator = DataCollatorForCompletionOnlyLM(response_template_ids, tokenizer=tokenizer, ignore_index=-100)

def collate_example(example):
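The new `take_from_index` parameter generalizes the previously hard-coded `[2:]` slice: the encoded response template drops its first few token ids before being handed to `DataCollatorForCompletionOnlyLM`, since a tokenizer can encode the template's leading characters differently at the start of a string than mid-sequence. A hedged sketch of the same flow; the model name and template text are placeholders, and `trl`/`transformers` are assumed to be installed.

```python
# Sketch only: "gpt2" and the template below are illustrative placeholders.
from transformers import AutoTokenizer
from trl import DataCollatorForCompletionOnlyLM

tokenizer = AutoTokenizer.from_pretrained("gpt2")
response_template = "\n### Response:"

# encode the template on its own, without special tokens
response_template_ids = tokenizer.encode(response_template, add_special_tokens=False)

take_from_index = 2  # same default as instruction_mask_loss above
if len(response_template_ids) > take_from_index:
    # drop the leading ids, which may not match how the template is tokenized
    # when it appears in the middle of a full training example
    response_template_ids = response_template_ids[take_from_index:]

# the collator masks everything up to and including the response template in
# the labels, so the loss is computed only on the completion tokens
collator = DataCollatorForCompletionOnlyLM(
    response_template_ids, tokenizer=tokenizer, ignore_index=-100
)
```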
@@ -1,10 +1,24 @@
# This file holds a list of scenarios that may be run.
# This file holds a sample full-finetuning scenario and
# demonstrates various pretokenization scenarios

# the data_processing stanza is optional
# - if it is missing, then the default is to use alpaca
# with instruct formatting and no tokenization

# - this is an older-style method which does not rely on
# chat templates; it will also do instruct formatting
# - but if tokenize = True, this works only if
# sft_trainer accepts a pretokenized dataset
# data_processing:
# dataset_name: yahma/alpaca-cleaned
# formatting: "instruct"
# tokenize: True
# input_field: input

# - this is the new style, which uses chat templates for formatting
# - this is the best approach to keep things flexible and
# allows many different datasets to be configured
# - tokenize can be set to either True or False
data_processing:
dataset_name: yahma/alpaca-cleaned
chat_template: |
Expand All @@ -29,6 +43,7 @@ data_processing:
{% endfor %}
tokenize: True

# scenarios
scenarios:
- name: full-finetuning
arguments:
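The scenarios file now documents two ways to drive the data processing: the older instruct-formatting path and the newer chat-template path, with `tokenize` selectable in either case. A rough illustration of what chat-template formatting does to one alpaca-style record, assuming a Jinja template; the template text, field names, and the `jinja2` dependency are illustrative stand-ins, not the benchmark's actual defaults.

```python
# Sketch only: the template and record below stand in for the YAML stanza above.
from jinja2 import Template

chat_template = (
    "### Instruction:\n{{ instruction }}\n\n"
    "{% if input %}### Input:\n{{ input }}\n\n{% endif %}"
    "### Response:\n{{ output }}"
)

record = {
    "instruction": "Summarize the text.",
    "input": "The quick brown fox jumps over the lazy dog.",
    "output": "A fox jumps over a dog.",
}

# render the formatted training text; with tokenize: True the benchmark would
# additionally pass this string through the tokenizer before training
formatted = Template(chat_template).render(**record)
print(formatted)
```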
2 changes: 0 additions & 2 deletions scripts/run_benchmarks.sh
@@ -45,13 +45,11 @@ NUM_GPUS_MATRIX=${1-"1 2"}
RESULT_DIR=${2:-"benchmark_outputs"}
SCENARIOS_CONFIG=${3:-$SCENARIOS_CONFIG}
SCENARIOS_FILTER=${4-$SCNTAG_PEFT_AUTOGPTQ}
DEFAULTS_CONFIG=${5:-$DEFAULTS_CONFIG}

echo "NUM_GPUS_MATRIX: $NUM_GPUS_MATRIX"
echo "RESULT_DIR: $RESULT_DIR"
echo "SCENARIOS_CONFIG: $SCENARIOS_CONFIG"
echo "SCENARIOS_FILTER: $SCENARIOS_FILTER"
echo "DEFAULTS_CONFIG: $DEFAULTS_CONFIG"
echo "MEMORY_LOGGING: $MEMORY_LOGGING"

if [ -n "$RESULT_DIR" ]; then
