From e59197130632faca7d5118efeae8325df9dfc24f Mon Sep 17 00:00:00 2001
From: Andrew Theurer
Date: Fri, 15 Nov 2024 17:57:25 -0500
Subject: [PATCH] initial script, without functioning post-processing

---
 README.md            |   4 +-
 multiplex.json       |  13 +++++
 pytorch-base         |   3 +
 pytorch-client       |  55 ++++++++++++++++
 pytorch-get-runtime  |   6 ++
 pytorch-post-process | 129 +++++++++++++++++++++++++++++++++++++++++++
 rickshaw.json        |  27 ++++++++
 workshop.json        |  18 ++++++
 8 files changed, 254 insertions(+), 1 deletion(-)
 create mode 100644 multiplex.json
 create mode 100755 pytorch-base
 create mode 100755 pytorch-client
 create mode 100755 pytorch-get-runtime
 create mode 100755 pytorch-post-process
 create mode 100644 rickshaw.json
 create mode 100644 workshop.json

diff --git a/README.md b/README.md
index 49d65a8..d6a3776 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,4 @@
 # bench-pytorch
-wrapper for pytorch benchmark
+benchmark automation for pytorch-benchmark
+
+This project works with crucible automation to run the pytorch benchmark. Please see the run-pytorch.json file for example usage. That file is used with "crucible run --from-file run-pytorch.json" on your host.
diff --git a/multiplex.json b/multiplex.json
new file mode 100644
index 0000000..d40a38c
--- /dev/null
+++ b/multiplex.json
@@ -0,0 +1,13 @@
+{
+    "presets": {
+    },
+    "validations": {
+        "models" : {
+            "description" : "valid models",
+            "args" : [
+                "model"
+            ],
+            "vals" : "BERT_pytorch|fastNLP_Bert|hf_Bert_large|hf_BigBird|hf_DistilBert|hf_GPT2|hf_GPT2_large|hf_Longformer|hf_Roberta_base|hf_Reformer|llama|llama_v2_7b_16h|llava|hf_Whisper|hf_T5_large|hf_T5|nanogpt|timm_vision_transformer|timm_vision_transformer_large"
+        }
+    }
+}
diff --git a/pytorch-base b/pytorch-base
new file mode 100755
index 0000000..acfa336
--- /dev/null
+++ b/pytorch-base
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+if ! source ${TOOLBOX_HOME}/bash/library/bench-base; then echo "ERROR: Could not source bench-base from \$TOOLBOX_HOME [${TOOLBOX_HOME}]"; exit 1; fi
diff --git a/pytorch-client b/pytorch-client
new file mode 100755
index 0000000..ae7a15a
--- /dev/null
+++ b/pytorch-client
@@ -0,0 +1,55 @@
+#!/bin/bash
+# -*- mode: sh; indent-tabs-mode: nil; sh-basic-offset: 4 -*-
+# vim: autoindent tabstop=4 shiftwidth=4 expandtab softtabstop=4 filetype=bash
+exec >pytorch-client-stderrout.txt
+exec 2>&1
+
+. /usr/bin/pytorch-base || { echo "/usr/bin/pytorch-base not found"; exit 1; }
+
+model="llama"
+
+python -c 'import torch; print(torch.__version__)' >pytorch-version.txt
+
+pwd=`/bin/pwd`
+pushd /opt/app-root/lib/python3.11/site-packages
+find . -name METADATA | cpio -pdumv $pwd/pytorch-site-packages-metadata
+popd
+
+longopts=""
+longopts+=" model:"
+
+opts=$(getopt -q -o "" --longoptions "$longopts" -n "getopt.sh" -- "$@");
+if [ $? -ne 0 ]; then
+    printf -- "\tUnrecognized option specified\n\n"
+    exit 1
+fi
+eval set -- "$opts";
+while true; do
+    arg=$1; shift
+    val=$1; shift
+    case "$arg" in
+        --model)
+            model="$val"
+            ;;
+        --)
+            break
+            ;;
+        *)
+            echo "Invalid option: [$arg]"
+            exit 1
+    esac
+done
+
+
+echo "model: $model"
+git clone https://github.com/pytorch/benchmark.git
+pushd benchmark
+pip uninstall -y nvidia-ml-py
+pip install pynvml
+python run_benchmark.py --help
+date +%s >begin.txt
+python run_benchmark.py test_bench -m $model >run_benchmark_output.txt
+rc=$?
+date +%s >end.txt
+popd
+exit $rc
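
Note: outside of a full crucible run, pytorch-client can be exercised by hand to sanity-check a model choice. A minimal sketch, assuming the container image already provides /usr/bin/pytorch-base and a working pytorch install (hf_GPT2 is one of the models validated by multiplex.json):

    # start the client the same way rickshaw would, then inspect its artifacts
    ./pytorch-client --model hf_GPT2
    cat benchmark/run_benchmark_output.txt      # raw output from run_benchmark.py
    cat benchmark/begin.txt benchmark/end.txt   # epoch timestamps bracketing the run
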
diff --git a/pytorch-get-runtime b/pytorch-get-runtime
new file mode 100755
index 0000000..8aadf7b
--- /dev/null
+++ b/pytorch-get-runtime
@@ -0,0 +1,6 @@
+#!/bin/bash
+# vim: autoindent tabstop=4 shiftwidth=4 expandtab softtabstop=4 filetype=bash
+# -*- mode: sh; indent-tabs-mode: nil; sh-basic-offset: 4 -*-
+
+# request unbounded runtime by returning -1
+echo "-1"
diff --git a/pytorch-post-process b/pytorch-post-process
new file mode 100755
index 0000000..c31a7bb
--- /dev/null
+++ b/pytorch-post-process
@@ -0,0 +1,129 @@
+#!/usr/bin/env python3
+# -*- mode: python; indent-tabs-mode: nil; python-indent-level: 4 -*-
+# vim: autoindent tabstop=4 shiftwidth=4 expandtab softtabstop=4 filetype=python
+
+import sys
+import os
+import lzma
+import re
+import copy
+import math
+import json
+import yaml
+import argparse
+import glob
+from datetime import datetime
+from pathlib import Path
+
+TOOLBOX_HOME = os.environ.get('TOOLBOX_HOME')
+if TOOLBOX_HOME is None:
+    print("This script requires libraries that are provided by the toolbox project.")
+    print("Toolbox can be acquired from https://github.com/perftool-incubator/toolbox and")
+    print("then use 'export TOOLBOX_HOME=/path/to/toolbox' so that it can be located.")
+    exit(1)
+else:
+    p = Path(TOOLBOX_HOME) / 'python'
+    if not p.exists() or not p.is_dir():
+        print("ERROR: <TOOLBOX_HOME>/python ('%s') does not exist!" % (p))
+        exit(2)
+    sys.path.append(str(p))
+from toolbox.metrics import log_sample
+from toolbox.metrics import finish_samples
+
+params = {}
+
+class t_global(object):
+    args = None
+
+def process_options():
+    parser = argparse.ArgumentParser(description = 'Post process raw benchmark data into Common Data Model output')
+
+    parser.add_argument('--model',
+                        dest = 'model',
+                        help = 'name of the pytorch-benchmark model that was run',
+                        default = "llama"
+                        )
+
+    t_global.args, unknown = parser.parse_known_args()
+
+    return()
+
+def main():
+    process_options()
+    if t_global.args.model == '':
+        print('model was not defined, exiting')
+        return(1)
+
+    # In any benchmark post-process script, the metrics generated need to be attributed to a
+    # time-period (AKA benchmark-phase). The period which is used to report an official
+    # result for the benchmark is the 'measurement' period. Other periods that may exist
+    # could be "warm-up", "prep", etc.
+
+    iter_sample = {
+        'primary-period': "measurement",
+        'benchmark': "pytorch",
+        'periods': [],
+        'rickshaw-bench-metric': { 'schema': { 'version': '2021.04.12' } }
+    }
+
+    metric_files = []
+
+    first_ts = None
+    last_ts = None
+    phase = 0 # only a single phase is processed for now
+    period = { 'name': 'phase' + str(phase), 'metric-files': [] }
+    file_id = 'phase' + str(phase)
+    desc = {'source' : 'pytorch', 'class': 'throughput'}
+    names = {}
+    desc['type'] = 'train-samples-sec'
+    filename = 'e2e/phase' + str(phase) + '/checkpoints/training_params_and_metrics_global0.jsonl.xz'
+    print('Opening ' + filename)
+    this_period_first_ts = None
+    this_period_last_ts = None
+    with lzma.open(filename, 'rt') as file:
+        for line in file:
+            d = json.loads(line)
+            # file contents to parse (per line):
+            #{"epoch": 0, "step": 1, "rank": 0,
+            # "loss": 0.18146394193172455,
+            # "overall_throughput": 3.5244029279710176,
+            # "lr": 0.0, "cuda_mem_allocated": 14.08400821685791,
+            # "cuda_malloc_retries": 0,
+            # "num_loss_counted_tokens": 4940, "batch_size": 14,
+            # "total_loss": 0.4069821238517761, "gradnorm": null,
+            # "weight_norm": 557.9681396484375,
+            # "timestamp": "2024-07-18T22:46:41.628932"}
+            if 'epoch' in d.keys():
+                dt = datetime.strptime(d['timestamp'], '%Y-%m-%dT%H:%M:%S.%f')
+                ts = math.floor(dt.timestamp() * 1000)
+                if this_period_first_ts is None:
+                    this_period_first_ts = ts
+                if first_ts is None:
+                    first_ts = ts
+                sample = {'end': ts, 'value': d['overall_throughput']}
+                log_sample(file_id, desc, names, sample)
+                last_ts = ts
+                this_period_last_ts = ts
+    metric_file_name = finish_samples()
+    period['metric-files'].append(metric_file_name)
+    iter_sample['periods'].append(period)
+
+    # Now create the primary metric and the primary-period
+    iter_sample['primary-metric'] = 'actual-train-seconds'
+    period = { 'name': 'measurement', 'metric-files': [] }
+    file_id = 'measurement'
+    desc = {'source' : 'pytorch', 'class': 'count', 'type': 'actual-train-seconds'}
+    names = {}
+    sample = {'begin': first_ts, 'end': last_ts, 'value': (last_ts - first_ts) / 1000}
+    log_sample(file_id, desc, names, sample)
+    metric_file_name = finish_samples()
+    period['metric-files'].append(metric_file_name)
+    iter_sample['periods'].append(period)
+
+    with open('post-process-data.json', 'w') as f:
+        f.write(json.dumps(iter_sample))
+    return(0)
+
+
+if __name__ == "__main__":
+    exit(main())
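
Note: once the parsing is wired to the actual run_benchmark.py output, the post-process-data.json written by this script should have roughly the following shape, derived from the iter_sample structure above. The metric-files entries are whatever toolbox's finish_samples() returns; the names below are placeholders:

    {
        "primary-period": "measurement",
        "primary-metric": "actual-train-seconds",
        "benchmark": "pytorch",
        "rickshaw-bench-metric": { "schema": { "version": "2021.04.12" } },
        "periods": [
            { "name": "phase0",      "metric-files": [ "<file-from-finish_samples>" ] },
            { "name": "measurement", "metric-files": [ "<file-from-finish_samples>" ] }
        ]
    }
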
diff --git a/rickshaw.json b/rickshaw.json
new file mode 100644
index 0000000..1298206
--- /dev/null
+++ b/rickshaw.json
@@ -0,0 +1,27 @@
+{
+    "rickshaw-benchmark": {
+        "schema": { "version": "2020.05.18" }
+    },
+    "benchmark": "pytorch",
+    "controller" : {
+        "post-script" : "%bench-dir%/pytorch-post-process"
+    },
+    "client" : {
+        "files-from-controller": [
+            {
+                "src": "%bench-dir%/pytorch-get-runtime",
+                "dest": "/usr/bin/"
+            },
+            {
+                "src": "%bench-dir%/pytorch-base",
+                "dest": "/usr/bin/"
+            },
+            {
+                "src": "%bench-dir%/pytorch-client",
+                "dest": "/usr/bin/"
+            }
+        ],
+        "start": "pytorch-client",
+        "runtime": "pytorch-get-runtime"
+    }
+}
diff --git a/workshop.json b/workshop.json
new file mode 100644
index 0000000..1d59a19
--- /dev/null
+++ b/workshop.json
@@ -0,0 +1,18 @@
+{
+    "workshop": {
+        "schema": {
+            "version": "2020.03.02"
+        }
+    },
+    "userenvs": [
+        {
+            "name": "default",
+            "requirements": []
+        },
+        {
+            "name": "rhel-ai",
+            "requirements": []
+        }
+    ],
+    "requirements": []
+}
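
Note: the README refers to a run-pytorch.json that is not part of this commit. Until it lands, the following is a hypothetical minimal run file; the mv-params layout shown is an assumption about crucible's run-file schema (check every key below against the crucible documentation), with the model value drawn from the list validated by multiplex.json:

    {
        "benchmarks": [
            {
                "name": "pytorch",
                "mv-params": {
                    "global-options": [
                        {
                            "name": "common-params",
                            "params": [
                                { "arg": "model", "vals": [ "hf_GPT2" ] }
                            ]
                        }
                    ],
                    "sets": [ { "include": "common-params" } ]
                }
            }
        ]
    }

It would then be submitted with the command the README already documents:

    crucible run --from-file run-pytorch.json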