Skip to content

Commit

Permalink
Merge pull request #1 from perftool-incubator/init1
Browse files Browse the repository at this point in the history
initial script, without functioning post-processing
  • Loading branch information
atheurer authored Nov 15, 2024
2 parents b5c3f3c + e591971 commit 03a57ed
Show file tree
Hide file tree
Showing 8 changed files with 259 additions and 1 deletion.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
# bench-pytorch
wrapper for pytorch benchmark
benchmark automation for pytorch-benchmark

This project works with crucible automation to run the pytorch benchmark. Please see the run-pytorch.json file for example usage. This file is used with "crucible run --from-file run-pytorch.json" on your host.
13 changes: 13 additions & 0 deletions multiplex.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"presets": {
},
"validations": {
"models" : {
"description" : "valid models",
"args" : [
"model"
],
"vals" : "BERT_pytorch|fastNLP_Bert|hf_Bert_large|hf_BigBird|hf_DistilBert|hf_GPT2|hf_GPT2_large|hf_Longformer|hf_Roberta_base|hf_Reformer|llama|llama_v2_7b_16h|llava|hf_Whisper|hf_T5_large|hf_T5|nanogpt|timm_vision_transformer|timm_vision_transformer_large"
}
}
}
3 changes: 3 additions & 0 deletions pytorch-base
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash

# Source bench-base from the toolbox checkout that $TOOLBOX_HOME points at;
# stop with an error when it cannot be sourced.
source ${TOOLBOX_HOME}/bash/library/bench-base || {
    echo "ERROR: Could not source bench-base from \$TOOLBOX_HOME [${TOOLBOX_HOME}]"
    exit 1
}
55 changes: 55 additions & 0 deletions pytorch-client
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/bin/bash
# -*- mode: sh; indent-tabs-mode: nil; sh-basic-offset: 4 -*-
# vim: autoindent tabstop=4 shiftwidth=4 expandtab softtabstop=4 filetype=bash
# Send everything this script prints (stdout and stderr) to a single log file.
exec >pytorch-client-stderrout.txt
exec 2>&1

# A brace group is required here: "exit" inside ( ... ) only leaves the
# subshell, which let the script carry on after a failed source.
. /usr/bin/pytorch-base || { echo "/usr/bin/pytorch-base not found"; exit 1; }

# Default model, used when --model is not passed.
model="llama"

# NOTE(review): assumes a "pytorch" executable exists and prints its version
# as the third field of "--version" -- confirm against the container image.
pytorch --version | awk '{print $3}' >pytorch-version.txt

# Copy the METADATA file of every installed python package next to the results.
# Only run find/cpio when the directory change worked, so a missing
# site-packages directory cannot make find walk the current directory instead.
pwd=$(/bin/pwd)
if pushd /opt/app-root/lib/python3.11/site-packages; then
    find . -name METADATA | cpio -pdumv "$pwd/instructlab-site-packages-metadata"
    popd
fi

longopts=""
longopts+=" model:"

opts=$(getopt -q -o "" --longoptions "$longopts" -n "getopt.sh" -- "$@")
if [ $? -ne 0 ]; then
    printf -- "\tUnrecognized option specified\n\n"
    exit 1
fi
eval set -- "$opts"
while true; do
    case "$1" in
        --model)
            model="$2"
            shift 2
            ;;
        --)
            shift
            break
            ;;
        *)
            echo "Invalid option: [$1]"
            exit 1
            ;;
    esac
done

# Report the model that was actually selected (this used to print the
# undefined variable $workflow, i.e. always an empty value).
echo "model: $model"

git clone https://github.com/pytorch/benchmark.git
# Without the checkout none of the following commands can work.
pushd benchmark || { echo "Could not enter the benchmark directory"; exit 1; }
pip uninstall -y nvidia-ml-py
pip install pynvml
python run_benchmark.py --help
date +%s >begin.txt
python run_benchmark.py test_bench -m "$model" >run_benchmark_output.txt
# Capture the benchmark's exit status immediately; reading $? after the
# following "date" would report date's status instead.
rc=$?
date +%s >end.txt
popd
exit $rc
6 changes: 6 additions & 0 deletions pytorch-get-runtime
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
# vim: autoindent tabstop=4 shiftwidth=4 expandtab softtabstop=4 filetype=bash
# -*- mode: sh; indent-tabs-mode: nil; sh-basic-offset: 4 -*-

# Printing -1 requests an unbounded runtime.
printf '%s\n' "-1"
134 changes: 134 additions & 0 deletions pytorch-post-process
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
#!/usr/bin/env python3
# -*- mode: python; indent-tabs-mode: nil; python-indent-level: 4 -*-
# vim: autoindent tabstop=4 shiftwidth=4 expandtab softtabstop=4 filetype=python

import sys
import os
import lzma
import re
import copy
import math
import json
import yaml
import argparse
import glob
from datetime import datetime
from pathlib import Path

# Locate the toolbox project through $TOOLBOX_HOME and put its python directory
# on sys.path so that toolbox.metrics can be imported below.
# sys.exit() is used instead of the bare exit() helper, which is injected by
# the site module and is intended for interactive sessions.
TOOLBOX_HOME = os.environ.get('TOOLBOX_HOME')
if TOOLBOX_HOME is None:
    print("This script requires libraries that are provided by the toolbox project.")
    print("Toolbox can be acquired from https://github.com/perftool-incubator/toolbox and")
    print("then use 'export TOOLBOX_HOME=/path/to/toolbox' so that it can be located.")
    sys.exit(1)
else:
    p = Path(TOOLBOX_HOME) / 'python'
    # is_dir() is already False for a path that does not exist
    if not p.is_dir():
        print("ERROR: <TOOLBOX_HOME>/python ('%s') does not exist!" % (p))
        sys.exit(2)
    sys.path.append(str(p))
from toolbox.metrics import log_sample
from toolbox.metrics import finish_samples

params = {}

class t_global:
    """Namespace for state shared between the functions of this script."""

    # Parsed command line arguments; stays None until process_options() runs.
    args = None

def process_options():
    """Parse the command line and store the result in ``t_global.args``.

    The only declared option is ``--model`` (default ``"llama"``). Arguments
    that are not declared are collected separately and then ignored.

    Returns:
        An empty tuple.
    """
    arg_parser = argparse.ArgumentParser(
        description='Post process raw benchmark data into Common Data Model output')
    arg_parser.add_argument('--model', dest='model', help='', default="llama")

    # parse_known_args() does not fail on options this script does not declare
    known_args, _ignored = arg_parser.parse_known_args()
    t_global.args = known_args

    return ()

def main():
    """Convert the raw benchmark output into metric files and post-process-data.json.

    Every training-metrics file found below ``e2e/phase*/checkpoints/`` becomes
    one period holding a 'train-samples-sec' throughput metric. A final
    'measurement' period carries the primary metric 'actual-train-seconds',
    the time between the earliest and the latest sample that was read.

    Returns:
        0 on success, 1 when no model was given or no samples were found.
    """
    process_options()
    # process_options() only defines --model; there is no 'workflow' option,
    # so reading t_global.args.workflow raised AttributeError.
    if not t_global.args.model:
        print('model was not defined, exiting')
        return 1

    # In any benchmark post-process script, the metrics generated need to be attributed to a
    # time-period (AKA benchmark-phase). The period which is used to report an official
    # result for the benchmark is the 'measurement' period. Other periods that may exist
    # could be "warm-up", "prep", etc.

    iter_sample = {
        'primary-period': "measurement",
        'benchmark': "pytorch",
        'periods': [],
        'rickshaw-bench-metric': { 'schema': { 'version': '2021.04.12' } }
    }

    first_ts = None
    last_ts = None

    # NOTE(review): the original code used a variable 'phase' that was never
    # defined. The phases are discovered from the files that exist instead;
    # confirm that this layout is what the pytorch client is meant to produce.
    metrics_glob = 'phase*/checkpoints/training_params_and_metrics_global0.jsonl.xz'
    for metrics_path in sorted(Path('e2e').glob(metrics_glob)):
        # second path component, e.g. 'phase1' for e2e/phase1/checkpoints/...
        phase_name = metrics_path.parts[1]
        period = { 'name': phase_name, 'metric-files': [] }
        file_id = phase_name
        desc = {'source' : 'pytorch', 'class': 'throughput', 'type': 'train-samples-sec'}
        names = {}
        filename = str(metrics_path)
        print('Opening ' + filename)
        # The file is xz-compressed, so it has to be read through lzma.
        with lzma.open(filename, 'rt') as file:
            for line in file:
                if not line.strip():
                    continue
                d = json.loads(line)
                # file contents to parse (per line):
                #{"epoch": 0, "step": 1, "rank": 0,
                # "loss": 0.18146394193172455,
                # "overall_throughput": 3.5244029279710176,
                # "lr": 0.0, "cuda_mem_allocated": 14.08400821685791,
                # "cuda_malloc_retries": 0,
                # "num_loss_counted_tokens": 4940, "batch_size": 14,
                # "total_loss": 0.4069821238517761, "gradnorm": null,
                # "weight_norm": 557.9681396484375,
                # "timestamp": "2024-07-18T22:46:41.628932"}
                if 'epoch' not in d:
                    continue
                # Explicit %H:%M:%S instead of %X, whose meaning depends on the locale.
                # NOTE(review): the timestamp carries no UTC offset, so it is
                # interpreted in the local timezone -- confirm this is intended.
                dt = datetime.strptime(d['timestamp'], '%Y-%m-%dT%H:%M:%S.%f')
                ts = math.floor(dt.timestamp() * 1000)  # milliseconds
                first_ts = ts if first_ts is None else min(first_ts, ts)
                last_ts = ts if last_ts is None else max(last_ts, ts)
                sample = {'end': ts, 'value': d['overall_throughput']}
                log_sample(file_id, desc, names, sample)
        metric_file_name = finish_samples()
        period['metric-files'].append(metric_file_name)
        iter_sample['periods'].append(period)

    # Without any sample there are no timestamps to build the primary metric from.
    if first_ts is None:
        print('no throughput samples were found, exiting')
        return 1

    # Now create the primary metric and the primary-period
    iter_sample['primary-metric'] = 'actual-train-seconds'
    period = { 'name': 'measurement', 'metric-files': [] }
    file_id = 'measurement'
    desc = {'source' : 'pytorch', 'class': 'count', 'type': 'actual-train-seconds'}
    names = {}
    sample = {'begin': first_ts, 'end': last_ts, 'value': (last_ts - first_ts) / 1000}
    log_sample(file_id, desc, names, sample)
    metric_file_name = finish_samples()
    period['metric-files'].append(metric_file_name)
    # The period is appended once; the original repeated this finish/append
    # step and so listed the 'measurement' period twice.
    iter_sample['periods'].append(period)

    # The with-block closes the file; the original 'f.close' never called close().
    with open('post-process-data.json', 'w') as f:
        f.write(json.dumps(iter_sample))
    return 0


if __name__ == "__main__":
    # Hand main()'s return value to the shell as the exit status. sys.exit()
    # is used because the bare exit() helper is injected by the site module
    # and is intended for interactive sessions.
    sys.exit(main())
27 changes: 27 additions & 0 deletions rickshaw.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{
"rickshaw-benchmark": {
"schema": { "version": "2020.05.18" }
},
"benchmark": "pytorch",
"controller" : {
"post-script" : "%bench-dir%/pytorch-post-process"
},
"client" : {
"files-from-controller": [
{
"src": "%bench-dir%/pytorch-get-runtime",
"dest": "/usr/bin/"
},
{
"src": "%bench-dir%/pytorch-base",
"dest": "/usr/bin/"
},
{
"src": "%bench-dir%/pytorch-client",
"dest": "/usr/bin/"
}
],
"start": "pytorch-client",
"runtime": "pytorch-get-runtime"
}
}
18 changes: 18 additions & 0 deletions workshop.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"workshop": {
"schema": {
"version": "2020.03.02"
}
},
"userenvs": [
{
"name": "default",
"requirements": []
},
{
"name": "rhel-ai",
"requirements": []
}
],
"requirements": []
}

0 comments on commit 03a57ed

Please sign in to comment.