Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

initial script, without functioning post-processing #1

Merged
merged 1 commit into from
Nov 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
# bench-pytorch
wrapper for pytorch benchmark
benchmark automation for pytorch-benchmark

This project works with crucible automation to run the pytorch benchmark. Please see the run-pytorch.json file for example usage. This file is used with "crucible run --from-file run-pytorch.json" on your host.
13 changes: 13 additions & 0 deletions multiplex.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"presets": {
},
"validations": {
"models" : {
"description" : "valid models",
"args" : [
"model"
],
"vals" : "BERT_pytorch|fastNLP_Bert|hf_Bert_large|hf_BigBird|hf_DistilBert|hf_GPT2|hf_GPT2_large|hf_Longformer|hf_Roberta_base|hf_Reformer|llama|llama_v2_7b_16h|llava|hf_Whisper|hf_T5_large|hf_T5|nanogpt|timm_vision_transformer|timm_vision_transformer_large"
}
}
}
3 changes: 3 additions & 0 deletions pytorch-base
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash

# Source bench-base from the bash library directory under $TOOLBOX_HOME and
# stop with an error if that fails.
source ${TOOLBOX_HOME}/bash/library/bench-base || {
    echo "ERROR: Could not source bench-base from \$TOOLBOX_HOME [${TOOLBOX_HOME}]"
    exit 1
}
55 changes: 55 additions & 0 deletions pytorch-client
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/bin/bash
# -*- mode: sh; indent-tabs-mode: nil; sh-basic-offset: 4 -*-
# vim: autoindent tabstop=4 shiftwidth=4 expandtab softtabstop=4 filetype=bash
# Send everything this script prints (stdout and stderr) to a single log file.
exec >pytorch-client-stderrout.txt
exec 2>&1

# A brace group is required here: "exit" inside "( ... )" only leaves the
# subshell, so the script would keep running without pytorch-base.
. /usr/bin/pytorch-base || { echo "/usr/bin/pytorch-base not found"; exit 1; }

# Default model, overridden by --model.
model="llama"

# Record the third whitespace-separated field of the version output.
pytorch --version | awk '{print $3}' >pytorch-version.txt

# Copy every METADATA file found under site-packages into the run directory.
# This stays best-effort: a missing site-packages directory is skipped rather
# than letting find/cpio run from the wrong directory.
pwd=`/bin/pwd`
if pushd /opt/app-root/lib/python3.11/site-packages; then
    find . -name METADATA | cpio -pdumv "$pwd/instructlab-site-packages-metadata"
    popd
fi

longopts=""
longopts+=" model:"

opts=$(getopt -q -o "" --longoptions "$longopts" -n "getopt.sh" -- "$@");
if [ $? -ne 0 ]; then
    printf -- "\tUnrecognized option specified\n\n"
    exit 1
fi
eval set -- "$opts";
while true; do
    arg=$1; shift
    case "$arg" in
        --model)
            # Only options that take a value consume a second word.
            model="$1"; shift
            ;;
        --)
            break
            ;;
        *)
            echo "Invalid option: [$arg]"
            exit 1
            ;;
    esac
done


echo "model: $model"
git clone https://github.com/pytorch/benchmark.git
# Everything below must run inside the checkout; stop if it is not there.
pushd benchmark || { echo "ERROR: could not change to the benchmark directory"; exit 1; }
pip uninstall -y nvidia-ml-py
pip install pynvml
python run_benchmark.py --help
date +%s >begin.txt
python run_benchmark.py test_bench -m "$model" >run_benchmark_output.txt
# Capture the benchmark's exit status before any other command overwrites $?.
rc=$?
date +%s >end.txt
popd
exit $rc
6 changes: 6 additions & 0 deletions pytorch-get-runtime
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
# vim: autoindent tabstop=4 shiftwidth=4 expandtab softtabstop=4 filetype=bash
# -*- mode: sh; indent-tabs-mode: nil; sh-basic-offset: 4 -*-

# Reporting -1 requests an unbounded runtime.
printf '%s\n' "-1"
134 changes: 134 additions & 0 deletions pytorch-post-process
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
#!/usr/bin/env python3
# -*- mode: python; indent-tabs-mode: nil; python-indent-level: 4 -*-
# vim: autoindent tabstop=4 shiftwidth=4 expandtab softtabstop=4 filetype=python

import sys
import os
import lzma
import re
import copy
import math
import json
import yaml
import argparse
import glob
from datetime import datetime
from pathlib import Path

# The metric helpers imported below live in <TOOLBOX_HOME>/python, so that
# directory has to be added to sys.path before the import can succeed.
TOOLBOX_HOME = os.environ.get('TOOLBOX_HOME')
if TOOLBOX_HOME is None:
    for hint in (
        "This script requires libraries that are provided by the toolbox project.",
        "Toolbox can be acquired from https://github.com/perftool-incubator/toolbox and",
        "then use 'export TOOLBOX_HOME=/path/to/toolbox' so that it can be located.",
    ):
        print(hint)
    exit(1)
else:
    p = Path(TOOLBOX_HOME) / 'python'
    # Exit code 2 marks a TOOLBOX_HOME without a usable python directory.
    if not (p.exists() and p.is_dir()):
        print("ERROR: <TOOLBOX_HOME>/python ('%s') does not exist!" % (p))
        exit(2)
    sys.path.append(str(p))
from toolbox.metrics import finish_samples, log_sample

params = {}

class t_global:
    """Holder for state shared between the functions of this script."""

    # Parsed command-line arguments; None until process_options() assigns it.
    args = None

def process_options():
    """Parse the command line and store the result in ``t_global.args``.

    Arguments the parser does not recognise are collected separately by
    ``parse_known_args`` and then dropped.

    Returns:
        An empty tuple.
    """
    cli = argparse.ArgumentParser(
        description='Post process raw benchmark data into Common Data Model output'
    )
    cli.add_argument('--model', dest='model', help='', default="llama")

    known, _unrecognised = cli.parse_known_args()
    t_global.args = known

    return ()

def main():
    """Turn the raw training-metrics file into Common Data Model samples.

    Every JSON record in the metrics file that carries an ``epoch`` key is
    logged as a throughput sample (its ``overall_throughput`` value, stamped
    with its ``timestamp``).  A second, primary ``measurement`` period then
    records the seconds elapsed between the first and last sample.  The
    resulting description is written to ``post-process-data.json``.

    Returns:
        0 on success; 1 when no model was given, the metrics file cannot be
        opened, or the file contains no usable samples.
    """
    process_options()
    # The parser only defines --model; the previously tested 'workflow'
    # attribute does not exist and raised AttributeError.
    if not t_global.args.model:
        print('model was not defined, exiting')
        return 1

    # In any benchmark post-process script, the metrics generated need to be
    # attributed to a time-period (AKA benchmark-phase).  The period which is
    # used to report an official result for the benchmark is the 'measurement'
    # period.  Other periods that may exist could be "warm-up", "prep", etc.
    iter_sample = {
        'primary-period': "measurement",
        'benchmark': "pytorch",
        'periods': [],
        'rickshaw-bench-metric': {'schema': {'version': '2021.04.12'}},
    }

    # NOTE(review): 'phase' was referenced without ever being defined
    # (NameError).  A single phase numbered 1 is assumed here -- confirm
    # against the directory layout the client actually produces.
    phase = 1
    phase_name = 'phase' + str(phase)

    period = {'name': phase_name, 'metric-files': []}
    file_id = phase_name
    desc = {'source': 'pytorch', 'class': 'throughput', 'type': 'train-samples-sec'}
    names = {}
    filename = 'e2e/' + phase_name + '/checkpoints/training_params_and_metrics_global0.jsonl.xz'
    print('Opening ' + filename)

    first_ts = None
    last_ts = None
    try:
        # The file is xz-compressed, so it must be read through lzma; a plain
        # open() would hand compressed bytes to the JSON parser.
        file = lzma.open(filename, 'rt')
    except OSError as err:
        print('ERROR: could not open ' + filename + ': ' + str(err))
        return 1
    with file:
        for line in file:
            if not line.strip():
                continue
            d = json.loads(line)
            # file contents to parse (per line):
            # {"epoch": 0, "step": 1, "rank": 0,
            #  "loss": 0.18146394193172455,
            #  "overall_throughput": 3.5244029279710176,
            #  "lr": 0.0, "cuda_mem_allocated": 14.08400821685791,
            #  "cuda_malloc_retries": 0,
            #  "num_loss_counted_tokens": 4940, "batch_size": 14,
            #  "total_loss": 0.4069821238517761, "gradnorm": null,
            #  "weight_norm": 557.9681396484375,
            #  "timestamp": "2024-07-18T22:46:41.628932"}
            if 'epoch' not in d:
                continue
            # Explicit %H:%M:%S instead of the locale-dependent %X.
            dt = datetime.strptime(d['timestamp'], '%Y-%m-%dT%H:%M:%S.%f')
            ts = math.floor(dt.timestamp() * 1000)  # milliseconds
            if first_ts is None:
                first_ts = ts
            last_ts = ts
            log_sample(file_id, desc, names, {'end': ts, 'value': d['overall_throughput']})

    if first_ts is None:
        # Without samples there is no time span to report; previously this
        # surfaced as a TypeError on 'last_ts - first_ts'.
        print('ERROR: no samples found in ' + filename)
        return 1

    metric_file_name = finish_samples()
    period['metric-files'].append(metric_file_name)
    iter_sample['periods'].append(period)

    # Now create the primary metric and the primary-period
    iter_sample['primary-metric'] = 'actual-train-seconds'
    period = {'name': 'measurement', 'metric-files': []}
    file_id = 'measurement'
    desc = {'source': 'pytorch', 'class': 'count', 'type': 'actual-train-seconds'}
    names = {}
    sample = {'begin': first_ts, 'end': last_ts, 'value': (last_ts - first_ts) / 1000}
    log_sample(file_id, desc, names, sample)
    # finish_samples() and the period append happen exactly once here; the
    # earlier duplicate added the measurement period to the output twice.
    metric_file_name = finish_samples()
    period['metric-files'].append(metric_file_name)
    iter_sample['periods'].append(period)

    # The context manager guarantees the file is flushed and closed.
    with open('post-process-data.json', 'w') as f:
        f.write(json.dumps(iter_sample))
    return 0


if __name__ == "__main__":
    sys.exit(main())
27 changes: 27 additions & 0 deletions rickshaw.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{
"rickshaw-benchmark": {
"schema": { "version": "2020.05.18" }
},
"benchmark": "pytorch",
"controller" : {
"post-script" : "%bench-dir%/pytorch-post-process"
},
"client" : {
"files-from-controller": [
{
"src": "%bench-dir%/pytorch-get-runtime",
"dest": "/usr/bin/"
},
{
"src": "%bench-dir%/pytorch-base",
"dest": "/usr/bin/"
},
{
"src": "%bench-dir%/pytorch-client",
"dest": "/usr/bin/"
}
],
"start": "pytorch-client",
"runtime": "pytorch-get-runtime"
}
}
18 changes: 18 additions & 0 deletions workshop.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"workshop": {
"schema": {
"version": "2020.03.02"
}
},
"userenvs": [
{
"name": "default",
"requirements": []
},
{
"name": "rhel-ai",
"requirements": []
}
],
"requirements": []
}