-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #5 from perftool-incubator/code
Initial code, for training only
- Loading branch information
Showing
8 changed files
with
363 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,4 @@ | ||
# bench-ilab | ||
benchmark automation for InstructLab | ||
|
||
This project works with crucible automation to run the [training] workload for InstructLab. Please see the run-ilab.json file for example usage. That file is used with "crucible run --from-file run-ilab.json" on your host. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
#!/bin/bash
# -*- mode: sh; indent-tabs-mode: nil; sh-basic-offset: 4 -*-
# vim: autoindent tabstop=4 shiftwidth=4 expandtab softtabstop=4 filetype=bash
#
# ilab-client: launch the InstructLab training entrypoint and stop as soon as
# the requested number of "RunningAvgSamplesPerSec" lines has been seen in its
# output.
#
# Options (all take a value):
#   --model PATH                    model passed to the entrypoint (required)
#   --data-path PATH                training data passed to the entrypoint (required)
#   --nnodes N                      default 1
#   --nproc-per-node N              default 1
#   --num-epochs N                  default 1
#   --effective-batch-size N        only forwarded when given
#   --cpu-offload-optimizer 0|1     flag forwarded only when the value is 1
#   --cpu-offload-pin-memory 0|1    flag forwarded only when the value is 1
#   --num-runavg-samples N          matching lines to wait for, default 2
#
# Exit status: 0 when at least --num-runavg-samples matching lines were seen,
# 1 otherwise (including option errors).
#
# Example of the resulting command:
#   python3.11 /instructlab/entrypoint.py --model /home/models/granite-7b-lab/ --data-path /home/data/jun12-phase05.jsonl --nnodes 1 --nproc-per-node 2 --num-epochs 1 --cpu-offload-optimizer --cpu-offload-pin-memory --ckpt-output-dir .

# Everything this script prints (stdout and stderr) goes to a single log file.
exec >ilab-client-stderrout.txt
exec 2>&1

# Defaults. Optional entrypoint arguments are kept as arrays so that an unset
# option expands to nothing and a set option keeps its exact words.
model=""
data_path=""
nnodes=1
nproc_per_node=1
num_epochs=1
effective_batch_size=()
cpu_offload_optimizer=()
cpu_offload_pin_memory=()
num_runavg_samples=2

opts=$(getopt -q -o "" --longoptions "model:,data-path:,nnodes:,nproc-per-node:,num-epochs:,effective-batch-size:,cpu-offload-optimizer:,cpu-offload-pin-memory:,num-runavg-samples:" -n "getopt.sh" -- "$@")
if [ $? -ne 0 ]; then
    printf -- "\tUnrecognized option specified\n\n"
    exit 1
fi
eval set -- "$opts"
while true; do
    arg=$1; shift
    # "--" marks the end of the options and carries no value, so stop before
    # consuming anything else.
    if [ "$arg" = "--" ]; then
        break
    fi
    val=$1; shift
    case "$arg" in
        --model)
            model=$val
            ;;
        --data-path)
            data_path=$val
            ;;
        --nnodes)
            nnodes=$val
            ;;
        --nproc-per-node)
            nproc_per_node=$val
            ;;
        --num-epochs)
            num_epochs=$val
            ;;
        --effective-batch-size)
            # Spelled with dashes, as listed in the entrypoint usage.
            effective_batch_size=(--effective-batch-size "$val")
            ;;
        --cpu-offload-optimizer)
            if [ "$val" = "1" ]; then
                cpu_offload_optimizer=(--cpu-offload-optimizer)
            fi
            ;;
        --cpu-offload-pin-memory)
            if [ "$val" = "1" ]; then
                cpu_offload_pin_memory=(--cpu-offload-pin-memory)
            fi
            ;;
        --num-runavg-samples)
            num_runavg_samples=$val
            ;;
        *)
            # Report the option that was actually rejected; "$1" has already
            # been shifted past it.
            echo "Invalid option: $arg"
            exit 1
            ;;
    esac
done

if [ -z "$model" ] || [ -z "$data_path" ]; then
    echo "Both --model and --data-path must be specified"
    exit 1
fi

# usage: entrypoint.py [-h] --model --data-path --nnodes --nproc-per-node --ckpt-output-dir --num-epochs NUM_EPOCHS --dolomite --effective-batch-size --max-batch-len --data-output-dir --cpu-offload-optimizer --cpu-offload-pin-memory --cpu-offload-optimizer-ratio
#
# The command is an array, not a string, so paths containing whitespace reach
# the entrypoint as single arguments and no line-continuation is involved.
train_cmd=(
    python3.11 /instructlab/entrypoint.py
    --model "$model"
    --data-path "$data_path"
    "--nnodes=$nnodes"
    "--nproc-per-node=$nproc_per_node"
    "--num-epochs=$num_epochs"
    --ckpt-output-dir .
    "${effective_batch_size[@]}"
    "${cpu_offload_optimizer[@]}"
    "${cpu_offload_pin_memory[@]}"
)

# Record what is visible under the model and data directories for debugging.
echo "/home/models:"
ls -la /home/models
echo "/home/data:"
ls -la /home/data

echo "train cmd:"
echo "${train_cmd[*]}"

count=0
echo "Training:"
# The brace group is the last stage of the pipeline, so it runs in a subshell
# and its exit status becomes the pipeline status picked up by "exit $?" below.
"${train_cmd[@]}" 2>&1 |
{
    # IFS= and -r keep each line exactly as the training process printed it;
    # the extra -n test keeps a final line that lacks a trailing newline.
    while IFS= read -r line || [ -n "$line" ]; do
        printf '%s\n' "$line" >>train.txt
        if [[ "$line" == *RunningAvgSamplesPerSec* ]]; then
            count=$((count + 1))
            echo "found line with RunningAvgSamplesPerSec: [$line]"
        fi
        # Enough samples collected: stop reading, which ends the run early.
        if [ "$count" -ge "$num_runavg_samples" ]; then
            break
        fi
    done
    echo "count: $count"
    if [ "$count" -lt "$num_runavg_samples" ]; then
        echo "did not get the number of running avg samples, so exiting error"
        exit 1
    else
        echo "Exiting without error"
        exit 0
    fi
}
exit $?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
#!/bin/bash
# vim: autoindent tabstop=4 shiftwidth=4 expandtab softtabstop=4 filetype=bash
# -*- mode: sh; indent-tabs-mode: nil; sh-basic-offset: 4 -*-

# Print -1 on stdout, which requests an unbounded runtime.
printf '%s\n' "-1"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
#!/usr/bin/perl | ||
## -*- mode: perl; indent-tabs-mode: nil; perl-indent-level: 4 -*- | ||
## vim: autoindent tabstop=4 shiftwidth=4 expandtab softtabstop=4 filetype=perl | ||
|
||
use strict; | ||
use warnings; | ||
use JSON::XS; | ||
use Data::Dumper; | ||
use Time::Piece; | ||
BEGIN { | ||
if (!(exists $ENV{'TOOLBOX_HOME'} && -d "$ENV{'TOOLBOX_HOME'}/perl")) { | ||
print "This script requires libraries that are provided by the toolbox project.\n"; | ||
print "Toolbox can be acquired from https://github.com/perftool-incubator/toolbox and\n"; | ||
print "then use 'export TOOLBOX_HOME=/path/to/toolbox' so that it can be located.\n"; | ||
exit 1; | ||
} | ||
} | ||
use lib "$ENV{'TOOLBOX_HOME'}/perl"; | ||
use toolbox::json; | ||
use toolbox::metrics; | ||
|
||
# Post-process the training log: every JSON line that carries an "epoch" key
# and a usable timestamp becomes one throughput sample, and a summary of the
# measurement period is written to post-process-data.json.

my $coder = JSON::XS->new;
my $log_file = "training_params_and_metrics_global0.jsonl";
my $output_file = "post-process-data.json";
my %names = ();
my %desc = ('source' => 'ilab', 'class' => 'throughput', 'type' => 'train-samples-sec');

# Convert a timestamp such as "2024-07-18T22:46:41.628932" to integer
# milliseconds since the epoch. The fractional part is optional and may have
# any number of digits. Returns nothing when the timestamp is undefined or
# does not have the expected shape.
sub timestamp_to_epoch_ms {
    my ($timestamp) = @_;
    return unless defined $timestamp;
    return unless $timestamp =~ /\A(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})(?:\.(\d+))?/;
    my ($whole_seconds, $fraction) = ($1, $2);
    # strptime does not recognize sub-second digits, so only the part before
    # the dot is handed to it.
    my $epoch = Time::Piece->strptime($whole_seconds, '%Y-%m-%dT%T')->epoch;
    # Normalize the fraction to exactly six digits (microseconds) before
    # scaling it to milliseconds.
    my $usec = defined $fraction ? substr($fraction . "000000", 0, 6) : 0;
    return int($epoch * 1000 + $usec / 1000);
}

(my $rc, my $log_fh) = open_read_text_file($log_file);
if (!defined $log_fh) {
    my $rc_text = defined $rc ? $rc : 'undef';
    die("Could not open file $log_file for reading (rc=$rc_text)\n");
}
# file contents to parse:
#{"epoch": 0, "step": 1, "rank": 0, "loss": 0.18146394193172455, "overall_throughput": 3.5244029279710176, "lr": 0.0, "cuda_mem_allocated": 14.08400821685791, "cuda_malloc_retries": 0, "num_loss_counted_tokens": 4940, "batch_size": 14, "total_loss": 0.4069821238517761, "gradnorm": null, "weight_norm": 557.9681396484375, "timestamp": "2024-07-18T22:46:41.628932"}
LINE: while (my $line = <$log_fh>) {
    next LINE if $line =~ /\A\s*\z/;
    # A single malformed line should not discard the samples around it.
    my $json_ref = eval { $coder->decode($line) };
    if (!defined $json_ref or ref($json_ref) ne 'HASH') {
        warn "Skipping line $. of $log_file: not a JSON object\n";
        next LINE;
    }
    next LINE unless exists $$json_ref{"epoch"};
    next LINE unless defined $$json_ref{"overall_throughput"};
    my $epoch_ms = timestamp_to_epoch_ms($$json_ref{"timestamp"});
    if (!defined $epoch_ms) {
        warn "Skipping line $. of $log_file: missing or unrecognized timestamp\n";
        next LINE;
    }
    my %s = ('end' => $epoch_ms, 'value' => $$json_ref{"overall_throughput"});
    log_sample("0", \%desc, \%names, \%s);
}
close($log_fh);
my $metric_data_name = finish_samples();

# In any benchmark post-process script, the metrics generated need to be attributed to a
# time-period (AKA benchmark-phase).  The period which is used to report an official
# result for the benchmark is the 'measurement' period.  Other periods that may exist
# could be "warm-up", "prep", etc.
my %sample;
my @periods;
my %period = ('name' => 'measurement');
my @metric_files = ( $metric_data_name );
$period{'metric-files'} = \@metric_files;
push(@periods, \%period);
$sample{'primary-metric'} = "train-samples-sec";
$sample{'primary-period'} = "measurement";
$sample{'benchmark'} = "ilab";
$sample{'periods'} = \@periods;
$sample{'rickshaw-bench-metric'}{'schema'}{'version'} = "2021.04.12";
open(my $json_fh, '>', $output_file) or die("Could not open file $output_file for writing: $!\n");
print {$json_fh} $coder->encode(\%sample);
# Buffered write errors only surface at close, so check it.
close($json_fh) or die("Could not finish writing $output_file: $!\n");
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
{ | ||
"presets": { | ||
}, | ||
"validations": { | ||
"generic_string" : { | ||
"description" : "all types of strings", | ||
"args" : [ | ||
"model", "data-path" | ||
], | ||
"vals" : ".+" | ||
}, | ||
"integer_ge_zero" : { | ||
"description" : "a whole number >= 1", | ||
"args" : [ "nnodes", "nproc-per-node", "num-epochs", "effective-batch-size", "num-runavg-samples" ], | ||
"vals" : "[1-9][0-9]*" | ||
}, | ||
"bool_0_1" : { | ||
"description" : "boolean as 0 (false) or 1 (true)", | ||
"args" : [ "cpu-offload-optimizer", "cpu-offload-pin-memory" ], | ||
"vals" : "[0-1]" | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
{ | ||
"rickshaw-benchmark": { | ||
"schema": { "version": "2020.05.18" } | ||
}, | ||
"benchmark": "ilab", | ||
"controller" : { | ||
"post-script" : "%bench-dir%/ilab-post-process" | ||
}, | ||
"client" : { | ||
"files-from-controller": [ | ||
{ | ||
"src": "%bench-dir%/ilab-get-runtime", | ||
"dest": "/usr/bin/" | ||
}, | ||
{ | ||
"src": "%bench-dir%/ilab-client", | ||
"dest": "/usr/bin/" | ||
} | ||
], | ||
"start": "ilab-client", | ||
"runtime": "ilab-get-runtime" | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
{ | ||
"tags": { | ||
"topology": "none" | ||
}, | ||
"endpoints": [ | ||
{ | ||
"type": "remotehosts", | ||
"remotes": [ | ||
{ | ||
"engines": [ { "role": "client", "ids": "1" } ], | ||
"config": { | ||
"host": "localhost", | ||
"settings": { | ||
"controller-ip-address": "10.26.8.21", | ||
"userenv": "rhel-ai", | ||
"osruntime": "podman", | ||
"podman-settings": { | ||
"device": "nvidia.com/gpu=all", | ||
"shm-size": "10.00gb" | ||
}, | ||
"host-mounts": [ | ||
{ "src": "/home", "dest": "/home" } ], | ||
"cpu-partitioning": false | ||
} | ||
} | ||
} | ||
] | ||
} | ||
], | ||
"run-params": { "num-samples": 1, "max-sample-failures": 1, "test-order": "r" }, | ||
"tool-params": [ | ||
{ "tool": "sysstat", | ||
"params": [ | ||
{ "arg": "subtools", "val": "mpstat,sar,iostat" }, | ||
{ "arg": "interval", "val": "15" } | ||
] | ||
}, | ||
{ "tool": "procstat", | ||
"params": [ | ||
{ "arg": "interval", "val": "15" } | ||
] | ||
} | ||
], | ||
"benchmarks": [ | ||
{ | ||
"name": "ilab", | ||
"ids": "1", | ||
"mv-params": { | ||
"sets": [ | ||
{ | ||
"params": [ | ||
{ "arg": "cpu-offload-optimizer", "vals": [ "1" ] }, | ||
{ "arg": "cpu-offload-pin-memory", "vals": [ "1" ] }, | ||
{ "arg": "nnodes", "vals": [ "1" ] }, | ||
{ "arg": "model", "vals": [ "/home/models/granite-7b-lab/" ] }, | ||
{ "arg": "data-path", "vals": [ "/home/data/jun12-phase05.jsonl" ] }, | ||
{ "arg": "num-runavg-samples", "vals": [ "8" ] }, | ||
{ "arg": "nproc-per-node", "vals": [ "2" ] } | ||
] | ||
} | ||
] | ||
} | ||
} | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
{ | ||
"workshop": { | ||
"schema": { | ||
"version": "2020.03.02" | ||
} | ||
}, | ||
"userenvs": [ | ||
{ | ||
"name": "rhel-ai", | ||
"requirements": [] | ||
}, | ||
{ | ||
"name": "cuda-rhubi9", | ||
"requirements": [ | ||
"python-pip-packages", | ||
"os-pkgs", | ||
"instructlab" | ||
] | ||
} | ||
], | ||
"requirements": [ | ||
{ | ||
"name": "python-pip-packages", | ||
"type": "python311", | ||
"python3_info": { | ||
"packages": [ | ||
"torch==2.3.0 --index-url https://download.pytorch.org/whl/cu121", | ||
"packaging wheel", | ||
"flash-attn==2.5.9.post1", | ||
"deepspeed==0.14.2", | ||
"transformers==4.40.1", | ||
"ipdb", | ||
"jupyterlab", | ||
"gpustat", | ||
"matplotlib", | ||
"hydra-core", | ||
"datasets", | ||
"rich", | ||
"numba", | ||
"'numpy<2.0.0'" | ||
] | ||
} | ||
}, | ||
{ | ||
"name": "os-pkgs", | ||
"type": "distro", | ||
"distro_info": { | ||
"packages": [ | ||
"libnccl", | ||
"libaio-devel" | ||
] | ||
} | ||
}, | ||
{ "name": "instructlab", | ||
"type": "manual", | ||
"manual_info": { | ||
"commands": [ | ||
"git clone https://github.com/instructlab/training.git /var/run/training", | ||
"pushd /var/run/training; python3.11 -m pip install ." | ||
] | ||
} | ||
} | ||
] | ||
} |