Skip to content

Commit

Permalink
Initial code, for training only
Browse files Browse the repository at this point in the history
  • Loading branch information
atheurer committed Jul 23, 2024
1 parent 9852331 commit 2701790
Show file tree
Hide file tree
Showing 8 changed files with 363 additions and 0 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
# bench-ilab
benchmark automation for InstructLab

This project works with crucible automation to run the [training] workload for InstructLab. Please see the run-ilab.json file for example usage. This file is used with "crucible run --from-file run-ilab.json" on your host.
118 changes: 118 additions & 0 deletions ilab-client
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
#!/bin/bash
# -*- mode: sh; indent-tabs-mode: nil; sh-basic-offset: 4 -*-
# vim: autoindent tabstop=4 shiftwidth=4 expandtab softtabstop=4 filetype=bash
# Send everything this script prints (stdout and stderr) to one log file.
exec >ilab-client-stderrout.txt 2>&1

# Defaults; each one may be overridden by the matching long option below.
nnodes=1
nproc_per_node=1
num_epochs=1
log_level="INFO"
# The next three hold either nothing or complete, ready-to-append option text
# for the training command.
effective_batch_size=""
cpu_offload_optimizer=""
cpu_offload_pin_memory=""
# Number of "RunningAvgSamplesPerSec" log lines to collect before stopping.
num_runavg_samples=2

longopts="model:,data-path:,nnodes:,nproc-per-node:,num-epochs:,effective-batch-size:,cpu-offload-optimizer:,cpu-offload-pin-memory:,num-runavg-samples:"
opts=$(getopt -q -o "" --longoptions "$longopts" -n "getopt.sh" -- "$@")
if [ $? -ne 0 ]; then
    printf -- "\tUnrecognized option specified\n\n"
    exit 1
fi
eval set -- "$opts"

# Every supported option takes exactly one value, so each branch consumes two
# words.  Shifting inside the branches (instead of up front) keeps "$1" intact
# for the error message and avoids shifting past the end of the list at "--".
while true; do
    case "$1" in
        --model)
            model=$2
            shift 2
            ;;
        --data-path)
            data_path=$2
            shift 2
            ;;
        --nnodes)
            nnodes=$2
            shift 2
            ;;
        --nproc-per-node)
            nproc_per_node=$2
            shift 2
            ;;
        --num-epochs)
            num_epochs=$2
            shift 2
            ;;
        --effective-batch-size)
            # entrypoint.py's usage text spells this option with hyphens.
            effective_batch_size="--effective-batch-size $2"
            shift 2
            ;;
        --cpu-offload-optimizer)
            # Boolean passed as 0/1; only "1" turns the flag on.
            if [ "$2" = "1" ]; then
                cpu_offload_optimizer="--cpu-offload-optimizer"
            fi
            shift 2
            ;;
        --cpu-offload-pin-memory)
            if [ "$2" = "1" ]; then
                cpu_offload_pin_memory="--cpu-offload-pin-memory"
            fi
            shift 2
            ;;
        --num-runavg-samples)
            num_runavg_samples=$2
            shift 2
            ;;
        --)
            shift
            break
            ;;
        *)
            echo "Invalid option: $1"
            exit 1
            ;;
    esac
done

# The training command cannot be built without these two; fail early with a
# clear message instead of letting the training program reject its arguments.
if [ -z "${model:-}" ] || [ -z "${data_path:-}" ]; then
    echo "Both --model and --data-path must be specified"
    exit 1
fi



# usage: entrypoint.py [-h] --model --data-path --nnodes --nproc-per-node --ckpt-output-dir --num-epochs NUM_EPOCHS --dolomite --effective-batch-size --max-batch-len --data-output-dir --cpu-offload-optimizer --cpu-offload-pin-memory --cpu-offload-optimizer-ratio
#
# Example of a resulting command:
# python3.11 /instructlab/entrypoint.py --model /home/models/granite-7b-lab/ --data-path /home/data/jun12-phase05.jsonl --nnodes 1 --nproc-per-node 2 --num-epochs 1 --cpu-offload-optimizer --cpu-offload-pin-memory --ckpt-output-dir .
#
# The command is kept in an array so that every argument stays a separate
# word regardless of line layout, and so that paths containing whitespace
# survive intact.
train_cmd=(
    python3.11 /instructlab/entrypoint.py
    --model "$model"
    --data-path "$data_path"
    --nnodes="$nnodes"
    --nproc-per-node="$nproc_per_node"
    --num-epochs="$num_epochs"
    --ckpt-output-dir .
    # These three are either empty or hold complete option text, so they are
    # deliberately unquoted: the shell splits them into words or drops them.
    $effective_batch_size
    $cpu_offload_optimizer
    $cpu_offload_pin_memory
)

# Record what is visible in the model and data directories for later debugging.
echo "/home/models:"
ls -la /home/models
echo "/home/data:"
ls -la /home/data

echo "train cmd:"
echo "${train_cmd[*]}"

echo "Training:"
# Stream the training output, keep a verbatim copy in train.txt, and stop
# reading once enough "RunningAvgSamplesPerSec" lines have been seen.  The
# brace group is the last element of the pipeline, so its exit status is what
# "exit $?" below reports.
"${train_cmd[@]}" 2>&1 |
{
    count=0
    # IFS= and -r keep leading whitespace and backslashes in each line intact.
    while IFS= read -r line; do
        printf '%s\n' "$line" >>train.txt
        if [[ $line == *RunningAvgSamplesPerSec* ]]; then
            count=$((count + 1))
            echo "found line with RunningAvgSamplesPerSec: [$line]"
        fi
        if [ "$count" -ge "$num_runavg_samples" ]; then
            # Enough samples collected; stop consuming the training output.
            break
        fi
    done
    echo "count: $count"
    if [ "$count" -lt "$num_runavg_samples" ]; then
        echo "did not get the number of running avg samples, so exiting error"
        exit 1
    else
        echo "Exiting without error"
        exit 0
    fi
}
exit $?
6 changes: 6 additions & 0 deletions ilab-get-runtime
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
# vim: autoindent tabstop=4 shiftwidth=4 expandtab softtabstop=4 filetype=bash
# -*- mode: sh; indent-tabs-mode: nil; sh-basic-offset: 4 -*-

# Report a runtime of -1, which requests an unbounded runtime.
printf '%s\n' "-1"
62 changes: 62 additions & 0 deletions ilab-post-process
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/usr/bin/perl
## -*- mode: perl; indent-tabs-mode: nil; perl-indent-level: 4 -*-
## vim: autoindent tabstop=4 shiftwidth=4 expandtab softtabstop=4 filetype=perl

use strict;
use warnings;
use JSON::XS;
use Data::Dumper;
use Time::Piece;
BEGIN {
if (!(exists $ENV{'TOOLBOX_HOME'} && -d "$ENV{'TOOLBOX_HOME'}/perl")) {
print "This script requires libraries that are provided by the toolbox project.\n";
print "Toolbox can be acquired from https://github.com/perftool-incubator/toolbox and\n";
print "then use 'export TOOLBOX_HOME=/path/to/toolbox' so that it can be located.\n";
exit 1;
}
}
use lib "$ENV{'TOOLBOX_HOME'}/perl";
use toolbox::json;
use toolbox::metrics;

# Convert the per-step training log into a throughput metric and write the
# post-process-data.json summary that describes the 'measurement' period.
my $coder = JSON::XS->new;
my $log_file = "training_params_and_metrics_global0.jsonl";
my %names = ();
my %desc = ('source' => 'ilab', 'class' => 'throughput', 'type' => 'train-samples-sec');
(my $rc, my $log_fh) = open_read_text_file($log_file);
# Without a usable handle there is nothing to parse; stop with a clear message
# rather than producing an empty result.
if (!defined $log_fh) {
    die("Could not open file $log_file for reading (rc=" . (defined $rc ? $rc : 'undef') . ")\n");
}
# file contents to parse:
#{"epoch": 0, "step": 1, "rank": 0, "loss": 0.18146394193172455, "overall_throughput": 3.5244029279710176, "lr": 0.0, "cuda_mem_allocated": 14.08400821685791, "cuda_malloc_retries": 0, "num_loss_counted_tokens": 4940, "batch_size": 14, "total_loss": 0.4069821238517761, "gradnorm": null, "weight_norm": 557.9681396484375, "timestamp": "2024-07-18T22:46:41.628932"}
LINE: while (my $line = <$log_fh>) {
    next LINE if $line =~ /^\s*$/;
    # A blank or partially written line must not abort the whole post-process.
    # NOTE(review): the final line may be truncated if training is stopped
    # while it is writing the log -- confirm against the client behavior.
    my $json_ref = eval { $coder->decode($line) };
    if (!defined $json_ref) {
        warn "Skipping unparsable line $. of $log_file\n";
        next LINE;
    }
    # Only per-step records carry the fields needed for a sample.
    next LINE unless ref($json_ref) eq 'HASH'
        and exists $$json_ref{"epoch"}
        and defined $$json_ref{"timestamp"}
        and defined $$json_ref{"overall_throughput"};
    # Strptime does not recognize fractional seconds, so the timestamp is split
    # in two sections.  The fraction is optional and may have any number of digits.
    if ($$json_ref{"timestamp"} =~ /^(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})(?:\.(\d+))?/) {
        my $timestamp = $1;
        my $fraction = defined $2 ? $2 : "0";
        my $msec = "0.$fraction" * 1000;
        # NOTE(review): Time::Piece->strptime interprets the string as UTC; confirm
        # that the training log writes UTC timestamps.
        my $epoch = Time::Piece->strptime($timestamp, '%Y-%m-%dT%T')->epoch;
        my $epoch_ms = $epoch * 1000 + $msec;
        my %s = ('end' => int $epoch_ms, 'value' => $$json_ref{"overall_throughput"});
        log_sample("0", \%desc, \%names, \%s);
    }
}
close($log_fh);
my $metric_data_name = finish_samples();

# In any benchmark post-process script, the metrics generated need to be attributed to a
# time-period (AKA benchmark-phase).  The period which is used to report an official
# result for the benchmark is the 'measurement' period.  Other periods that may exist
# could be "warm-up", "prep", etc.
my %sample;
my @periods;
my %period = ('name' => 'measurement');
my @metric_files = ( $metric_data_name );
$period{'metric-files'} = \@metric_files;
push(@periods, \%period);
$sample{'primary-metric'} = "train-samples-sec";
$sample{'primary-period'} = "measurement";
$sample{'benchmark'} = "ilab";
$sample{'periods'} = \@periods;
$sample{'rickshaw-bench-metric'}{'schema'}{'version'} = "2021.04.12";
my $json_file = "post-process-data.json";
open(my $json_fh, '>', $json_file) || die("Could not open file $json_file for writing\n");
print $json_fh $coder->encode(\%sample);
# Buffered write errors only surface at close, so check it.
close($json_fh) || die("Could not finish writing file $json_file: $!\n");
23 changes: 23 additions & 0 deletions multiplex.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
"presets": {
},
"validations": {
"generic_string" : {
"description" : "all types of strings",
"args" : [
"model", "data-path"
],
"vals" : ".+"
},
"integer_ge_zero" : {
"description" : "a whole number >= 1",
"args" : [ "nnodes", "nproc-per-node", "num-epochs", "effective-batch-size", "num-runavg-samples" ],
"vals" : "[1-9][0-9]*"
},
"bool_0_1" : {
"description" : "boolean as 0 (false) or 1 (true)",
"args" : [ "cpu-offload-optimizer", "cpu-offload-pin-memory" ],
"vals" : "[0-1]"
}
}
}
23 changes: 23 additions & 0 deletions rickshaw.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
"rickshaw-benchmark": {
"schema": { "version": "2020.05.18" }
},
"benchmark": "ilab",
"controller" : {
"post-script" : "%bench-dir%/ilab-post-process"
},
"client" : {
"files-from-controller": [
{
"src": "%bench-dir%/ilab-get-runtime",
"dest": "/usr/bin/"
},
{
"src": "%bench-dir%/ilab-client",
"dest": "/usr/bin/"
}
],
"start": "ilab-client",
"runtime": "ilab-get-runtime"
}
}
65 changes: 65 additions & 0 deletions run-ilab.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
{
"tags": {
"topology": "none"
},
"endpoints": [
{
"type": "remotehosts",
"remotes": [
{
"engines": [ { "role": "client", "ids": "1" } ],
"config": {
"host": "localhost",
"settings": {
"controller-ip-address": "10.26.8.21",
"userenv": "rhel-ai",
"osruntime": "podman",
"podman-settings": {
"device": "nvidia.com/gpu=all",
"shm-size": "10.00gb"
},
"host-mounts": [
{ "src": "/home", "dest": "/home" } ],
"cpu-partitioning": false
}
}
}
]
}
],
"run-params": { "num-samples": 1, "max-sample-failures": 1, "test-order": "r" },
"tool-params": [
{ "tool": "sysstat",
"params": [
{ "arg": "subtools", "val": "mpstat,sar,iostat" },
{ "arg": "interval", "val": "15" }
]
},
{ "tool": "procstat",
"params": [
{ "arg": "interval", "val": "15" }
]
}
],
"benchmarks": [
{
"name": "ilab",
"ids": "1",
"mv-params": {
"sets": [
{
"params": [
{ "arg": "cpu-offload-optimizer", "vals": [ "1" ] },
{ "arg": "cpu-offload-pin-memory", "vals": [ "1" ] },
{ "arg": "nnodes", "vals": [ "1" ] },
{ "arg": "model", "vals": [ "/home/models/granite-7b-lab/" ] },
{ "arg": "data-path", "vals": [ "/home/data/jun12-phase05.jsonl" ] },
{ "arg": "num-runavg-samples", "vals": [ "8" ] },
{ "arg": "nproc-per-node", "vals": [ "2" ] }
]
}
]
}
}
]
}
64 changes: 64 additions & 0 deletions workshop.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
{
"workshop": {
"schema": {
"version": "2020.03.02"
}
},
"userenvs": [
{
"name": "rhel-ai",
"requirements": []
},
{
"name": "cuda-rhubi9",
"requirements": [
"python-pip-packages",
"os-pkgs",
"instructlab"
]
}
],
"requirements": [
{
"name": "python-pip-packages",
"type": "python311",
"python3_info": {
"packages": [
"torch==2.3.0 --index-url https://download.pytorch.org/whl/cu121",
"packaging wheel",
"flash-attn==2.5.9.post1",
"deepspeed==0.14.2",
"transformers==4.40.1",
"ipdb",
"jupyterlab",
"gpustat",
"matplotlib",
"hydra-core",
"datasets",
"rich",
"numba",
"'numpy<2.0.0'"
]
}
},
{
"name": "os-pkgs",
"type": "distro",
"distro_info": {
"packages": [
"libnccl",
"libaio-devel"
]
}
},
{ "name": "instructlab",
"type": "manual",
"manual_info": {
"commands": [
"git clone https://github.com/instructlab/training.git /var/run/training",
"pushd /var/run/training; python3.11 -m pip install ."
]
}
}
]
}

0 comments on commit 2701790

Please sign in to comment.