diff --git a/README.md b/README.md
index 47e639a..9e8cb38 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,4 @@
 # bench-ilab
 benchmark automation for InstructLab
+
+This project works with crucible automation to run [training](https://github.com/instructlab/training) workloads for InstructLab. See the run-ilab.json file for example usage; it is used with `crucible run --from-file run-ilab.json` on your host.
diff --git a/ilab-client b/ilab-client
new file mode 100755
index 0000000..4121760
--- /dev/null
+++ b/ilab-client
@@ -0,0 +1,118 @@
+#!/bin/bash
+# -*- mode: sh; indent-tabs-mode: nil; sh-basic-offset: 4 -*-
+# vim: autoindent tabstop=4 shiftwidth=4 expandtab softtabstop=4 filetype=bash
+exec >ilab-client-stderrout.txt
+exec 2>&1
+
+nnodes=1
+nproc_per_node=1
+num_epochs=1
+log_level="INFO"
+effective_batch_size=""
+cpu_offload_optimizer=""
+cpu_offload_pin_memory=""
+num_runavg_samples=2
+
+opts=$(getopt -q -o "" --longoptions "model:,data-path:,nnodes:,nproc-per-node:,num-epochs:,effective-batch-size:,cpu-offload-optimizer:,cpu-offload-pin-memory:,num-runavg-samples:" -n "getopt.sh" -- "$@");
+if [ $? -ne 0 ]; then
+    printf -- "\tUnrecognized option specified\n\n"
+    exit 1
+fi
+eval set -- "$opts";
+while true; do
+    arg=$1; shift
+    val=$1; shift
+    case "$arg" in
+        --model)
+            model=$val
+            ;;
+        --data-path)
+            data_path=$val
+            ;;
+        --nnodes)
+            nnodes=$val
+            ;;
+        --nproc-per-node)
+            nproc_per_node=$val
+            ;;
+        --num-epochs)
+            num_epochs=$val
+            ;;
+        --effective-batch-size)
+            effective_batch_size="--effective_batch_size $val"
+            ;;
+        --cpu-offload-optimizer)
+            if [ "$val" == "1" ]; then
+                cpu_offload_optimizer="--cpu-offload-optimizer"
+            fi
+            ;;
+        --cpu-offload-pin-memory)
+            if [ "$val" == "1" ]; then
+                cpu_offload_pin_memory="--cpu-offload-pin-memory"
+            fi
+            ;;
+        --num-runavg-samples)
+            num_runavg_samples=$val
+            ;;
+        --)
+            break
+            ;;
+        *)
+            echo "Invalid option: $arg"
+            exit 1
+    esac
+done
+
+
+
+# usage: entrypoint.py [-h] --model --data-path --nnodes --nproc-per-node --ckpt-output-dir --num-epochs NUM_EPOCHS --dolomite --effective-batch-size --max-batch-len --data-output-dir --cpu-offload-optimizer --cpu-offload-pin-memory --cpu-offload-optimizer-ratio
+train_cmd="python3.11 /instructlab/entrypoint.py\
+    --model $model\
+    --data-path $data_path\
+    --nnodes=$nnodes\
+    --nproc-per-node=$nproc_per_node\
+    --num-epochs=$num_epochs\
+    --ckpt-output-dir .\
+    $effective_batch_size\
+    $cpu_offload_optimizer\
+    $cpu_offload_pin_memory"
+
+# Example of the fully expanded command:
+# python3.11 /instructlab/entrypoint.py --model /home/models/granite-7b-lab/ --data-path /home/data/jun12-phase05.jsonl --nnodes 1 --nproc-per-node 2 --num-epochs 1 --cpu-offload-optimizer --cpu-offload-pin-memory --ckpt-output-dir .
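+
+# What follows: list the mounted model and data directories for reference,
+# echo the assembled training command, then run it and watch its output for
+# "RunningAvgSamplesPerSec" lines. Once $num_runavg_samples of them have been
+# seen, the run is considered complete and the reader loop below exits.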
+
+echo "/home/models:"
+ls -la /home/models
+echo "/home/data:"
+ls -la /home/data
+
+
+
+echo "train cmd:"
+echo "$train_cmd"
+
+count=0
+rc=0
+echo "Training:"
+$train_cmd 2>&1 |
+{
+    while read -r line; do
+        echo "$line" >>train.txt
+        if echo "$line" | grep -q RunningAvgSamplesPerSec; then
+            ((count++))
+            echo "found line with RunningAvgSamplesPerSec: [$line]"
+        fi
+        if [ $count -ge $num_runavg_samples ]; then
+            break
+        fi
+    done
+    echo "count: $count"
+    if [ $count -lt $num_runavg_samples ]; then
+        echo "did not get the required number of running-average samples, exiting with error"
+        exit 1
+    else
+        echo "Exiting without error"
+        exit 0
+    fi
+}
+exit $?
diff --git a/ilab-get-runtime b/ilab-get-runtime
new file mode 100755
index 0000000..8aadf7b
--- /dev/null
+++ b/ilab-get-runtime
@@ -0,0 +1,6 @@
+#!/bin/bash
+# vim: autoindent tabstop=4 shiftwidth=4 expandtab softtabstop=4 filetype=bash
+# -*- mode: sh; indent-tabs-mode: nil; sh-basic-offset: 4 -*-
+
+# request unbounded runtime by returning -1
+echo "-1"
diff --git a/ilab-post-process b/ilab-post-process
new file mode 100755
index 0000000..9f3f064
--- /dev/null
+++ b/ilab-post-process
@@ -0,0 +1,62 @@
+#!/usr/bin/perl
+## -*- mode: perl; indent-tabs-mode: nil; perl-indent-level: 4 -*-
+## vim: autoindent tabstop=4 shiftwidth=4 expandtab softtabstop=4 filetype=perl
+
+use strict;
+use warnings;
+use JSON::XS;
+use Data::Dumper;
+use Time::Piece;
+BEGIN {
+    if (!(exists $ENV{'TOOLBOX_HOME'} && -d "$ENV{'TOOLBOX_HOME'}/perl")) {
+        print "This script requires libraries that are provided by the toolbox project.\n";
+        print "Toolbox can be acquired from https://github.com/perftool-incubator/toolbox and\n";
+        print "then use 'export TOOLBOX_HOME=/path/to/toolbox' so that it can be located.\n";
+        exit 1;
+    }
+}
+use lib "$ENV{'TOOLBOX_HOME'}/perl";
+use toolbox::json;
+use toolbox::metrics;
+
+my $coder = JSON::XS->new;
+my $log_file = "training_params_and_metrics_global0.jsonl";
+my @logfile_metrics;
+my %names = ();
+my %desc = ('source' => 'ilab', 'class' => 'throughput', 'type' => 'train-samples-sec');
+(my $rc, my $log_fh) = open_read_text_file($log_file);
+# file contents to parse:
+#{"epoch": 0, "step": 1, "rank": 0, "loss": 0.18146394193172455, "overall_throughput": 3.5244029279710176, "lr": 0.0, "cuda_mem_allocated": 14.08400821685791, "cuda_malloc_retries": 0, "num_loss_counted_tokens": 4940, "batch_size": 14, "total_loss": 0.4069821238517761, "gradnorm": null, "weight_norm": 557.9681396484375, "timestamp": "2024-07-18T22:46:41.628932"}
+while (<$log_fh>) {
+    my $json_ref;
+    $json_ref = $coder->decode($_);
+    # Strptime does not recognize microseconds, so we split the timestamp into two sections
+    if ( exists $$json_ref{"epoch"} and $$json_ref{"timestamp"} =~ /([^\.]*)\.(\d+)/ ) {
+        my $timestamp = $1;
+        my $msec = $2 / 1000;
+        my $epoch = Time::Piece->strptime($timestamp, '%Y-%m-%dT%T')->epoch;
+        my $epoch_ms = $epoch * 1000 + $msec;
+        my %s = ('end' => int $epoch_ms, 'value' => $$json_ref{"overall_throughput"});
+        log_sample("0", \%desc, \%names, \%s);
+    }
+}
+close($log_fh);
+my $metric_data_name = finish_samples();
+
+# In any benchmark post-process script, the metrics generated need to be attributed to a
+# time-period (AKA benchmark-phase). The period which is used to report an official
+# result for the benchmark is the 'measurement' period. Other periods that may exist
+# could be "warm-up", "prep", etc.
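+
+# The remainder of this script assembles the sample description that is written
+# to post-process-data.json. Based on the fields set below, the output should
+# look roughly like this (the metric-file name is illustrative):
+# {
+#   "primary-metric": "train-samples-sec",
+#   "primary-period": "measurement",
+#   "benchmark": "ilab",
+#   "periods": [ { "name": "measurement", "metric-files": [ "<metric-data-name>" ] } ],
+#   "rickshaw-bench-metric": { "schema": { "version": "2021.04.12" } }
+# }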
+
+my %sample;
+my @periods;
+my %period = ('name' => 'measurement');
+my @metric_files = ( $metric_data_name );
+$period{'metric-files'} = \@metric_files;
+push(@periods, \%period);
+$sample{'primary-metric'} = "train-samples-sec";
+$sample{'primary-period'} = "measurement";
+$sample{'benchmark'} = "ilab";
+$sample{'periods'} = \@periods;
+$sample{'rickshaw-bench-metric'}{'schema'}{'version'} = "2021.04.12";
+open(JSON_FH, ">post-process-data.json") || die("Could not open file post-process-data.json for writing\n");
+print JSON_FH $coder->encode(\%sample);
+close(JSON_FH);
diff --git a/multiplex.json b/multiplex.json
new file mode 100644
index 0000000..7796bb1
--- /dev/null
+++ b/multiplex.json
@@ -0,0 +1,23 @@
+{
+    "presets": {
+    },
+    "validations": {
+        "generic_string" : {
+            "description" : "all types of strings",
+            "args" : [
+                "model", "data-path"
+            ],
+            "vals" : ".+"
+        },
+        "integer_ge_zero" : {
+            "description" : "a whole number >= 1",
+            "args" : [ "nnodes", "nproc-per-node", "num-epochs", "effective-batch-size", "num-runavg-samples" ],
+            "vals" : "[1-9][0-9]*"
+        },
+        "bool_0_1" : {
+            "description" : "boolean as 0 (false) or 1 (true)",
+            "args" : [ "cpu-offload-optimizer", "cpu-offload-pin-memory" ],
+            "vals" : "[0-1]"
+        }
+    }
+}
diff --git a/rickshaw.json b/rickshaw.json
new file mode 100644
index 0000000..2ebd1d5
--- /dev/null
+++ b/rickshaw.json
@@ -0,0 +1,23 @@
+{
+    "rickshaw-benchmark": {
+        "schema": { "version": "2020.05.18" }
+    },
+    "benchmark": "ilab",
+    "controller" : {
+        "post-script" : "%bench-dir%/ilab-post-process"
+    },
+    "client" : {
+        "files-from-controller": [
+            {
+                "src": "%bench-dir%/ilab-get-runtime",
+                "dest": "/usr/bin/"
+            },
+            {
+                "src": "%bench-dir%/ilab-client",
+                "dest": "/usr/bin/"
+            }
+        ],
+        "start": "ilab-client",
+        "runtime": "ilab-get-runtime"
+    }
+}
diff --git a/run-ilab.json b/run-ilab.json
new file mode 100644
index 0000000..8d9beec
--- /dev/null
+++ b/run-ilab.json
@@ -0,0 +1,65 @@
+{
+    "tags": {
+        "topology": "none"
+    },
+    "endpoints": [
+        {
+            "type": "remotehosts",
+            "remotes": [
+                {
+                    "engines": [ { "role": "client", "ids": "1" } ],
+                    "config": {
+                        "host": "localhost",
+                        "settings": {
+                            "controller-ip-address": "10.26.8.21",
+                            "userenv": "rhel-ai",
+                            "osruntime": "podman",
+                            "podman-settings": {
+                                "device": "nvidia.com/gpu=all",
+                                "shm-size": "10.00gb"
+                            },
+                            "host-mounts": [
+                                { "src": "/home", "dest": "/home" }
+                            ],
+                            "cpu-partitioning": false
+                        }
+                    }
+                }
+            ]
+        }
+    ],
+    "run-params": { "num-samples": 1, "max-sample-failures": 1, "test-order": "r" },
+    "tool-params": [
+        { "tool": "sysstat",
+          "params": [
+              { "arg": "subtools", "val": "mpstat,sar,iostat" },
+              { "arg": "interval", "val": "15" }
+          ]
+        },
+        { "tool": "procstat",
+          "params": [
+              { "arg": "interval", "val": "15" }
+          ]
+        }
+    ],
+    "benchmarks": [
+        {
+            "name": "ilab",
+            "ids": "1",
+            "mv-params": {
+                "sets": [
+                    {
+                        "params": [
+                            { "arg": "cpu-offload-optimizer", "vals": [ "1" ] },
+                            { "arg": "cpu-offload-pin-memory", "vals": [ "1" ] },
+                            { "arg": "nnodes", "vals": [ "1" ] },
+                            { "arg": "model", "vals": [ "/home/models/granite-7b-lab/" ] },
+                            { "arg": "data-path", "vals": [ "/home/data/jun12-phase05.jsonl" ] },
+                            { "arg": "num-runavg-samples", "vals": [ "8" ] },
+                            { "arg": "nproc-per-node", "vals": [ "2" ] }
+                        ]
+                    }
+                ]
+            }
+        }
+    ]
+}
diff --git a/workshop.json b/workshop.json
new file mode 100644
index 0000000..a319d35
--- /dev/null
+++ b/workshop.json
@@ -0,0 +1,64 @@
+{
+    "workshop": {
+        "schema": {
+            "version": "2020.03.02"
+        }
+    },
+    "userenvs": [
+        {
+            "name": "rhel-ai",
+            "requirements": []
+        },
+        {
+            "name": "cuda-rhubi9",
+            "requirements": [
+                "python-pip-packages",
+                "os-pkgs",
+                "instructlab"
+            ]
+        }
+    ],
+    "requirements": [
+        {
+            "name": "python-pip-packages",
+            "type": "python311",
+            "python3_info": {
+                "packages": [
+                    "torch==2.3.0 --index-url https://download.pytorch.org/whl/cu121",
+                    "packaging wheel",
+                    "flash-attn==2.5.9.post1",
+                    "deepspeed==0.14.2",
+                    "transformers==4.40.1",
+                    "ipdb",
+                    "jupyterlab",
+                    "gpustat",
+                    "matplotlib",
+                    "hydra-core",
+                    "datasets",
+                    "rich",
+                    "numba",
+                    "'numpy<2.0.0'"
+                ]
+            }
+        },
+        {
+            "name": "os-pkgs",
+            "type": "distro",
+            "distro_info": {
+                "packages": [
+                    "libnccl",
+                    "libaio-devel"
+                ]
+            }
+        },
+        {
+            "name": "instructlab",
+            "type": "manual",
+            "manual_info": {
+                "commands": [
+                    "git clone https://github.com/instructlab/training.git /var/run/training",
+                    "pushd /var/run/training; python3.11 -m pip install ."
+                ]
+            }
+        }
+    ]
+}