Merge pull request #5 from perftool-incubator/code

Initial code, for training only
perftool-incubator · Jul 23, 2024 · c098829 · c098829
2 parents c975eaf + 2701790
commit c098829
Show file tree

Hide file tree

Showing 8 changed files with 363 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -1,2 +1,4 @@
 # bench-ilab
 benchmark automation for InstructLab
+
+This project works with crucible automation to run a [training] workload for InstructLab.  Please see the run-ilab.json file for example usage.  This file is used with "crucible run --from-file run-ilab.json" on your host.
diff --git a/ilab-client b/ilab-client
@@ -0,0 +1,118 @@
+#!/bin/bash
+# -*- mode: sh; indent-tabs-mode: nil; sh-basic-offset: 4 -*-
+# vim: autoindent tabstop=4 shiftwidth=4 expandtab softtabstop=4 filetype=bash
+exec >ilab-client-stderrout.txt
+exec 2>&1
+
+nnodes=1
+nproc_per_node=1
+num_epochs=1
+log_level="INFO"
+effective_batch_size=""
+cpu_offload_optimizer=""
+cpu_offload_pin_memory=""
+num_runavg_samples=2
+
+opts=$(getopt -q -o "" --longoptions "model:,data-path:,nnodes:,nproc-per-node:,num-epochs:,effective-batch-size:,cpu-offload-optimizer:,cpu-offload-pin-memory:,num-runavg-samples:" -n "getopt.sh" -- "$@");
+if [ $? -ne 0 ]; then
+    printf -- "\tUnrecognized option specified\n\n"
+    exit 1
+fi
+eval set -- "$opts";
+while true; do
+    arg=$1; shift
+    val=$1; shift
+    case "$arg" in
+        --model)
+	    model=$val
+	    ;;
+        --data-path)
+	    data_path=$val
+	    ;;
+        --nnodes)
+	    nnodes=$val
+	    ;;
+        --nproc-per-node)
+	    nproc_per_node=$val
+	    ;;
+        --num-epochs)
+	    num_epochs=$val
+	    ;;
+	--effective-batch-size)
+	    effective_batch_size="--effective_batch_size $val"
+	    ;;
+        --cpu-offload-optimizer)
+	    if [ $val == "1" ]; then
+                cpu_offload_optimizer="--cpu-offload-optimizer"
+            fi
+            ;;
+        --cpu-offload-pin-memory)
+	    if [ $val == "1" ]; then
+                cpu_offload_pin_memory="--cpu-offload-pin-memory"
+            fi
+	    ;;
+        --num-runavg-samples)
+            num_runavg_samples=$val
+            ;;
+        --)
+            break
+            ;;
+        *)
+            echo "Invalid option: $1"
+            exit 1
+    esac
+done
+
+
+
+# Build the training command as one string.  The optional flag variables are
+# empty when not requested, so they simply disappear from the command.  Each
+# line ends with a backslash so the string holds no embedded newline and is
+# echoed as a single line.
+# usage: entrypoint.py [-h] --model --data-path --nnodes --nproc-per-node --ckpt-output-dir --num-epochs NUM_EPOCHS --dolomite --effective-batch-size --max-batch-len --data-output-dir --cpu-offload-optimizer --cpu-offload-pin-memory --cpu-offload-optimizer-ratio 
+train_cmd="python3.11 /instructlab/entrypoint.py\
+            --model $model\
+            --data-path $data_path\
+            --nnodes=$nnodes\
+            --nproc-per-node=$nproc_per_node\
+            --num-epochs=$num_epochs\
+            --ckpt-output-dir .\
+            $effective_batch_size\
+            $cpu_offload_optimizer\
+            $cpu_offload_pin_memory"
+
+# python3.11 /instructlab/entrypoint.py  --model /home/models/granite-7b-lab/ --data-path /home/data/jun12-phase05.jsonl --nnodes 1 --nproc-per-node 2 --num-epochs 1 --cpu-offload-optimizer --cpu-offload-pin-memory --ckpt-output-dir .
+
+# python3.11 /instructlab/entrypoint.py  --model /home/models/granite-7b-lab/ --data-path /home/data/jun12-phase05.jsonl --nnodes 1 --nproc-per-node 2 --num-epochs 1 --cpu-offload-optimizer --cpu-offload-pin-memory --ckpt-output-dir .
+
+# List the model and data directories so the log shows what was available to the run.
+echo "/home/models:"
+ls -la /home/models
+echo "/home/data:"
+ls -la /home/data
+
+echo "train cmd:"
+echo "$train_cmd"
+
+# Run the training command and watch its combined output.  Every line is saved
+# to train.txt, and lines containing RunningAvgSamplesPerSec are counted.  The
+# reader stops once $num_runavg_samples such lines have been seen; the script
+# fails if the output ends before that many were found.
+count=0
+echo "Training:"
+# $train_cmd is deliberately unquoted so the shell splits it into the command and its arguments.
+$train_cmd 2>&1 |
+{
+    # IFS= and -r keep each line verbatim (leading whitespace and backslashes).
+    while IFS= read -r line; do
+        echo "$line" >>train.txt
+        if [[ "$line" == *RunningAvgSamplesPerSec* ]]; then
+            ((count++))
+            echo "found line with RunningAvgSamplesPerSec: [$line]"
+        fi
+        if [ "$count" -ge "$num_runavg_samples" ]; then
+            break
+        fi
+    done
+    echo "count: $count"
+    if [ "$count" -lt "$num_runavg_samples" ]; then
+        echo "did not get the number of running avg samples, so exiting error"
+        exit 1
+    else
+        echo "Exiting without error"
+        exit 0
+    fi
+}
+# The pipeline's status is the status of the reader block above.
+exit $?
diff --git a/ilab-get-runtime b/ilab-get-runtime
@@ -0,0 +1,6 @@
+#!/bin/bash
+# vim: autoindent tabstop=4 shiftwidth=4 expandtab softtabstop=4 filetype=bash
+# -*- mode: sh; indent-tabs-mode: nil; sh-basic-offset: 4 -*-
+
+# Print -1, which requests an unbounded runtime.
+printf '%s\n' "-1"
diff --git a/ilab-post-process b/ilab-post-process
@@ -0,0 +1,62 @@
+#!/usr/bin/perl
+## -*- mode: perl; indent-tabs-mode: nil; perl-indent-level: 4 -*-
+## vim: autoindent tabstop=4 shiftwidth=4 expandtab softtabstop=4 filetype=perl
+
+use strict;
+use warnings;
+use JSON::XS;
+use Data::Dumper;
+use Time::Piece;
+BEGIN {
+    if (!(exists $ENV{'TOOLBOX_HOME'} && -d "$ENV{'TOOLBOX_HOME'}/perl")) {
+    print "This script requires libraries that are provided by the toolbox project.\n";
+    print "Toolbox can be acquired from https://github.com/perftool-incubator/toolbox and\n";
+    print "then use 'export TOOLBOX_HOME=/path/to/toolbox' so that it can be located.\n";
+    exit 1;
+    }
+}
+use lib "$ENV{'TOOLBOX_HOME'}/perl";
+use toolbox::json;
+use toolbox::metrics;
+
+# Read the training log (one JSON document per line), log one throughput sample for
+# every record that has an "epoch" key and a usable timestamp, then write
+# post-process-data.json describing the periods and primary metric for this sample.
+my $coder = JSON::XS->new;
+my $log_file = "training_params_and_metrics_global0.jsonl";
+my %names = ();
+my %desc = ('source' => 'ilab', 'class' => 'throughput', 'type' => 'train-samples-sec');
+(my $rc, my $log_fh) = open_read_text_file($log_file);
+# Stop with a clear message instead of trying to read from an undefined file handle.
+if (!defined $log_fh) {
+    die("Could not open file $log_file for reading\n");
+}
+# file contents to parse:
+#{"epoch": 0, "step": 1, "rank": 0, "loss": 0.18146394193172455, "overall_throughput": 3.5244029279710176, "lr": 0.0, "cuda_mem_allocated": 14.08400821685791, "cuda_malloc_retries": 0, "num_loss_counted_tokens": 4940, "batch_size": 14, "total_loss": 0.4069821238517761, "gradnorm": null, "weight_norm": 557.9681396484375, "timestamp": "2024-07-18T22:46:41.628932"}
+while (my $line = <$log_fh>) {
+    # Blank lines carry no data and would make decode() die.
+    next if $line =~ /^\s*$/;
+    my $json_ref = $coder->decode($line);
+    # Only records with an "epoch" key and a timestamp are turned into samples.
+    next unless (exists $$json_ref{"epoch"} and defined $$json_ref{"timestamp"});
+    # Strptime does not recognize fractional seconds, so the timestamp is split into
+    # whole seconds and an optional fraction.  A timestamp with no fraction at all is
+    # accepted and treated as a fraction of zero.
+    if ($$json_ref{"timestamp"} =~ /^(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})(?:\.(\d+))?/) {
+        my $timestamp = $1;
+        my $fraction = defined $2 ? $2 : "";
+        # Pad or truncate the fraction to exactly six digits (microseconds) so the
+        # conversion to milliseconds does not depend on how many digits were written.
+        my $usec = substr($fraction . "000000", 0, 6);
+        my $msec = $usec / 1000;
+        my $epoch = Time::Piece->strptime($timestamp, '%Y-%m-%dT%T')->epoch;
+        my $epoch_ms = $epoch * 1000 + $msec;
+        my %s = ('end' => int($epoch_ms), 'value' => $$json_ref{"overall_throughput"});
+        log_sample("0", \%desc, \%names, \%s);
+    }
+}
+close($log_fh);
+my $metric_data_name = finish_samples();
+
+# In any benchmark post-process script, the metrics generated need to be attributed to a
+# time-period (AKA benchmark-phase).  The period which is used to report an official
+# result for the benchmark is the 'measurement' period.  Other periods that may exist
+# could be "warm-up", "prep", etc.
+my %sample;
+my @periods;
+my %period = ('name' => 'measurement');
+my @metric_files = ( $metric_data_name );
+$period{'metric-files'} = \@metric_files;
+push(@periods, \%period);
+$sample{'primary-metric'} = "train-samples-sec";
+$sample{'primary-period'} = "measurement";
+$sample{'benchmark'} = "ilab";
+$sample{'periods'} = \@periods;
+$sample{'rickshaw-bench-metric'}{'schema'}{'version'} = "2021.04.12";
+# Use a lexical handle with three-argument open, and check close() because buffered
+# write errors only surface when the handle is closed.
+my $json_file = "post-process-data.json";
+open(my $json_fh, ">", $json_file) || die("Could not open file $json_file for writing\n");
+print $json_fh $coder->encode(\%sample);
+close($json_fh) || die("Could not close file $json_file after writing\n");
diff --git a/multiplex.json b/multiplex.json
@@ -0,0 +1,23 @@
+{
+    "presets": {
+    },
+    "validations": {
+      "generic_string" : {
+        "description" : "all types of strings",
+        "args" : [
+            "model", "data-path"
+        ],
+        "vals" : ".+"
+      },
+      "integer_ge_zero" : {
+        "description" : "a whole number >= 1",
+        "args" : [ "nnodes", "nproc-per-node", "num-epochs", "effective-batch-size", "num-runavg-samples" ],
+        "vals" : "[1-9][0-9]*"
+      },
+      "bool_0_1" : {
+        "description" : "boolean as 0 (false) or 1 (true)",
+        "args" : [ "cpu-offload-optimizer", "cpu-offload-pin-memory" ],
+        "vals" : "[0-1]"
+      }
+  }
+}
diff --git a/rickshaw.json b/rickshaw.json
@@ -0,0 +1,23 @@
+{
+  "rickshaw-benchmark": {
+    "schema": { "version": "2020.05.18" }
+  },
+  "benchmark": "ilab",
+  "controller" : {
+    "post-script" : "%bench-dir%/ilab-post-process"
+  },
+  "client" : {
+    "files-from-controller": [
+      {
+          "src": "%bench-dir%/ilab-get-runtime",
+          "dest": "/usr/bin/"
+      },
+      {
+          "src": "%bench-dir%/ilab-client",
+          "dest": "/usr/bin/"
+      }
+    ],
+    "start": "ilab-client",
+    "runtime": "ilab-get-runtime"
+  }
+}
diff --git a/run-ilab.json b/run-ilab.json
@@ -0,0 +1,65 @@
+{
+  "tags": {
+    "topology": "none"
+  },
+  "endpoints": [
+    {
+      "type": "remotehosts",
+      "remotes": [
+        {
+          "engines": [ { "role": "client", "ids": "1" } ],
+          "config": {
+            "host": "localhost",
+            "settings": {
+              "controller-ip-address": "10.26.8.21",
+              "userenv": "rhel-ai",
+              "osruntime": "podman",
+              "podman-settings": {
+                "device": "nvidia.com/gpu=all",
+                "shm-size": "10.00gb"
+              },
+	      "host-mounts": [
+                { "src": "/home", "dest": "/home" } ],
+              "cpu-partitioning": false
+            }
+          }
+        }
+      ]
+    }
+  ],
+  "run-params": { "num-samples": 1, "max-sample-failures": 1, "test-order": "r" },
+  "tool-params": [
+    { "tool": "sysstat",
+      "params": [
+        { "arg": "subtools", "val": "mpstat,sar,iostat" },
+        { "arg": "interval", "val": "15" }
+      ]
+    },
+    { "tool": "procstat",
+      "params": [
+        { "arg": "interval", "val": "15" }
+      ]
+    }
+  ],
+  "benchmarks": [
+    {
+      "name": "ilab",
+      "ids": "1",
+      "mv-params": {
+        "sets": [
+          {
+            "params": [
+	      { "arg": "cpu-offload-optimizer", "vals": [ "1" ] },
+	      { "arg": "cpu-offload-pin-memory", "vals": [ "1" ] },
+	      { "arg": "nnodes", "vals": [ "1" ] },
+	      { "arg": "model", "vals": [ "/home/models/granite-7b-lab/" ] },
+	      { "arg": "data-path", "vals": [ "/home/data/jun12-phase05.jsonl" ] },
+	      { "arg": "num-runavg-samples", "vals": [ "8" ] },
+	      { "arg": "nproc-per-node", "vals": [ "2" ] }
+	    ]
+          }
+	]
+      }
+    }
+  ]
+}
diff --git a/workshop.json b/workshop.json
@@ -0,0 +1,64 @@
+{
+    "workshop": {
+        "schema": {
+            "version": "2020.03.02"
+        }
+    },
+    "userenvs": [
+        {
+            "name": "rhel-ai",
+            "requirements": []
+        },
+        {
+            "name": "cuda-rhubi9",
+            "requirements": [
+                "python-pip-packages",
+		"os-pkgs",
+                "instructlab"
+            ]
+        }
+    ],
+    "requirements": [
+        {
+            "name": "python-pip-packages",
+            "type": "python311",
+            "python3_info": {
+                "packages": [
+                   "torch==2.3.0 --index-url https://download.pytorch.org/whl/cu121",
+		   "packaging wheel",
+		   "flash-attn==2.5.9.post1",
+		   "deepspeed==0.14.2",
+		   "transformers==4.40.1",
+		   "ipdb",
+		   "jupyterlab",
+		   "gpustat",
+		   "matplotlib",
+		   "hydra-core",
+		   "datasets",
+		   "rich",
+		   "numba",
+		   "'numpy<2.0.0'"
+                ]
+            }
+        },
+        {
+            "name": "os-pkgs",
+            "type": "distro",
+            "distro_info": {
+                "packages": [
+		    "libnccl",
+		    "libaio-devel"
+                ]
+            }
+        },
+	{   "name": "instructlab",
+            "type": "manual",
+            "manual_info": {
+                "commands": [
+                    "git clone https://github.com/instructlab/training.git /var/run/training",
+		    "pushd /var/run/training; python3.11 -m pip install ."
+                ]
+            }
+        }
+    ]
+}