From e59197130632faca7d5118efeae8325df9dfc24f Mon Sep 17 00:00:00 2001
From: Andrew Theurer
Date: Fri, 15 Nov 2024 17:57:25 -0500
Subject: [PATCH] initial script, without functioning post-processing

---
 README.md            |   4 +-
 multiplex.json       |  13 +++++
 pytorch-base         |   3 +
 pytorch-client       |  55 ++++++++++++++++
 pytorch-get-runtime  |   6 ++
 pytorch-post-process | 129 +++++++++++++++++++++++++++++++++++++++++++
 rickshaw.json        |  27 ++++++++
 workshop.json        |  18 ++++++
 8 files changed, 254 insertions(+), 1 deletion(-)
 create mode 100644 multiplex.json
 create mode 100755 pytorch-base
 create mode 100755 pytorch-client
 create mode 100755 pytorch-get-runtime
 create mode 100755 pytorch-post-process
 create mode 100644 rickshaw.json
 create mode 100644 workshop.json

diff --git a/README.md b/README.md
index 49d65a8..d6a3776 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,4 @@
 # bench-pytorch
-wrapper for pytorch benchmark
+benchmark automation for pytorch-benchmark
+
+This project works with crucible automation to run the pytorch benchmark. Please see the run-pytorch.json file for example usage. That file is used with "crucible run --from-file run-pytorch.json" on your host.
diff --git a/multiplex.json b/multiplex.json
new file mode 100644
index 0000000..d40a38c
--- /dev/null
+++ b/multiplex.json
@@ -0,0 +1,13 @@
+{
+    "presets": {
+    },
+    "validations": {
+        "models" : {
+            "description" : "valid models",
+            "args" : [
+                "model"
+            ],
+            "vals" : "BERT_pytorch|fastNLP_Bert|hf_Bert_large|hf_BigBird|hf_DistilBert|hf_GPT2|hf_GPT2_large|hf_Longformer|hf_Roberta_base|hf_Reformer|llama|llama_v2_7b_16h|llava|hf_Whisper|hf_T5_large|hf_T5|nanogpt|timm_vision_transformer|timm_vision_transformer_large"
+        }
+    }
+}
diff --git a/pytorch-base b/pytorch-base
new file mode 100755
index 0000000..acfa336
--- /dev/null
+++ b/pytorch-base
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+if ! source ${TOOLBOX_HOME}/bash/library/bench-base; then echo "ERROR: Could not source bench-base from \$TOOLBOX_HOME [${TOOLBOX_HOME}]"; exit 1; fi
diff --git a/pytorch-client b/pytorch-client
new file mode 100755
index 0000000..ae7a15a
--- /dev/null
+++ b/pytorch-client
@@ -0,0 +1,55 @@
+#!/bin/bash
+# -*- mode: sh; indent-tabs-mode: nil; sh-basic-offset: 4 -*-
+# vim: autoindent tabstop=4 shiftwidth=4 expandtab softtabstop=4 filetype=bash
+exec >pytorch-client-stderrout.txt
+exec 2>&1
+
+. /usr/bin/pytorch-base || { echo "/usr/bin/pytorch-base not found"; exit 1; }
+
+model="llama"
+
+python -c 'import torch; print(torch.__version__)' >pytorch-version.txt
+
+pwd=`/bin/pwd`
+pushd /opt/app-root/lib/python3.11/site-packages
+find . -name METADATA | cpio -pdumv $pwd/pytorch-site-packages-metadata
+popd
+
+longopts=""
+longopts+=" model:"
+
+opts=$(getopt -q -o "" --longoptions "$longopts" -n "getopt.sh" -- "$@");
+if [ $? -ne 0 ]; then
+    printf -- "\tUnrecognized option specified\n\n"
+    exit 1
+fi
+eval set -- "$opts";
+while true; do
+    arg=$1; shift
+    val=$1; shift
+    case "$arg" in
+        --model)
+            model="$val"
+            ;;
+        --)
+            break
+            ;;
+        *)
+            echo "Invalid option: [$arg]"
+            exit 1
+    esac
+done
+
+
+echo "model: $model"
+git clone https://github.com/pytorch/benchmark.git
+pushd benchmark
+pip uninstall -y nvidia-ml-py
+pip install pynvml
+python run_benchmark.py --help
+date +%s >begin.txt
+python run_benchmark.py test_bench -m $model >run_benchmark_output.txt
+rc=$?
+date +%s >end.txt
+popd
+exit $rc
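
Note: outside of a full crucible run, pytorch-client can be exercised by hand to sanity-check a model choice. A minimal sketch, assuming the container image already provides /usr/bin/pytorch-base and a working pytorch install (hf_GPT2 is one of the models validated by multiplex.json):

    # start the client the same way rickshaw would, then inspect its artifacts
    ./pytorch-client --model hf_GPT2
    cat benchmark/run_benchmark_output.txt      # raw output from run_benchmark.py
    cat benchmark/begin.txt benchmark/end.txt   # epoch timestamps bracketing the run
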
diff --git a/pytorch-get-runtime b/pytorch-get-runtime
new file mode 100755
index 0000000..8aadf7b
--- /dev/null
+++ b/pytorch-get-runtime
@@ -0,0 +1,6 @@
+#!/bin/bash
+# vim: autoindent tabstop=4 shiftwidth=4 expandtab softtabstop=4 filetype=bash
+# -*- mode: sh; indent-tabs-mode: nil; sh-basic-offset: 4 -*-
+
+# request unbounded runtime by returning -1
+echo "-1"
diff --git a/pytorch-post-process b/pytorch-post-process
new file mode 100755
index 0000000..c31a7bb
--- /dev/null
+++ b/pytorch-post-process
@@ -0,0 +1,129 @@
+#!/usr/bin/env python3
+# -*- mode: python; indent-tabs-mode: nil; python-indent-level: 4 -*-
+# vim: autoindent tabstop=4 shiftwidth=4 expandtab softtabstop=4 filetype=python
+
+import sys
+import os
+import lzma
+import re
+import copy
+import math
+import json
+import yaml
+import argparse
+import glob
+from datetime import datetime
+from pathlib import Path
+
+TOOLBOX_HOME = os.environ.get('TOOLBOX_HOME')
+if TOOLBOX_HOME is None:
+    print("This script requires libraries that are provided by the toolbox project.")
+    print("Toolbox can be acquired from https://github.com/perftool-incubator/toolbox and")
+    print("then use 'export TOOLBOX_HOME=/path/to/toolbox' so that it can be located.")
+    exit(1)
+else:
+    p = Path(TOOLBOX_HOME) / 'python'
+    if not p.exists() or not p.is_dir():
+        print("ERROR: <TOOLBOX_HOME>/python ('%s') does not exist!" % (p))
+        exit(2)
+    sys.path.append(str(p))
+from toolbox.metrics import log_sample
+from toolbox.metrics import finish_samples
+
+params = {}
+
+class t_global(object):
+    args = None
+
+def process_options():
+    parser = argparse.ArgumentParser(description = 'Post process raw benchmark data into Common Data Model output')
+
+    parser.add_argument('--model',
+                        dest = 'model',
+                        help = 'name of the pytorch-benchmark model that was run',
+                        default = "llama"
+                        )
+
+    t_global.args, unknown = parser.parse_known_args()
+
+    return()
+
+def main():
+    process_options()
+    if t_global.args.model == '':
+        print('model was not defined, exiting')
+        return(1)
+
+    # In any benchmark post-process script, the metrics generated need to be attributed to a
+    # time-period (AKA benchmark-phase). The period which is used to report an official
+    # result for the benchmark is the 'measurement' period. Other periods that may exist
+    # could be "warm-up", "prep", etc.
+
+    iter_sample = {
+        'primary-period': "measurement",
+        'benchmark': "pytorch",
+        'periods': [],
+        'rickshaw-bench-metric': { 'schema': { 'version': '2021.04.12' } }
+    }
+
+    metric_files = []
+
+    first_ts = None
+    last_ts = None
+    phase = 0 # only a single phase is processed for now
+    period = { 'name': 'phase' + str(phase), 'metric-files': [] }
+    file_id = 'phase' + str(phase)
+    desc = {'source' : 'pytorch', 'class': 'throughput'}
+    names = {}
+    desc['type'] = 'train-samples-sec'
+    filename = 'e2e/phase' + str(phase) + '/checkpoints/training_params_and_metrics_global0.jsonl.xz'
+    print('Opening ' + filename)
+    this_period_first_ts = None
+    this_period_last_ts = None
+    with lzma.open(filename, 'rt') as file:
+        for line in file:
+            d = json.loads(line)
+            # file contents to parse (per line):
+            #{"epoch": 0, "step": 1, "rank": 0,
+            # "loss": 0.18146394193172455,
+            # "overall_throughput": 3.5244029279710176,
+            # "lr": 0.0, "cuda_mem_allocated": 14.08400821685791,
+            # "cuda_malloc_retries": 0,
+            # "num_loss_counted_tokens": 4940, "batch_size": 14,
+            # "total_loss": 0.4069821238517761, "gradnorm": null,
+            # "weight_norm": 557.9681396484375,
+            # "timestamp": "2024-07-18T22:46:41.628932"}
+            if 'epoch' in d.keys():
+                dt = datetime.strptime(d['timestamp'], '%Y-%m-%dT%H:%M:%S.%f')
+                ts = math.floor(dt.timestamp() * 1000)
+                if this_period_first_ts is None:
+                    this_period_first_ts = ts
+                if first_ts is None:
+                    first_ts = ts
+                sample = {'end': ts, 'value': d['overall_throughput']}
+                log_sample(file_id, desc, names, sample)
+                last_ts = ts
+                this_period_last_ts = ts
+    metric_file_name = finish_samples()
+    period['metric-files'].append(metric_file_name)
+    iter_sample['periods'].append(period)
+
+    # Now create the primary metric and the primary-period
+    iter_sample['primary-metric'] = 'actual-train-seconds'
+    period = { 'name': 'measurement', 'metric-files': [] }
+    file_id = 'measurement'
+    desc = {'source' : 'pytorch', 'class': 'count', 'type': 'actual-train-seconds'}
+    names = {}
+    sample = {'begin': first_ts, 'end': last_ts, 'value': (last_ts - first_ts) / 1000}
+    log_sample(file_id, desc, names, sample)
+    metric_file_name = finish_samples()
+    period['metric-files'].append(metric_file_name)
+    iter_sample['periods'].append(period)
+
+    with open('post-process-data.json', 'w') as f:
+        f.write(json.dumps(iter_sample))
+    return(0)
+
+
+if __name__ == "__main__":
+    exit(main())
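
Note: once the parsing is wired to the actual run_benchmark.py output, the post-process-data.json written by this script should have roughly the following shape, derived from the iter_sample structure above. The metric-files entries are whatever toolbox's finish_samples() returns; the names below are placeholders:

    {
        "primary-period": "measurement",
        "primary-metric": "actual-train-seconds",
        "benchmark": "pytorch",
        "rickshaw-bench-metric": { "schema": { "version": "2021.04.12" } },
        "periods": [
            { "name": "phase0",      "metric-files": [ "<file-from-finish_samples>" ] },
            { "name": "measurement", "metric-files": [ "<file-from-finish_samples>" ] }
        ]
    }
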
diff --git a/rickshaw.json b/rickshaw.json
new file mode 100644
index 0000000..1298206
--- /dev/null
+++ b/rickshaw.json
@@ -0,0 +1,27 @@
+{
+    "rickshaw-benchmark": {
+        "schema": { "version": "2020.05.18" }
+    },
+    "benchmark": "pytorch",
+    "controller" : {
+        "post-script" : "%bench-dir%/pytorch-post-process"
+    },
+    "client" : {
+        "files-from-controller": [
+            {
+                "src": "%bench-dir%/pytorch-get-runtime",
+                "dest": "/usr/bin/"
+            },
+            {
+                "src": "%bench-dir%/pytorch-base",
+                "dest": "/usr/bin/"
+            },
+            {
+                "src": "%bench-dir%/pytorch-client",
+                "dest": "/usr/bin/"
+            }
+        ],
+        "start": "pytorch-client",
+        "runtime": "pytorch-get-runtime"
+    }
+}
diff --git a/workshop.json b/workshop.json
new file mode 100644
index 0000000..1d59a19
--- /dev/null
+++ b/workshop.json
@@ -0,0 +1,18 @@
+{
+    "workshop": {
+        "schema": {
+            "version": "2020.03.02"
+        }
+    },
+    "userenvs": [
+        {
+            "name": "default",
+            "requirements": []
+        },
+        {
+            "name": "rhel-ai",
+            "requirements": []
+        }
+    ],
+    "requirements": []
+}
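
Note: the README refers to a run-pytorch.json that is not part of this commit. Until it lands, the following is a hypothetical minimal run file; the mv-params layout shown is an assumption about crucible's run-file schema (check every key below against the crucible documentation), with the model value drawn from the list validated by multiplex.json:

    {
        "benchmarks": [
            {
                "name": "pytorch",
                "mv-params": {
                    "global-options": [
                        {
                            "name": "common-params",
                            "params": [
                                { "arg": "model", "vals": [ "hf_GPT2" ] }
                            ]
                        }
                    ],
                    "sets": [ { "include": "common-params" } ]
                }
            }
        ]
    }

It would then be submitted with the command the README already documents:

    crucible run --from-file run-pytorch.json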