From a11f7a40fdddd96c5a97856d84e0ac01a699ab24 Mon Sep 17 00:00:00 2001
From: Andrew Theurer
Date: Mon, 18 Nov 2024 16:20:16 -0500
Subject: [PATCH] bug fixes

---
 pytorch-client       |  14 +++---
 pytorch-post-process | 106 ++++++++++++++++++------------------------
 2 files changed, 51 insertions(+), 69 deletions(-)

diff --git a/pytorch-client b/pytorch-client
index ae7a15a..e83e65c 100755
--- a/pytorch-client
+++ b/pytorch-client
@@ -8,8 +8,6 @@ exec 2>&1
 
 model="llama"
 
-pytorch --version | awk '{print $3}' >pytorch-version.txt
-
 pwd=`/bin/pwd`
 pushd /opt/app-root/lib/python3.11/site-packages
 find . -name METADATA | cpio -pdumv $pwd/instructlab-site-packages-metadata
@@ -46,10 +44,12 @@ git clone https://github.com/pytorch/benchmark.git
 pushd benchmark
 pip uninstall -y nvidia-ml-py
 pip install pynvml
-python run_benchmark.py --help
-date +%s >begin.txt
-python run_benchmark.py test_bench -m $model >run_benchmark_output.txt
-date +%s >end.txt
-rc=$?
+python run_benchmark.py test_bench --help
+bench_cmd="python run_benchmark.py test_bench -m $model"
+echo "About to run: $bench_cmd"
+date +%s.%N >begin.txt
+$bench_cmd >run_benchmark_output.txt
+rc=$?
+date +%s.%N >end.txt
 popd
 exit $rc
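The client now stamps begin.txt and end.txt with date +%s.%N, i.e. epoch seconds with a fractional part, and pytorch-post-process (next diff) reduces those stamps to integer milliseconds via math.floor(1000 * float(...)). A minimal sketch of that conversion, assuming a benchmark/begin.txt written by the client above; the timestamp in the comment is illustrative:

    import math

    # First line of begin.txt holds a "date +%s.%N" stamp such as
    # "1731965456.123456789"; floor it to integer milliseconds.
    with open("benchmark/begin.txt", "r") as f:
        begin = int(math.floor(1000 * float(f.readline())))
    print(begin)  # e.g. 1731965456123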
diff --git a/pytorch-post-process b/pytorch-post-process
index c31a7bb..c3f7e24 100755
--- a/pytorch-post-process
+++ b/pytorch-post-process
@@ -50,9 +50,6 @@ def process_options():
 
 def main():
     process_options()
-    if t_global.args.workflow == '':
-        print('workflow was not defined, exiting')
-        return(1)
 
     # In any benchmark post-process script, the metrics generated need to be attributed to a
     # time-period (AKA benchmark-phase). The period which is used to report and offical
@@ -68,66 +65,51 @@ def main():
 
     metric_files = []
 
-    first_ts = None
-    last_ts = None
-    period = { 'name': 'phase' + str(phase), 'metric-files': [] }
-    file_id = 'phase' + str(phase)
-    desc = {'source' : 'pytorch', 'class': 'throughput'}
-    names = {}
-    desc['type'] = 'train-samples-sec';
-    filename = 'e2e/phase' + str(phase) + '/checkpoints/training_params_and_metrics_global0.jsonl.xz'
-    print('Opening ' + filename)
-    this_period_first_ts = None
-    this_period_last_ts = None
-    with open(filename, 'rt') as file:
-        for line in file:
-            d = json.loads(line)
-            # file contents to parse (per line):
-            #{"epoch": 0, "step": 1, "rank": 0,
-            # "loss": 0.18146394193172455,
-            # "overall_throughput": 3.5244029279710176,
-            # "lr": 0.0, "cuda_mem_allocated": 14.08400821685791,
-            # "cuda_malloc_retries": 0,
-            # "num_loss_counted_tokens": 4940, "batch_size": 14,
-            # "total_loss": 0.4069821238517761, "gradnorm": null,
-            # "weight_norm": 557.9681396484375,
-            # "timestamp": "2024-07-18T22:46:41.628932"}
-            if 'epoch' in d.keys():
-                dt = datetime.strptime(d['timestamp'], '%Y-%m-%dT%X.%f')
-                ts = math.floor(dt.timestamp() * 1000)
-                if this_period_first_ts == None:
-                    this_period_first_ts = ts
-                if first_ts == None:
-                    first_ts = ts
-                sample = {'end': ts, 'value': d['overall_throughput']}
-                log_sample(file_id, desc, names, sample)
-                last_ts = ts
-                this_period_last_ts = ts
-    metric_file_name = finish_samples()
-    period['metric-files'].append(metric_file_name)
-    iter_sample['periods'].append(period)
-
-    # Now create the primary metric and the primary-period
-    iter_sample['primary-metric'] = 'actual-train-seconds'
-    period = { 'name': 'measurement', 'metric-files': [] }
-    file_id = 'measurement'
-    desc = {'source' : 'pytorch', 'class': 'count', 'type': 'actual-train-seconds'}
-    names = {}
-    sample = {'begin': first_ts, 'end': last_ts, 'value': (last_ts - first_ts) / 1000}
-    log_sample(file_id, desc, names, sample)
-    metric_file_name = finish_samples()
-    period['metric-files'].append(metric_file_name)
-    iter_sample['periods'].append(period)
-
-
-    metric_file_name = finish_samples()
-    period['metric-files'].append(metric_file_name)
-    iter_sample['periods'].append(period)
-
-    f = open('post-process-data.json', 'w')
-    f.write(json.dumps(iter_sample))
-    return(0)
+    f = open("benchmark/begin.txt","r")
+    begin = int(math.floor(1000*float(f.readline())))
+    f.close()
+    f = open("benchmark/end.txt","r")
+    end = int(math.floor(1000*float(f.readline())))
+    f.close()
+
+    with open("benchmark/run_benchmark_output.txt", "r") as f:
+        initial_data = f.readline()
+        d = json.loads(f.read())
+
+    # example file:
+    # Running TorchBenchModelConfig(name='llama', test='eval', device='cuda', batch_size=None, extra_args=[], extra_env=None, output_dir=None) ... [done]
+    # {
+    #     "name": "test_bench",
+    #     "environ": {
+    #         "pytorch_git_version": "2d9a7eec9ccc362e00d2fd2b4b845d3f90d955aa",
+    #         "pytorch_version": "2.3.1",
+    #         "device": "NVIDIA L40S"
+    #     },
+    #     "metrics": {
+    #         "model=llama, test=eval, device=cuda, bs=None, extra_args=[], metric=latencies": 4.101454,
+    #         "model=llama, test=eval, device=cuda, bs=None, extra_args=[], metric=cpu_peak_mem": 0.8515625,
+    #         "model=llama, test=eval, device=cuda, bs=None, extra_args=[], metric=gpu_peak_mem": 3.8720703125
+    #     }
+    # }
+
+    print(d)
+
+    # Now create the primary metric and the primary-period
+    iter_sample['primary-metric'] = 'elapsed-time-milliseconds'
+    period = { 'name': 'measurement', 'metric-files': [] }
+    file_id = 'measurement'
+    desc = {'source' : 'pytorch', 'class': 'count', 'type': 'elapsed-time-milliseconds'}
+    names = {}
+    sample = {'begin': begin, 'end': end, 'value': end - begin}
+    log_sample(file_id, desc, names, sample)
+
+    metric_file_name = finish_samples()
+    period['metric-files'].append(metric_file_name)
+    iter_sample['periods'].append(period)
+
+    f = open('post-process-data.json', 'w')
+    f.write(json.dumps(iter_sample))
+    f.close()
 
 if __name__ == "__main__":
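After parsing run_benchmark_output.txt, main() only print()s the TorchBench JSON; each value stays behind a composite key such as "model=llama, test=eval, ..., metric=latencies". A hedged sketch of how such a key could be split into fields, reusing the metrics layout from the example file in the diff above (the dict literal and field names are lifted from that example, not from anything the patch itself implements):

    # Illustrative only: break a TorchBench metrics key into its
    # key=value parts and pull out the metric name and value.
    metrics = {
        "model=llama, test=eval, device=cuda, bs=None, extra_args=[], metric=latencies": 4.101454,
    }
    for key, value in metrics.items():
        fields = dict(part.split("=", 1) for part in key.split(", "))
        print(fields["model"], fields["metric"], value)  # llama latencies 4.101454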