From 2bea211445af7e704ab3ffb9029b829c2c5e10a2 Mon Sep 17 00:00:00 2001
From: Andrew Theurer
Date: Wed, 7 Aug 2024 07:53:14 -0400
Subject: [PATCH] Use python for post-processing

---
 ilab-post-process | 131 ++++++++++++++++++++++++++--------------------
 1 file changed, 73 insertions(+), 58 deletions(-)

diff --git a/ilab-post-process b/ilab-post-process
index 9f3f064..6c01587 100755
--- a/ilab-post-process
+++ b/ilab-post-process
@@ -1,62 +1,77 @@
-#!/usr/bin/perl
-## -*- mode: perl; indent-tabs-mode: nil; perl-indent-level: 4 -*-
-## vim: autoindent tabstop=4 shiftwidth=4 expandtab softtabstop=4 filetype=perl
-
-use strict;
-use warnings;
-use JSON::XS;
-use Data::Dumper;
-use Time::Piece;
-BEGIN {
-    if (!(exists $ENV{'TOOLBOX_HOME'} && -d "$ENV{'TOOLBOX_HOME'}/perl")) {
-        print "This script requires libraries that are provided by the toolbox project.\n";
-        print "Toolbox can be acquired from https://github.com/perftool-incubator/toolbox and\n";
-        print "then use 'export TOOLBOX_HOME=/path/to/toolbox' so that it can be located.\n";
-        exit 1;
-    }
-}
-use lib "$ENV{'TOOLBOX_HOME'}/perl";
-use toolbox::json;
-use toolbox::metrics;
-
-my $coder = JSON::XS->new;
-my $log_file = "training_params_and_metrics_global0.jsonl";
-my @logfile_metrics;
-my %names = ();
-my %desc = ('source' => 'ilab', 'class' => 'throughput', 'type' => 'train-samples-sec');
-(my $rc, my $log_fh) = open_read_text_file($log_file);
-# file contents to parse:
-#{"epoch": 0, "step": 1, "rank": 0, "loss": 0.18146394193172455, "overall_throughput": 3.5244029279710176, "lr": 0.0, "cuda_mem_allocated": 14.08400821685791, "cuda_malloc_retries": 0, "num_loss_counted_tokens": 4940, "batch_size": 14, "total_loss": 0.4069821238517761, "gradnorm": null, "weight_norm": 557.9681396484375, "timestamp": "2024-07-18T22:46:41.628932"}
-while (<$log_fh>) {
-    my $json_ref;
-    $json_ref = $coder->decode($_);
-    # Strptime does not recognize microseconds, so we split the timestamp in two sections
-    if ( exists $$json_ref{"epoch"} and $$json_ref{"timestamp"} =~ /([^\.]*)\.(\d+)/ ) {
-        my $timestamp = $1;
-        my $msec = $2 /1000;
-        my $epoch = Time::Piece->strptime($timestamp, '%Y-%m-%dT%T')->epoch;
-        my $epoch_ms = $epoch * 1000 + $msec;
-        my %s = ('end' => int $epoch_ms, 'value' => $$json_ref{"overall_throughput"});
-        log_sample("0", \%desc, \%names, \%s);
-    }
-}
-close($log_fh);
-my $metric_data_name = finish_samples();
+#!/usr/bin/env python3
+# -*- mode: python; indent-tabs-mode: nil; python-indent-level: 4 -*-
+# vim: autoindent tabstop=4 shiftwidth=4 expandtab softtabstop=4 filetype=python
+
+import sys
+import os
+import lzma
+import re
+import copy
+import math
+import json
+from datetime import datetime
+from pathlib import Path
+
+TOOLBOX_HOME = os.environ.get('TOOLBOX_HOME')
+if TOOLBOX_HOME is None:
+    print("This script requires libraries that are provided by the toolbox project.")
+    print("Toolbox can be acquired from https://github.com/perftool-incubator/toolbox and")
+    print("then use 'export TOOLBOX_HOME=/path/to/toolbox' so that it can be located.")
+    exit(1)
+else:
+    p = Path(TOOLBOX_HOME) / 'python'
+    if not p.exists() or not p.is_dir():
+        print("ERROR: <TOOLBOX_HOME>/python ('%s') does not exist!" % (p))
+        exit(2)
+    sys.path.append(str(p))
+from toolbox.metrics import log_sample
+from toolbox.metrics import finish_samples
 # In any benchmark post-process script, the metrics generated need to be attributed to a
 # time-period (AKA benchmark-phase). The period which is used to report and offical
-# result for the benchmark is the 'measurement' period. Ohter periods thay may exist
+# result for the benchmark is the 'measurement' period. Other periods that may exist
 # could be "warm-up", "prep", etc.
-my %sample;
-my @periods;
-my %period = ('name' => 'measurement');
-my @metric_files = ( $metric_data_name );
-$period{'metric-files'} = \@metric_files;
-push(@periods, \%period);
-$sample{'primary-metric'} = "train-samples-sec";
-$sample{'primary-period'} = "measurement";
-$sample{'benchmark'} = "ilab";
-$sample{'periods'} = \@periods;
-$sample{'rickshaw-bench-metric'}{'schema'}{'version'} = "2021.04.12";
-open(JSON_FH, ">post-process-data.json") || die("Could not open file post-process-data.json for writing\n");
-print JSON_FH $coder->encode(\%sample);
+
+iter_sample = { 'primary-metric': "train-samples-sec", # will [eventually] vary depending on what was done
+                'primary-period': "measurement",
+                'benchmark': "ilab",
+                'periods': [],
+                'rickshaw-bench-metric': { 'schema': { 'version': '2021.04.12' } }
+              }
+
+metric_files = []
+
+period = { 'name': 'measurement', 'metric-files': [] }
+file_id = 'global0'
+
+jsonl_desc = {'source' : 'ilab', 'type': 'train-samples-sec', 'class': 'throughput'}
+names = {}
+filename = 'training_params_and_metrics_global0.jsonl.xz'
+print('Opening file {0:s}'.format(filename))
+with lzma.open(filename, 'rt') as file:
+    for line in file:
+        d = json.loads(line)
+        # file contents to parse (per line):
+        #{"epoch": 0, "step": 1, "rank": 0,
+        # "loss": 0.18146394193172455,
+        # "overall_throughput": 3.5244029279710176,
+        # "lr": 0.0, "cuda_mem_allocated": 14.08400821685791,
+        # "cuda_malloc_retries": 0,
+        # "num_loss_counted_tokens": 4940, "batch_size": 14,
+        # "total_loss": 0.4069821238517761, "gradnorm": null,
+        # "weight_norm": 557.9681396484375,
+        # "timestamp": "2024-07-18T22:46:41.628932"}
+        if 'epoch' in d.keys():
+            dt = datetime.strptime(d['timestamp'], '%Y-%m-%dT%H:%M:%S.%f')
+            ts = math.floor(dt.timestamp() * 1000)
+            sample = {'end': ts, 'value': d['overall_throughput']}
+            log_sample(file_id, jsonl_desc, names, sample)
+
+metric_file_name = finish_samples()
+
+period['metric-files'].append(metric_file_name)
+iter_sample['periods'].append(period)
+
+f = open('post-process-data.json', 'w')
+f.write(json.dumps(iter_sample))
+f.close()
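
For reference, the iter_sample structure that the new script writes to post-process-data.json should come out roughly as below. This is only a sketch: it is pretty-printed here for readability (the script writes a single unformatted json.dumps() line), and the metric-files entry is whatever file name finish_samples() returns, shown here as a placeholder.

{
  "primary-metric": "train-samples-sec",
  "primary-period": "measurement",
  "benchmark": "ilab",
  "periods": [
    { "name": "measurement", "metric-files": [ "<file name returned by finish_samples()>" ] }
  ],
  "rickshaw-bench-metric": { "schema": { "version": "2021.04.12" } }
}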