From 2bea211445af7e704ab3ffb9029b829c2c5e10a2 Mon Sep 17 00:00:00 2001
From: Andrew Theurer
Date: Wed, 7 Aug 2024 07:53:14 -0400
Subject: [PATCH] Use python for post-processing

---
 ilab-post-process | 131 ++++++++++++++++++++++++++--------------------
 1 file changed, 73 insertions(+), 58 deletions(-)

diff --git a/ilab-post-process b/ilab-post-process
index 9f3f064..6c01587 100755
--- a/ilab-post-process
+++ b/ilab-post-process
@@ -1,62 +1,77 @@
-#!/usr/bin/perl
-## -*- mode: perl; indent-tabs-mode: nil; perl-indent-level: 4 -*-
-## vim: autoindent tabstop=4 shiftwidth=4 expandtab softtabstop=4 filetype=perl
-
-use strict;
-use warnings;
-use JSON::XS;
-use Data::Dumper;
-use Time::Piece;
-BEGIN {
-    if (!(exists $ENV{'TOOLBOX_HOME'} && -d "$ENV{'TOOLBOX_HOME'}/perl")) {
-        print "This script requires libraries that are provided by the toolbox project.\n";
-        print "Toolbox can be acquired from https://github.com/perftool-incubator/toolbox and\n";
-        print "then use 'export TOOLBOX_HOME=/path/to/toolbox' so that it can be located.\n";
-        exit 1;
-    }
-}
-use lib "$ENV{'TOOLBOX_HOME'}/perl";
-use toolbox::json;
-use toolbox::metrics;
-
-my $coder = JSON::XS->new;
-my $log_file = "training_params_and_metrics_global0.jsonl";
-my @logfile_metrics;
-my %names = ();
-my %desc = ('source' => 'ilab', 'class' => 'throughput', 'type' => 'train-samples-sec');
-(my $rc, my $log_fh) = open_read_text_file($log_file);
-# file contents to parse:
-#{"epoch": 0, "step": 1, "rank": 0, "loss": 0.18146394193172455, "overall_throughput": 3.5244029279710176, "lr": 0.0, "cuda_mem_allocated": 14.08400821685791, "cuda_malloc_retries": 0, "num_loss_counted_tokens": 4940, "batch_size": 14, "total_loss": 0.4069821238517761, "gradnorm": null, "weight_norm": 557.9681396484375, "timestamp": "2024-07-18T22:46:41.628932"}
-while (<$log_fh>) {
-    my $json_ref;
-    $json_ref = $coder->decode($_);
-    # Strptime does not recognize microseconds, so we split the timestamp in two sections
-    if ( exists $$json_ref{"epoch"} and $$json_ref{"timestamp"} =~ /([^\.]*)\.(\d+)/ ) {
-        my $timestamp = $1;
-        my $msec = $2 /1000;
-        my $epoch = Time::Piece->strptime($timestamp, '%Y-%m-%dT%T')->epoch;
-        my $epoch_ms = $epoch * 1000 + $msec;
-        my %s = ('end' => int $epoch_ms, 'value' => $$json_ref{"overall_throughput"});
-        log_sample("0", \%desc, \%names, \%s);
-    }
-}
-close($log_fh);
-my $metric_data_name = finish_samples();
+#!/usr/bin/env python3
+# -*- mode: python; indent-tabs-mode: nil; python-indent-level: 4 -*-
+# vim: autoindent tabstop=4 shiftwidth=4 expandtab softtabstop=4 filetype=python
+
+import sys
+import os
+import lzma
+import re
+import copy
+import math
+import json
+from datetime import datetime
+from pathlib import Path
+
+TOOLBOX_HOME = os.environ.get('TOOLBOX_HOME')
+if TOOLBOX_HOME is None:
+    print("This script requires libraries that are provided by the toolbox project.")
+    print("Toolbox can be acquired from https://github.com/perftool-incubator/toolbox and")
+    print("then use 'export TOOLBOX_HOME=/path/to/toolbox' so that it can be located.")
+    exit(1)
+else:
+    p = Path(TOOLBOX_HOME) / 'python'
+    if not p.exists() or not p.is_dir():
+        print("ERROR: <TOOLBOX_HOME>/python ('%s') does not exist!" % (p))
+        exit(2)
+    sys.path.append(str(p))
+from toolbox.metrics import log_sample
+from toolbox.metrics import finish_samples
 # In any benchmark post-process script, the metrics generated need to be attributed to a
 # time-period (AKA benchmark-phase). The period which is used to report and offical
-# result for the benchmark is the 'measurement' period. Ohter periods thay may exist
+# result for the benchmark is the 'measurement' period. Other periods that may exist
 # could be "warm-up", "prep", etc.
-my %sample;
-my @periods;
-my %period = ('name' => 'measurement');
-my @metric_files = ( $metric_data_name );
-$period{'metric-files'} = \@metric_files;
-push(@periods, \%period);
-$sample{'primary-metric'} = "train-samples-sec";
-$sample{'primary-period'} = "measurement";
-$sample{'benchmark'} = "ilab";
-$sample{'periods'} = \@periods;
-$sample{'rickshaw-bench-metric'}{'schema'}{'version'} = "2021.04.12";
-open(JSON_FH, ">post-process-data.json") || die("Could not open file post-process-data.json for writing\n");
-print JSON_FH $coder->encode(\%sample);
+
+iter_sample = { 'primary-metric': "train-samples-sec", # will [eventually] vary depending on what was done
+                'primary-period': "measurement",
+                'benchmark': "ilab",
+                'periods': [],
+                'rickshaw-bench-metric': { 'schema': { 'version': '2021.04.12' } }
+              }
+
+metric_files = []
+
+period = { 'name': 'measurement', 'metric-files': [] }
+file_id = 'global0'
+
+jsonl_desc = {'source' : 'ilab', 'type': 'train-samples-sec', 'class': 'throughput'}
+names = {}
+filename = 'training_params_and_metrics_global0.jsonl.xz'
+print('Opening file {0:s}'.format(filename))
+with lzma.open(filename, 'rt') as file:
+    for line in file:
+        d = json.loads(line)
+        # file contents to parse (per line):
+        #{"epoch": 0, "step": 1, "rank": 0,
+        # "loss": 0.18146394193172455,
+        # "overall_throughput": 3.5244029279710176,
+        # "lr": 0.0, "cuda_mem_allocated": 14.08400821685791,
+        # "cuda_malloc_retries": 0,
+        # "num_loss_counted_tokens": 4940, "batch_size": 14,
+        # "total_loss": 0.4069821238517761, "gradnorm": null,
+        # "weight_norm": 557.9681396484375,
+        # "timestamp": "2024-07-18T22:46:41.628932"}
+        if 'epoch' in d.keys():
+            dt = datetime.strptime(d['timestamp'], '%Y-%m-%dT%H:%M:%S.%f')
+            ts = math.floor(dt.timestamp() * 1000)
+            sample = {'end': ts, 'value': d['overall_throughput']}
+            log_sample(file_id, jsonl_desc, names, sample)
+
+metric_file_name = finish_samples()
+
+period['metric-files'].append(metric_file_name)
+iter_sample['periods'].append(period)
+
+f = open('post-process-data.json', 'w')
+f.write(json.dumps(iter_sample))
+f.close()
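
For reference, the iter_sample structure that the new script writes to post-process-data.json should come out roughly as below. This is only a sketch: it is pretty-printed here for readability (the script writes a single unformatted json.dumps() line), and the metric-files entry is whatever file name finish_samples() returns, shown here as a placeholder.

{
  "primary-metric": "train-samples-sec",
  "primary-period": "measurement",
  "benchmark": "ilab",
  "periods": [
    { "name": "measurement", "metric-files": [ "<file name returned by finish_samples()>" ] }
  ],
  "rickshaw-bench-metric": { "schema": { "version": "2021.04.12" } }
}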