Use python for post-processing #9

Merged
merged 1 commit on Aug 7, 2024
ilab-post-process: 131 changes (73 additions, 58 deletions)
@@ -1,62 +1,77 @@
#!/usr/bin/perl
## -*- mode: perl; indent-tabs-mode: nil; perl-indent-level: 4 -*-
## vim: autoindent tabstop=4 shiftwidth=4 expandtab softtabstop=4 filetype=perl

use strict;
use warnings;
use JSON::XS;
use Data::Dumper;
use Time::Piece;
BEGIN {
    if (!(exists $ENV{'TOOLBOX_HOME'} && -d "$ENV{'TOOLBOX_HOME'}/perl")) {
        print "This script requires libraries that are provided by the toolbox project.\n";
        print "Toolbox can be acquired from https://github.com/perftool-incubator/toolbox and\n";
        print "then use 'export TOOLBOX_HOME=/path/to/toolbox' so that it can be located.\n";
        exit 1;
    }
}
use lib "$ENV{'TOOLBOX_HOME'}/perl";
use toolbox::json;
use toolbox::metrics;

my $coder = JSON::XS->new;
my $log_file = "training_params_and_metrics_global0.jsonl";
my @logfile_metrics;
my %names = ();
my %desc = ('source' => 'ilab', 'class' => 'throughput', 'type' => 'train-samples-sec');
(my $rc, my $log_fh) = open_read_text_file($log_file);
# file contents to parse:
#{"epoch": 0, "step": 1, "rank": 0, "loss": 0.18146394193172455, "overall_throughput": 3.5244029279710176, "lr": 0.0, "cuda_mem_allocated": 14.08400821685791, "cuda_malloc_retries": 0, "num_loss_counted_tokens": 4940, "batch_size": 14, "total_loss": 0.4069821238517761, "gradnorm": null, "weight_norm": 557.9681396484375, "timestamp": "2024-07-18T22:46:41.628932"}
while (<$log_fh>) {
    my $json_ref;
    $json_ref = $coder->decode($_);
    # Strptime does not recognize microseconds, so we split the timestamp in two sections
    if ( exists $$json_ref{"epoch"} and $$json_ref{"timestamp"} =~ /([^\.]*)\.(\d+)/ ) {
        my $timestamp = $1;
        my $msec = $2 / 1000;
        my $epoch = Time::Piece->strptime($timestamp, '%Y-%m-%dT%T')->epoch;
        my $epoch_ms = $epoch * 1000 + $msec;
        my %s = ('end' => int $epoch_ms, 'value' => $$json_ref{"overall_throughput"});
        log_sample("0", \%desc, \%names, \%s);
    }
}
close($log_fh);
my $metric_data_name = finish_samples();
#!/usr/bin/env python3
# -*- mode: python; indent-tabs-mode: nil; python-indent-level: 4 -*-
# vim: autoindent tabstop=4 shiftwidth=4 expandtab softtabstop=4 filetype=python

import sys
import os
import lzma
import re
import copy
import math
import json
from datetime import datetime
from pathlib import Path

TOOLBOX_HOME = os.environ.get('TOOLBOX_HOME')
if TOOLBOX_HOME is None:
    print("This script requires libraries that are provided by the toolbox project.")
    print("Toolbox can be acquired from https://github.com/perftool-incubator/toolbox and")
    print("then use 'export TOOLBOX_HOME=/path/to/toolbox' so that it can be located.")
    exit(1)
else:
    p = Path(TOOLBOX_HOME) / 'python'
    if not p.exists() or not p.is_dir():
        print("ERROR: <TOOLBOX_HOME>/python ('%s') does not exist!" % (p))
        exit(2)
    sys.path.append(str(p))
from toolbox.metrics import log_sample
from toolbox.metrics import finish_samples

# In any benchmark post-process script, the metrics generated need to be attributed to a
# time-period (AKA benchmark-phase). The period which is used to report an official
# result for the benchmark is the 'measurement' period. Other periods that may exist
# could be "warm-up", "prep", etc.
my %sample;
my @periods;
my %period = ('name' => 'measurement');
my @metric_files = ( $metric_data_name );
$period{'metric-files'} = \@metric_files;
push(@periods, \%period);
$sample{'primary-metric'} = "train-samples-sec";
$sample{'primary-period'} = "measurement";
$sample{'benchmark'} = "ilab";
$sample{'periods'} = \@periods;
$sample{'rickshaw-bench-metric'}{'schema'}{'version'} = "2021.04.12";
open(JSON_FH, ">post-process-data.json") || die("Could not open file post-process-data.json for writing\n");
print JSON_FH $coder->encode(\%sample);

iter_sample = { 'primary-metric': "train-samples-sec", # will [eventually] vary depending on what was done
                'primary-period': "measurement",
                'benchmark': "ilab",
                'periods': [],
                'rickshaw-bench-metric': { 'schema': { 'version': '2021.04.12' } }
}

metric_files = []

period = { 'name': 'measurement', 'metric-files': [] }
file_id = 'global0'

jsonl_desc = {'source' : 'ilab', 'type': 'train-samples-sec', 'class': 'throughput'}
names = {}
filename = 'training_params_and_metrics_global0.jsonl.xz'
print('Opening file {0:s}'.format(filename))
with lzma.open(filename, 'rt') as file:
    for line in file:
        d = json.loads(line)
        # file contents to parse (per line):
        #{"epoch": 0, "step": 1, "rank": 0,
        # "loss": 0.18146394193172455,
        # "overall_throughput": 3.5244029279710176,
        # "lr": 0.0, "cuda_mem_allocated": 14.08400821685791,
        # "cuda_malloc_retries": 0,
        # "num_loss_counted_tokens": 4940, "batch_size": 14,
        # "total_loss": 0.4069821238517761, "gradnorm": null,
        # "weight_norm": 557.9681396484375,
        # "timestamp": "2024-07-18T22:46:41.628932"}
        if 'epoch' in d.keys():
            dt = datetime.strptime(d['timestamp'], '%Y-%m-%dT%X.%f')
            ts = math.floor(dt.timestamp() * 1000)
            sample = {'end': ts, 'value': d['overall_throughput']}
            log_sample(file_id, jsonl_desc, names, sample)

metric_file_name = finish_samples()

period['metric-files'].append(metric_file_name)
iter_sample['periods'].append(period)

with open('post-process-data.json', 'w') as f:
    f.write(json.dumps(iter_sample))
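
A minimal, self-contained sketch (not part of the PR) of the per-record transformation the new script applies: parse one JSONL line, turn its microsecond timestamp into integer epoch milliseconds, and build the sample dict handed to log_sample(). The explicit '%H:%M:%S' directive is used here; it is what the locale-dependent '%X' in the script resolves to under the default C locale.

#!/usr/bin/env python3
# Sketch only: mirrors the timestamp/throughput handling in ilab-post-process.
import json
import math
from datetime import datetime

# Example record taken from the comment in the script
line = ('{"epoch": 0, "step": 1, "overall_throughput": 3.5244029279710176, '
        '"timestamp": "2024-07-18T22:46:41.628932"}')
d = json.loads(line)

# Parse the timestamp, including microseconds, then convert to integer
# milliseconds since the epoch (the datetime is naive, so local time is assumed).
dt = datetime.strptime(d['timestamp'], '%Y-%m-%dT%H:%M:%S.%f')
ts = math.floor(dt.timestamp() * 1000)

sample = {'end': ts, 'value': d['overall_throughput']}
print(sample)  # {'end': <epoch ms>, 'value': 3.5244029279710176}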
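
For reference, a sketch of the post-process-data.json document the script ends up writing. The metric-file name shown is a hypothetical placeholder; the real value is whatever finish_samples() returned.

# Sketch only: the shape of post-process-data.json produced by the script,
# with 'metric-data-0.json.xz' standing in for the file name returned by
# finish_samples().
import json

iter_sample = {
    'primary-metric': 'train-samples-sec',
    'primary-period': 'measurement',
    'benchmark': 'ilab',
    'periods': [
        {'name': 'measurement', 'metric-files': ['metric-data-0.json.xz']}
    ],
    'rickshaw-bench-metric': {'schema': {'version': '2021.04.12'}}
}

print(json.dumps(iter_sample, indent=4))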