From a305ce23de7d1d97da3e970b29e5d600dac1bf43 Mon Sep 17 00:00:00 2001 From: Andrew Theurer Date: Tue, 3 Dec 2024 13:41:32 -0500 Subject: [PATCH] install specific pynvml, and pp fix -valid latency is in fact an actual number --- pytorch-client | 15 ++++++++------- pytorch-post-process | 4 +++- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/pytorch-client b/pytorch-client index e83e65c..02bc449 100755 --- a/pytorch-client +++ b/pytorch-client @@ -8,11 +8,6 @@ exec 2>&1 model="llama" -pwd=`/bin/pwd` -pushd /opt/app-root/lib/python3.11/site-packages -find . -name METADATA | cpio -pdumv $pwd/instructlab-site-packages-metadata -popd - longopts="" longopts+=" model:" @@ -43,13 +38,19 @@ echo "model: $workflow" git clone https://github.com/pytorch/benchmark.git pushd benchmark pip uninstall -y nvidia-ml-py -pip install pynvml +pip install --force-reinstall -v pynvml==11.5.3 + +pwd=`/bin/pwd` +pushd /opt/app-root/lib/python3.11/site-packages +find . -name METADATA | cpio -pdumv $pwd/instructlab-site-packages-metadata +popd + python run_benchmark.py test_bench --help bench_cmd="python run_benchmark.py test_bench -m $model" echo "About to run: $bench_cmd" date +%s.%N >begin.txt $bench_cmd >run_benchmark_output.txt -date +%s.%N >end.txt rc=$? +date +%s.%N >end.txt popd exit $rc diff --git a/pytorch-post-process b/pytorch-post-process index 02d29a8..f7e5f2a 100755 --- a/pytorch-post-process +++ b/pytorch-post-process @@ -103,10 +103,12 @@ def main(): log_sample(file_id, desc, names, sample) for key in d['metrics'].keys(): - if (re.search("metric=latencies$", key)): + if (re.search("metric=latencies$", key) and re.findall("[0-9]+(?:[.][0-9]+)?", str(d['metrics'][key])) ): desc = {'source' : 'pytorch', 'class': 'count', 'type': 'latency-milliseconds'} sample = {'begin': begin, 'end': end, 'value': d['metrics'][key]} log_sample(file_id, desc, names, sample) + else: + print("skipping: " + key) metric_file_name = finish_samples() period['metric-files'].append(metric_file_name)