perftool-incubator · atheurer · Dec 3, 2024 · Dec 3, 2024
diff --git a/pytorch-client b/pytorch-client
@@ -8,11 +8,6 @@ exec 2>&1
 
 model="llama"
 
-pwd=`/bin/pwd`
-pushd /opt/app-root/lib/python3.11/site-packages
-find . -name METADATA | cpio -pdumv $pwd/instructlab-site-packages-metadata
-popd
-
 longopts=""
 longopts+=" model:"
 
@@ -43,13 +38,19 @@ echo "model: $workflow"
 git clone https://github.com/pytorch/benchmark.git
 pushd benchmark
 pip uninstall -y nvidia-ml-py
-pip install pynvml
+pip install --force-reinstall -v pynvml==11.5.3
+
+pwd=`/bin/pwd`
+pushd /opt/app-root/lib/python3.11/site-packages
+find . -name METADATA | cpio -pdumv $pwd/instructlab-site-packages-metadata
+popd
+
 python run_benchmark.py test_bench --help
 bench_cmd="python run_benchmark.py test_bench -m $model"
 echo "About to run: $bench_cmd"
 date +%s.%N >begin.txt
 $bench_cmd >run_benchmark_output.txt
-date +%s.%N >end.txt
 rc=$?
+date +%s.%N >end.txt
 popd
 exit $rc
diff --git a/pytorch-post-process b/pytorch-post-process
@@ -103,10 +103,12 @@ def main():
         log_sample(file_id, desc, names, sample)
 
         for key in d['metrics'].keys():
-            if (re.search("metric=latencies$", key)):
+            if (re.search("metric=latencies$", key) and re.findall("[0-9]+(?:[.][0-9]+)?", str(d['metrics'][key])) ):
                 desc = {'source' : 'pytorch', 'class': 'count', 'type': 'latency-milliseconds'}
                 sample = {'begin': begin, 'end': end, 'value': d['metrics'][key]}
                 log_sample(file_id, desc, names, sample)
+            else:
+                print("skipping: " + key)
 
         metric_file_name = finish_samples()
         period['metric-files'].append(metric_file_name)