[Torch] Fix classification sample metric dumps not in main process (#2146)

### Changes
In the torch classification sample, `write_metrics` and the `mlflow` accuracy
logging are now called only in the main process.
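For context, this relies on a main-process check before touching the metrics dump and mlflow. A minimal standalone sketch of such a check, assuming plain `torch.distributed` (the sample's actual `is_main_process` helper lives in the examples' common code, so this is illustrative only):

```python
import torch.distributed as dist


def is_main_process() -> bool:
    """Treat rank 0, or a non-distributed run, as the main process."""
    if not dist.is_available() or not dist.is_initialized():
        return True
    return dist.get_rank() == 0
```

With a guard like this, only one process writes the metrics dump and logs `best_acc1`, so every consumer of those values sees the same number.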

### Reason for changes

Torch `test_compression_training` (build 106) failed because the metric
collected by `write_metrics` and the metric saved in the checkpoint under the
key `"best_acc1"` were different, since the dump was not restricted to the main process.

### Tests

Torch `test_compression_training`, build ~~109~~ 111
daniil-lyakhov authored Sep 21, 2023
1 parent 28b524c commit 0af613d
Showing 2 changed files with 9 additions and 9 deletions.
14 changes: 7 additions & 7 deletions examples/torch/classification/main.py
@@ -380,14 +380,15 @@ def train(
is_best = is_best_by_accuracy or compression_stage > best_compression_stage
if is_best:
best_acc1 = acc1
config.mlflow.safe_call("log_metric", "best_acc1", best_acc1)
best_compression_stage = max(compression_stage, best_compression_stage)
acc = best_acc1 / 100
if config.metrics_dump is not None:
write_metrics(acc, config.metrics_dump)
if is_main_process():
logger.info(statistics.to_str())

if config.metrics_dump is not None:
acc = best_acc1 / 100
write_metrics(acc, config.metrics_dump)
config.mlflow.safe_call("log_metric", "best_acc1", best_acc1)

checkpoint_path = osp.join(config.checkpoint_save_dir, get_run_name(config) + "_last.pth")
checkpoint = {
"epoch": epoch + 1,
@@ -735,12 +736,11 @@ def validate(val_loader, model, criterion, config, epoch=0, log_validation_info=
config.mlflow.safe_call("log_metric", "val/top1", float(top1.avg), epoch)
config.mlflow.safe_call("log_metric", "val/top5", float(top5.avg), epoch)

if log_validation_info:
logger.info(" * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}\n".format(top1=top1, top5=top5))

if is_main_process() and config.metrics_dump is not None:
acc = top1.avg / 100
if config.metrics_dump is not None:
write_metrics(acc, config.metrics_dump)
write_metrics(acc, config.metrics_dump)

return top1.avg, top5.avg, losses.avg

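Since the flattened view above does not show add/remove markers, here is how the changed regions of `main.py` presumably read after this commit, reconstructed from the lines shown (a sketch: indentation and surrounding context are approximate, not the verbatim file):

```python
# Excerpt reconstructed from the diff: post-change logic in train().
is_best = is_best_by_accuracy or compression_stage > best_compression_stage
if is_best:
    best_acc1 = acc1
best_compression_stage = max(compression_stage, best_compression_stage)
if is_main_process():
    logger.info(statistics.to_str())

    if config.metrics_dump is not None:
        acc = best_acc1 / 100
        write_metrics(acc, config.metrics_dump)
    config.mlflow.safe_call("log_metric", "best_acc1", best_acc1)

# Excerpt reconstructed from the diff: post-change metric dump in validate(),
# after which the function returns top1.avg, top5.avg, losses.avg.
if is_main_process() and config.metrics_dump is not None:
    acc = top1.avg / 100
    write_metrics(acc, config.metrics_dump)
```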
4 changes: 2 additions & 2 deletions tests/torch/test_compression_training.py
@@ -442,7 +442,7 @@ def test_compression_train(self, desc: CompressionTrainingTestDescriptor, tmp_pa
self._validate_train_metric(desc)

@pytest.mark.dependency(depends=["train"])
def test_compression_eval(self, desc: LEGRTrainingTestDescriptor, tmp_path, mocker):
def test_compression_eval(self, desc: CompressionTrainingTestDescriptor, tmp_path, mocker):
validator = desc.get_validator()
args = validator.get_default_args(tmp_path)
metric_file_path = self._add_args_for_eval(args, desc, tmp_path)
@@ -497,7 +497,7 @@ def test_compression_nas_eval(self, nas_desc: NASTrainingTestDescriptor, tmp_pat
self._validate_eval_metric(nas_desc, metric_file_path)

@staticmethod
def _validate_eval_metric(desc, metric_file_path):
def _validate_eval_metric(desc: CompressionTrainingTestDescriptor, metric_file_path):
with open(str(metric_file_path), encoding="utf8") as metric_file:
metrics = json.load(metric_file)
ref_metric = metrics["Accuracy"]
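`_validate_eval_metric` reads the metrics dump as JSON and looks up the `"Accuracy"` key. For reference, a minimal sketch of a writer compatible with that format, assuming only what the test implies (the sample's real `write_metrics` helper lives in the examples' common code):

```python
import json


def write_metrics(acc: float, metrics_dump_path: str) -> None:
    # Dump the layout the test reads back: {"Accuracy": <accuracy value>}.
    with open(metrics_dump_path, "w", encoding="utf8") as metrics_file:
        json.dump({"Accuracy": acc}, metrics_file)
```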
