From 16e7c269d062c8d16c4d4ff70cc80fd87935dc95 Mon Sep 17 00:00:00 2001
From: "yizhi.chen"
Date: Mon, 6 Jul 2020 11:34:03 +0800
Subject: [PATCH] Add loss multiplication to preserve the single-process
 performance

---
 train.py             | 15 +++++++++------
 utils/torch_utils.py |  4 ++--
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/train.py b/train.py
index b7d202d2fbe4..07a31abd2405 100644
--- a/train.py
+++ b/train.py
@@ -19,6 +19,7 @@
 try:  # Mixed precision training https://github.com/NVIDIA/apex
     from apex import amp
 except:
+    # from apex.parallel import DistributedDataParallel as DDP
     print('Apex recommended for faster mixed precision training: https://github.com/NVIDIA/apex')
     mixed_precision = False  # not installed
 
@@ -120,7 +121,8 @@ def train(hyp, tb_writer, opt, device):
     del pg0, pg1, pg2
 
     # Load Model
-    if opt.local_rank in [-1, 0]:
+    # Avoid multiple downloads.
+    with torch_distributed_zero_first(opt.local_rank):
         google_utils.attempt_download(weights)
     start_epoch, best_fitness = 0, 0.0
     if weights.endswith('.pt'):  # pytorch format
@@ -274,6 +276,9 @@ def train(hyp, tb_writer, opt, device):
 
             # Loss
             loss, loss_items = compute_loss(pred, targets.to(device), model)
+            # loss is scaled with batch size in compute_loss(), but in DDP mode gradients are averaged between devices.
+            if opt.local_rank != -1:
+                loss *= torch.distributed.get_world_size()
             if not torch.isfinite(loss):
                 print('WARNING: non-finite loss, ending training ', loss_items)
                 return results
@@ -293,9 +298,9 @@ def train(hyp, tb_writer, opt, device):
 
                 ema.update(model)
 
             # Print
-            mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
-            if opt.local_rank in [-1, 0]:
+            # TODO: all_reduce mloss if in DDP mode.
+            mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
             mem = '%.3gG' % (torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0)  # (GB)
             s = ('%10s' * 2 + '%10.4g' * 6) % (
                 '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1])
@@ -319,7 +324,7 @@ def train(hyp, tb_writer, opt, device):
         final_epoch = epoch + 1 == epochs
         if not opt.notest or final_epoch:  # Calculate mAP
             results, maps, times = test.test(opt.data,
-                                             batch_size=total_batch_size,
+                                             batch_size=batch_size,
                                              imgsz=imgsz_test,
                                              save_json=final_epoch and opt.data.endswith(os.sep + 'coco.yaml'),
                                              model=ema.ema.module if hasattr(ema.ema, 'module') else ema.ema,
@@ -377,8 +382,6 @@ def train(hyp, tb_writer, opt, device):
     if not opt.evolve:
         plot_results()  # save as results.png
     print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))
 
-    if opt.local_rank == 0:
-        dist.destroy_process_group()
     torch.cuda.empty_cache()
     return results

diff --git a/utils/torch_utils.py b/utils/torch_utils.py
index b9c1ad6155c5..6569ff62af33 100644
--- a/utils/torch_utils.py
+++ b/utils/torch_utils.py
@@ -200,6 +200,6 @@ def update(self, model):
     def update_attr(self, model):
         # Assign attributes (which may change during training)
         for k in model.__dict__.keys():
-            if not k.startswith('_') and (k != 'module' or not isinstance(getattr(model, k),
-                                                                          (torch.distributed.ProcessGroupNCCL, torch.distributed.Reducer))):
+            if not (k.startswith('_') or k == 'module' or
+                    isinstance(getattr(model, k), (torch.distributed.ProcessGroupNCCL, torch.distributed.Reducer))):
                 setattr(self.ema, k, getattr(model, k))
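
Note: two of the changes above lean on DDP behaviour that is easy to miss, so
short Python sketches follow. Both assume torch.distributed has already been
initialized via init_process_group; scale_loss_for_ddp is a hypothetical
helper name used only for illustration, not code from the patch.

compute_loss() scales the loss with the per-device batch size, while
DistributedDataParallel averages gradients across processes, so without the
multiplication each sample's gradient contribution shrinks by a factor of
world_size:

    import torch
    import torch.distributed as dist

    def scale_loss_for_ddp(loss: torch.Tensor, local_rank: int) -> torch.Tensor:
        # DDP all-reduces gradients with a mean: grad = (1/N) * sum_i grad_i.
        # Multiplying the loss by N = world_size restores the single-process
        # gradient magnitude, i.e. an effective sum over the global batch.
        if local_rank != -1:  # local_rank == -1 means DDP is not in use
            loss = loss * dist.get_world_size()
        return loss

torch_distributed_zero_first() is called in train.py but not defined in this
diff; in the YOLOv5 utilities it is a context manager along these lines, which
parks every non-zero rank at a barrier until rank 0 has finished the guarded
work (here, the weights download), so the file is fetched exactly once:

    from contextlib import contextmanager

    import torch.distributed as dist

    @contextmanager
    def torch_distributed_zero_first(local_rank: int):
        # Ranks other than -1 (no DDP) and 0 wait here until rank 0 has run
        # the body of the with-block and reached its own barrier below.
        if local_rank not in [-1, 0]:
            dist.barrier()
        yield
        if local_rank == 0:
            dist.barrier()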