add flag to use old adam, needed for backwards compatibility
Summary: Old models that were trained with FairseqAdam using Adam as the internal optimizer can no longer be loaded from checkpoints, because FairseqAdam now always selects FusedAdamV1 when it is available. FusedAdamV1 and Adam are not compatible, so none of those old checkpoints are loadable with the current code. This fix adds a flag that forces Adam to be used instead of FusedAdamV1. The flag will be deleted once all models have moved to FusedAdamV1.
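Usage note (not from the original commit message): in practice, resuming from one of these old checkpoints should only require adding --use-old-adam to a run that already uses --optimizer adam; no other command-line changes should be needed.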

Reviewed By: myleott

Differential Revision: D19358962

fbshipit-source-id: a0af5d50588dc108339a77736dcc8ff5db314dd0
Alex Xiao authored and facebook-github-bot committed Jan 11, 2020
1 parent fe6c2ed commit 0ce722d
Showing 1 changed file with 9 additions and 1 deletion.
fairseq/optim/adam.py: 9 additions & 1 deletion
@@ -26,7 +26,7 @@ class FairseqAdam(FairseqOptimizer):
     def __init__(self, args, params):
         super().__init__(args)
         fused_adam_cls = get_fused_adam_class()
-        if fused_adam_cls is not None and torch.cuda.is_available():
+        if not args.use_old_adam and fused_adam_cls is not None and torch.cuda.is_available():
             print('| using FusedAdam')
             self._optimizer = fused_adam_cls(params, **self.optimizer_config)
         else:
@@ -42,6 +42,14 @@ def add_args(parser):
                             help='epsilon for Adam optimizer')
         parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
                             help='weight decay')
+        # Maintain backward compatibility with old checkpoints that have stored
+        # optimizer state as fairseq.optim.adam.Adam.
+        parser.add_argument(
+            "--use-old-adam",
+            action='store_true',
+            default=False,
+            help="Use fairseq.optim.adam.Adam",
+        )
         # fmt: on
 
     @property
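For quick reference, below is a minimal, self-contained sketch of the optimizer selection behavior after this change. The standalone helper, its name, and the import paths are assumptions for illustration only; in the actual code the same branch lives inside FairseqAdam.__init__ as shown in the diff above.

import torch
from fairseq.optim.adam import Adam                         # pure-PyTorch Adam; its state matches old checkpoints
from fairseq.optim.fused_adam import get_fused_adam_class   # returns None when the fused kernel is unavailable

def build_internal_adam(args, params, optimizer_config):
    """Illustrative only: choose the internal optimizer the way FairseqAdam does after this change."""
    fused_adam_cls = get_fused_adam_class()
    use_fused = (
        not getattr(args, 'use_old_adam', False)  # the new flag forces the old Adam for old checkpoints
        and fused_adam_cls is not None
        and torch.cuda.is_available()
    )
    if use_fused:
        print('| using FusedAdam')
        return fused_adam_cls(params, **optimizer_config)
    # Old behavior: fairseq.optim.adam.Adam, whose optimizer state old checkpoints expect.
    return Adam(params, **optimizer_config)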
