Lightning-AI · Borda · Jan 3, 2023 · Dec 12, 2022 · Dec 12, 2022 · Dec 12, 2022
@@ -91,6 +91,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Enhanced `reduce_boolean_decision` to accommodate `any`-analogous semantics expected by the `EarlyStopping` callback ([#15253](https://github.com/Lightning-AI/lightning/pull/15253))
 
 
+- Fixed the incorrect optimizer step synchronization when running across multiple TPU devices ([#16020](https://github.com/Lightning-AI/lightning/pull/16020))
+
+
 ## [1.8.4] - 2022-12-08
 
 ### Changed

@@ -29,6 +29,13 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
             raise ModuleNotFoundError(str(_XLA_AVAILABLE))
         super().__init__(*args, **kwargs)
 
+    def _tpu_wrap_closure(self, optimizer: Optimizable, closure: Callable[[], Any]) -> Any:  # runs `closure`, reduces gradients for `optimizer`, returns the closure's result
+        import torch_xla.core.xla_model as xm  # imported inside the method rather than at module level
+
+        closure_result = closure()  # run the wrapped closure first and keep whatever it returns
+        xm.reduce_gradients(optimizer)  # reduce gradients only after the closure has run; NOTE(review): presumably an all-reduce across TPU devices - confirm against torch_xla docs
+        return closure_result  # the closure's own result is passed back unchanged
+
     def optimizer_step(  # type: ignore[override]
         self,
         optimizer: Optimizable,
@@ -39,8 +46,10 @@ def optimizer_step(  # type: ignore[override]
     ) -> Any:
         import torch_xla.core.xla_model as xm
 
+        closure = partial(self._tpu_wrap_closure, optimizer, closure)
         closure = partial(self._wrap_closure, model, optimizer, optimizer_idx, closure)
-        closure_result = xm.optimizer_step(optimizer, optimizer_args={"closure": closure, **kwargs})
+        closure_result = optimizer.step(closure=closure, **kwargs)
+        xm.mark_step()
         skipped_backward = closure_result is None
         # in manual optimization, the closure does not return a value
         if model.automatic_optimization and skipped_backward: