Revert grad scale optimization pr (#50839)
* Revert "fixoptminizer _set_auxiliary_var bug (#50335)"

This reverts commit c44005f.

* Revert "refine optimizer create accumulators (#50188)"

This reverts commit 244e754.

* Revert "fix found_inf bug for custom optimizer (#50158)"

This reverts commit 64573f9.

* Revert "refine amp scaler found_inf (#49864)"

This reverts commit 382e9a0.

* fix code format

* fix conflict
veyron95 authored Feb 24, 2023
1 parent 09694f8 commit 8a50352
Showing 19 changed files with 101 additions and 212 deletions.
52 changes: 15 additions & 37 deletions python/paddle/amp/grad_scaler.py
@@ -18,7 +18,7 @@

import numpy as np

from paddle import _C_ops, _legacy_C_ops
from paddle import _legacy_C_ops
from paddle.fluid import core, in_dygraph_mode
from paddle.fluid.data_feeder import check_type
from paddle.fluid.dygraph import to_variable
@@ -228,16 +228,11 @@ def minimize(self, optimizer, *args, **kwargs):

optimize_ops, params_grads = (None, None)

if hasattr(optimizer, "_set_auxiliary_var"):
optimizer._set_auxiliary_var('found_inf', self._found_inf)
optimize_ops, params_grads = optimizer.minimize(*args, **kwargs)
self._cache_founf_inf = optimizer._get_auxiliary_var('found_inf')
if self._found_inf:
self._cache_founf_inf = True
else:
if self._found_inf:
self._cache_founf_inf = True
else:
optimize_ops, params_grads = optimizer.minimize(*args, **kwargs)
self._cache_founf_inf = False
optimize_ops, params_grads = optimizer.minimize(*args, **kwargs)
self._cache_founf_inf = False

if self._use_dynamic_loss_scaling:
# update the scale
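For context, a minimal sketch of the control flow this hunk restores in `GradScaler.minimize` (illustrative names, not the actual Paddle internals): when unscaling found an inf/nan, the inner optimizer call is skipped outright and the cached flag records the overflow.

```python
# Hedged sketch of the restored minimize() flow; `found_inf` is a plain bool
# stand-in for the scaler's internal flag.
def minimize_sketch(optimizer, found_inf, *args, **kwargs):
    optimize_ops, params_grads = None, None
    if found_inf:
        # Overflow detected during unscale: skip the parameter update.
        cache_found_inf = True
    else:
        optimize_ops, params_grads = optimizer.minimize(*args, **kwargs)
        cache_found_inf = False
    return optimize_ops, params_grads, cache_found_inf
```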
@@ -335,9 +330,6 @@ def _unscale(self, optimizer):
param_grads_fp16,
self._temp_found_inf_fp16,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, self._temp_found_inf_fp16
)
if len(param_grads_bf16):
_legacy_C_ops.check_finite_and_unscale(
param_grads_bf16,
@@ -346,9 +338,6 @@ def _unscale(self, optimizer):
param_grads_bf16,
self._temp_found_inf_bf16,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, self._temp_found_inf_bf16
)
if len(param_grads_fp32):
_legacy_C_ops.check_finite_and_unscale(
param_grads_fp32,
@@ -357,9 +346,6 @@ def _unscale(self, optimizer):
param_grads_fp32,
self._temp_found_inf_fp32,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, self._temp_found_inf_fp32
)
else:
if len(param_grads_fp16):
_legacy_C_ops.check_finite_and_unscale(
@@ -368,29 +354,26 @@ def _unscale(self, optimizer):
param_grads_fp16,
self._temp_found_inf_fp16,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, self._temp_found_inf_fp16
)
if len(param_grads_bf16):
_legacy_C_ops.check_finite_and_unscale(
param_grads_bf16,
self._scale,
param_grads_bf16,
self._temp_found_inf_bf16,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, self._temp_found_inf_bf16
)
if len(param_grads_fp32):
_legacy_C_ops.check_finite_and_unscale(
param_grads_fp32,
self._scale,
param_grads_fp32,
self._temp_found_inf_fp32,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, self._temp_found_inf_fp32
)

self._found_inf = (
self._temp_found_inf_fp16
or self._temp_found_inf_bf16
or self._temp_found_inf_fp32
)

optimizer_state["state"] = OptimizerState.UNSCALED
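The `_unscale` hunks above drop the tensor-level `_C_ops.bitwise_or` accumulation; the restored code combines the per-dtype overflow flags with a plain Python `or`. A small, self-contained illustration of that combination step (the tensors below are stand-ins for the flags written by `check_finite_and_unscale`):

```python
import paddle

# Stand-ins for the one-element boolean flags filled per precision bucket.
temp_found_inf_fp16 = paddle.to_tensor([False])
temp_found_inf_bf16 = paddle.to_tensor([False])
temp_found_inf_fp32 = paddle.to_tensor([True])

# Restored behaviour: a Python `or` over the one-element tensors; any bucket
# that overflowed marks the whole step as overflowed.
found_inf = temp_found_inf_fp16 or temp_found_inf_bf16 or temp_found_inf_fp32
print(bool(found_inf))  # True
```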

@@ -778,16 +761,11 @@ def step(self, optimizer):
if optimizer_state["state"] is OptimizerState.INIT:
self._unscale(optimizer)

if hasattr(optimizer, "_set_auxiliary_var"):
optimizer._set_auxiliary_var('found_inf', self._found_inf)
optimizer.step()
self._cache_founf_inf = optimizer._get_auxiliary_var('found_inf')
if self._found_inf:
self._cache_founf_inf = True
else:
if self._found_inf:
self._cache_founf_inf = True
else:
optimizer.step()
self._cache_founf_inf = False
optimizer.step()
self._cache_founf_inf = False

optimizer_state["state"] = OptimizerState.STEPPED
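The grad_scaler.py hunks only touch the scaler's internals; the public API is unchanged by the revert. For reference, a typical dynamic-loss-scaling step with `paddle.amp.GradScaler` (standard usage, not taken from this diff):

```python
import paddle

model = paddle.nn.Linear(10, 1)
optimizer = paddle.optimizer.SGD(
    learning_rate=0.01, parameters=model.parameters()
)
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

data = paddle.rand([4, 10])
with paddle.amp.auto_cast():
    loss = model(data).mean()

scaled = scaler.scale(loss)  # scale the loss before backward
scaled.backward()
scaler.step(optimizer)       # unscale grads; skip the step if inf/nan was found
scaler.update()              # adjust the loss scaling for the next iteration
optimizer.clear_grad()
```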

@@ -236,10 +236,6 @@ def _get_input_varlist(self, program):
ret_list.append(var)
return ret_list

def _set_auxiliary_var(self, key, val):
super()._set_auxiliary_var(key, val)
self.inner_opt._set_auxiliary_var(key, val)

def minimize(
self,
loss,
@@ -41,16 +41,11 @@ def minimize(self, optimizer, *args, **kwargs):

optimize_ops, params_grads = (None, None)

if hasattr(optimizer, "_set_auxiliary_var"):
optimizer._set_auxiliary_var('found_inf', self._found_inf)
optimize_ops, params_grads = optimizer.minimize(*args, **kwargs)
self._cache_founf_inf = optimizer._get_auxiliary_var('found_inf')
if self._found_inf:
self._cache_founf_inf = True
else:
if self._found_inf:
self._cache_founf_inf = True
else:
optimize_ops, params_grads = optimizer.minimize(*args, **kwargs)
self._cache_founf_inf = False
optimize_ops, params_grads = optimizer.minimize(*args, **kwargs)
self._cache_founf_inf = False

if self._use_dynamic_loss_scaling:
self._update()
@@ -25,10 +25,6 @@ def __init__(self, optimizer):
self.meta_optimizers_white_list = []
self.meta_optimizers_black_list = []

def _set_auxiliary_var(self, key, val):
super()._set_auxiliary_var(key, val)
self.inner_opt._set_auxiliary_var(key, val)

def _set_basic_info(
self, loss, role_maker, user_defined_optimizer, user_defined_strategy
):
@@ -203,10 +203,6 @@ def __init__(
# Update optimizer parameters and adjust parameter storage and use according to rank.
self._update_opt_status()

def _set_auxiliary_var(self, key, val):
super()._set_auxiliary_var(key, val)
self._optim._set_auxiliary_var(key, val)

@paddle.autograd.no_grad()
def _sync_params_and_buffers(self):
"""
@@ -19,10 +19,10 @@
import numpy as np

import paddle
from paddle import _C_ops, _legacy_C_ops
from paddle import _legacy_C_ops
from paddle.common_ops_import import dygraph_only
from paddle.fluid import core
from paddle.fluid.dygraph import to_variable
from paddle.framework import core
from paddle.nn import clip


@@ -270,37 +270,35 @@ def unscale_method(self, optimizer):
param_grads_bfp16,
temp_found_inf_bfp16,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, temp_found_inf_bfp16
)
if len(param_grads_fp16):
_legacy_C_ops.check_finite_and_unscale(
param_grads_fp16,
self._scale,
param_grads_fp16,
temp_found_inf_fp16,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, temp_found_inf_fp16
)
if len(param_grads_fp32):
_legacy_C_ops.check_finite_and_unscale(
param_grads_fp32,
self._scale,
param_grads_fp32,
temp_found_inf_fp32,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, temp_found_inf_fp32
)

self._found_inf = self._found_inf.cast("int32")
self._found_inf = (
1
if temp_found_inf_bfp16
or temp_found_inf_fp16
or temp_found_inf_fp32
else 0
)
is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32")

paddle.distributed.all_reduce(
self._found_inf, op=paddle.distributed.ReduceOp.MAX, group=None
is_found_inf, op=paddle.distributed.ReduceOp.SUM, group=None
)

self._found_inf = self._found_inf.cast("bool")
self._found_inf = is_found_inf.numpy()[0]

scaler._unscale = MethodType(unscale_method, scaler)
return scaler
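The restored `unscale_method` materializes the local overflow flag as an int32 tensor and all-reduces it so every rank agrees on whether to skip the step. A hedged, minimal sketch of that synchronization (assumes `paddle.distributed.init_parallel_env()` has been called; the helper name is illustrative):

```python
import paddle
import paddle.distributed as dist

def sync_found_inf(local_found_inf: bool) -> bool:
    # Share a per-rank overflow flag across the process group: if any rank
    # overflowed, the reduced value is non-zero and every rank skips the step.
    flag = paddle.to_tensor([1 if local_found_inf else 0], dtype="int32")
    # This hunk reduces with SUM; the fleet scaler below uses MAX. Either way,
    # a non-zero result means at least one rank saw inf/nan.
    dist.all_reduce(flag, op=dist.ReduceOp.SUM)
    return bool(flag.numpy()[0])
```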
15 changes: 5 additions & 10 deletions python/paddle/distributed/fleet/scaler.py
@@ -17,7 +17,7 @@
import numpy as np

import paddle
from paddle import _C_ops, _legacy_C_ops
from paddle import _legacy_C_ops
from paddle.distributed import fleet
from paddle.fluid.dygraph import to_variable
from paddle.framework import core
@@ -73,29 +73,24 @@ def unscale_method(self, optimizer):
param_grads_fp16,
temp_found_inf_fp16,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, temp_found_inf_fp16
)
if len(param_grads_fp32):
_legacy_C_ops.check_finite_and_unscale(
param_grads_fp32,
self._scale,
param_grads_fp32,
temp_found_inf_fp32,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, temp_found_inf_fp32
)

self._found_inf = self._found_inf.cast("int32")
self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0
is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32")

# TODO(shenliang03) Since dp allreduce in the optimizer is
# after the gradscaler, check_finite needs to synchronize global
# information. In the future, we should use check_group to speed.
paddle.distributed.all_reduce(
self._found_inf, op=paddle.distributed.ReduceOp.MAX, group=None
is_found_inf, op=paddle.distributed.ReduceOp.MAX, group=None
)
self._found_inf = self._found_inf.cast("bool")
self._found_inf = is_found_inf.numpy()[0]

# Only data_parallel doesn't need to modify scaler
fleet_env = fleet.fleet
18 changes: 6 additions & 12 deletions python/paddle/fluid/optimizer.py
@@ -19,6 +19,7 @@

import paddle


from paddle.fluid.framework import (
Program,
Variable,
@@ -899,18 +900,11 @@ def _create_optimization_pass(self, parameters_and_grads):
self._create_global_learning_rate()

if in_dygraph_mode():
found_inf = self._get_auxiliary_var('found_inf')
if found_inf:
if isinstance(found_inf, core.eager.Tensor):
self._set_auxiliary_var('found_inf', True)
else:
if isinstance(found_inf, core.eager.Tensor):
self._set_auxiliary_var('found_inf', False)
for param_and_grad in parameters_and_grads:
if param_and_grad[1] is None:
continue
if param_and_grad[0].trainable is True:
self._append_optimize_op(target_block, param_and_grad)
for param_and_grad in parameters_and_grads:
if param_and_grad[1] is None:
continue
if param_and_grad[0].trainable is True:
self._append_optimize_op(target_block, param_and_grad)
else:
for param_and_grad in parameters_and_grads:
if param_and_grad[1] is None:
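With the auxiliary-variable plumbing reverted, the dygraph branch of `_create_optimization_pass` no longer inspects a `found_inf` value; it simply walks the parameter/gradient pairs and appends an optimize op for each trainable parameter, leaving overflow handling to the scaler. A simplified sketch of that restored loop (illustrative, not the full Paddle implementation):

```python
def create_optimization_pass_sketch(optimizer, target_block, parameters_and_grads):
    # Restored dygraph path: no found_inf short-circuit here; the GradScaler
    # decides whether the optimizer step runs at all.
    for param, grad in parameters_and_grads:
        if grad is None:
            continue
        if param.trainable:
            optimizer._append_optimize_op(target_block, (param, grad))
```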
4 changes: 0 additions & 4 deletions python/paddle/incubate/optimizer/lookahead.py
@@ -144,10 +144,6 @@ def __init__(self, inner_optimizer, alpha=0.5, k=5, name=None):
self._global_step_var = None
self._k_var = None

def _set_auxiliary_var(self, key, val):
super()._set_auxiliary_var(key, val)
self.inner_optimizer._set_auxiliary_var(key, val)

@framework.dygraph_only
@imperative_base.no_grad
def step(self):
3 changes: 0 additions & 3 deletions python/paddle/optimizer/adadelta.py
@@ -145,11 +145,8 @@ def _create_accumulators(self, block, parameters):
parameters = parameters.get('params')

for p in parameters:
if p.name in self._already_create_accumulater:
continue
self._add_accumulator(self._avg_squared_grad_acc_str, p)
self._add_accumulator(self._avg_squared_update_acc_str, p)
self._already_create_accumulater.add(p.name)

def _append_optimize_op(self, block, param_and_grad):
if isinstance(param_and_grad, dict):
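This adadelta.py hunk (and the adagrad.py one below) removes the `_already_create_accumulater` guard, so `_create_accumulators` registers the per-parameter state unconditionally again. A minimal sketch of the restored pattern (accumulator names follow the diff; the wrapper function itself is illustrative):

```python
def create_accumulators_sketch(optimizer, block, parameters):
    # Restored behaviour: add both Adadelta accumulators for every parameter,
    # without tracking which parameters were already handled.
    if isinstance(parameters, dict):
        parameters = parameters.get('params')
    for p in parameters:
        optimizer._add_accumulator(optimizer._avg_squared_grad_acc_str, p)
        optimizer._add_accumulator(optimizer._avg_squared_update_acc_str, p)
```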
3 changes: 0 additions & 3 deletions python/paddle/optimizer/adagrad.py
@@ -139,14 +139,11 @@ def _create_accumulators(self, block, parameters):
parameters = self._update_param_group(parameters)

for p in parameters:
if p.name in self._already_create_accumulater:
continue
self._add_accumulator(
self._moment_acc_str,
p,
fill_value=self.initial_accumulator_value,
)
self._already_create_accumulater.add(p.name)

def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block)