[DoubleGrad] Enabled double grad test cases in eager_mode for test_imperative_double_grad (#41451)

* [DoubleGrad] Enabled double grad test cases in eager_mode for test_imperative_double_grad

* Fixed elementwise issue

* Addressed CI failures
jim19930609 authored Apr 12, 2022
1 parent c448032 commit 0b4c3c2
Showing 11 changed files with 152 additions and 80 deletions.
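For context, the double-grad cases this commit enables run second-order differentiation under the eager (final-state) dygraph mode. A minimal standalone sketch of the pattern the test suite exercises, assuming a Paddle 2.x install with eager mode active (illustrative, not code from this commit):

```python
import numpy as np
import paddle

# A first-order gradient computed with create_graph=True stays on the graph,
# so a loss built from it yields a second-order gradient for x.
x = paddle.to_tensor(np.random.rand(4, 4).astype('float32'), stop_gradient=False)
y = paddle.nn.functional.relu(x)
dx, = paddle.grad([y], [x], create_graph=True)   # d(relu(x))/dx, still differentiable
loss = paddle.mean(dx * dx + x * x)
loss.backward()                                  # double grad accumulates into x.grad
```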
@@ -23,7 +23,7 @@
########################
ops_to_fill_zero_for_empty_grads = set([
"split_grad", "rnn_grad", "matmul_double_grad", "matmul_triple_grad",
"sigmoid_triple_grad"
"sigmoid_triple_grad, add_double_grad"
])

# For API dispatch used at python-level
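The set above tells the eager code generator which backward ops should have absent higher-order grad inputs materialized as zero tensors rather than passed through undefined. A rough Python rendering of that intent (an illustration only, not the generated C++; the helper name is made up):

```python
import paddle

def fill_zero_if_empty(grad, reference):
    # For ops listed in ops_to_fill_zero_for_empty_grads, an empty incoming
    # grad slot is replaced by zeros shaped like the corresponding input.
    return grad if grad is not None else paddle.zeros_like(reference)
```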
@@ -205,6 +205,7 @@ class {} : public egr::GradNodeBase {{
#endif
}}
// Forward API Call
VLOG(3) << \"Final State Running: \" << \"{}\";
{}
// Get Outputs
{}
@@ -505,15 +506,11 @@ def ForwardsValidationCheck(self):

for i in range(len(forward_attrs_list)):
orig_attr_type = orig_forward_attrs_list[i][1]
orig_attr_default = orig_forward_attrs_list[i][2]
orig_attr_pos = orig_forward_attrs_list[i][3]
forward_attr_type = forward_attrs_list[i][1]
forward_attr_default = forward_attrs_list[i][2]
forward_attr_pos = forward_attrs_list[i][3]
assert orig_attr_type == forward_attr_type, AssertMessage(
orig_attr_type, forward_attr_type)
assert orig_attr_default == forward_attr_default, AssertMessage(
orig_attr_default, forward_attr_default)
assert orig_attr_pos == forward_attr_pos, AssertMessage(
orig_attr_pos, forward_attr_pos)

@@ -753,6 +750,15 @@ def GenerateNodeCreationCodes(self):
set_grad_out_meta_list = []
set_edges_list = []
for name, (_, pos) in forward_inputs_position_map.items():
# Has corresponding grad output
has_corresponding_grad_output = False
for _, (_, corresponding_pos,
_) in backward_grad_outputs_map.items():
if pos == corresponding_pos:
has_corresponding_grad_output = True
if not has_corresponding_grad_output:
continue

input_autograd_meta_name = GetAutoGradMetaName(name)
is_optional = (name in self.optional_inputs)
if is_optional:
@@ -1063,9 +1069,10 @@ def GenerateForwardDefinition(self, is_inplaced):
self.forward_definition_str += FORWARD_FUNCTION_TEMPLATE.format(
returns_type_str, forward_function_name, inputs_args_definition_str,
dygraph_event_str, amp_logic_str, inputs_autograd_meta_str,
forward_call_str, get_outputs_str, outputs_autograd_meta_str,
compute_require_grad_args_str, check_inplace_str,
bump_inplace_version_str, node_creation_str, returns_str)
forward_function_name, forward_call_str, get_outputs_str,
outputs_autograd_meta_str, compute_require_grad_args_str,
check_inplace_str, bump_inplace_version_str, node_creation_str,
returns_str)
self.forward_declaration_str += f"{returns_type_str} {forward_function_name}({inputs_args_declaration_str});\n"

logging.info(
@@ -1439,28 +1446,18 @@ def GenerateNodeDefinition(self, grad_node_creation_str):
compute_require_grad_str += f"{indent}bool require_any_grad = egr::EagerUtils::ComputeRequireGrad({compute_require_grad_args_str});"

# Construct grad_api returns
num_bwd_outputs = len(backward_grad_outputs_map.keys())
slot_num_bwd_outputs = len(self.forward_inputs_position_map.keys())
returns_str = f"{indent}std::vector<std::vector<paddle::experimental::Tensor>> returns({slot_num_bwd_outputs});\n"
for name, (ttype, fwd_position,
grad_api_position) in backward_grad_outputs_map.items():
transformed_tensor_name = self.TransformToNextGradName(name)

# Infer Grad API Return Type
if num_bwd_outputs == 1:
# Single tensor output, return as is
if IsPlainTensorType(ttype):
returns_str += f"{indent}returns[0] = {{ {transformed_tensor_name} }};\n"
else:
assert IsVectorTensorType(ttype)
returns_str += f"{indent}returns[0] = {transformed_tensor_name};\n"
# Rearrange output order accordingly
if IsPlainTensorType(ttype):
returns_str += f"{indent}returns[{fwd_position}] = {{ {transformed_tensor_name} }};\n"
else:
# Rearrange output order accordingly
if IsPlainTensorType(ttype):
returns_str += f"{indent}returns[{fwd_position}] = {{ {transformed_tensor_name} }};\n"
else:
assert IsVectorTensorType(ttype)
returns_str += f"{indent}returns[{fwd_position}] = {transformed_tensor_name};\n"
assert IsVectorTensorType(ttype)
returns_str += f"{indent}returns[{fwd_position}] = {transformed_tensor_name};\n"

returns_str += f"{indent}if(NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns);\n"
returns_str += f"{indent}return returns;\n"
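The GenerateNodeDefinition hunk above removes the single-output special case: `returns` is now always sized by the number of forward-input slots, and each grad output is written at its forward-input position. A small Python model of that slot layout (hypothetical data, mirroring the generator logic):

```python
# (type, fwd_position, grad_api_position), as in backward_grad_outputs_map
backward_grad_outputs_map = {
    "grad_x": ("Tensor", 0, 0),
    "grad_y": ("Tensor", 1, 1),
}
num_fwd_input_slots = 2

returns = [[] for _ in range(num_fwd_input_slots)]
for name, (ttype, fwd_pos, _) in backward_grad_outputs_map.items():
    # A plain tensor fills its slot as a single-element list; a vector of
    # tensors would be assigned to the slot as-is.
    returns[fwd_pos] = [name]
```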
21 changes: 12 additions & 9 deletions paddle/fluid/eager/backward.cc
@@ -485,6 +485,7 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap(
}
}
}

return node_in_degree_map;
}

@@ -526,6 +527,7 @@ std::vector<paddle::experimental::Tensor> RunBackward(
bool allow_unused = false,
const std::vector<paddle::experimental::Tensor>& no_grad_vars = {}) {
VLOG(6) << "Start Backward";

// *Gradient Hook should happen at node-level
// *Inplace version check should perform at node-level
// *Cross-batch accumulation happens at forward pass
@@ -729,6 +731,16 @@ std::vector<paddle::experimental::Tensor> RunBackward(
continue;
}

auto* next_node = next_node_shared.get();
if (!node_input_buffers_dict.count(next_node)) {
const auto& input_meta = next_node->InputMeta();
auto grad_tensor_holder =
std::make_unique<GradTensorHolder>(input_meta);
VLOG(6) << "Construct GradTensorHolder for grad node: "
<< next_node->name();
node_input_buffers_dict[next_node] = std::move(grad_tensor_holder);
}

PADDLE_ENFORCE_LT(
j, grad_output_tensors[i].size(),
paddle::platform::errors::Fatal(
@@ -748,15 +760,6 @@
<< ", rank: " << j
<< " 's name is: " << grad_output_tensor.name();

auto* next_node = next_node_shared.get();
if (!node_input_buffers_dict.count(next_node)) {
const auto& input_meta = next_node->InputMeta();
auto grad_tensor_holder =
std::make_unique<GradTensorHolder>(input_meta);
VLOG(6) << "Construct GradTensorHolder for grad node: "
<< next_node->name();
node_input_buffers_dict[next_node] = std::move(grad_tensor_holder);
}
VLOG(6) << "Sum grad inputs for edge slot: " << edge_rank.first
<< ", rank: " << edge_rank.second;
node_input_buffers_dict[next_node]->add(
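In RunBackward the GradTensorHolder construction for the next node now happens before the per-tensor checks, so every reachable grad node gets an input buffer even when one of its incoming grad slots is empty. A Python-style sketch of the reordered flow (the names and the stub class are illustrative, not Paddle API):

```python
class GradTensorHolderStub:
    # Dummy stand-in for the C++ GradTensorHolder, only to make the sketch runnable.
    def __init__(self, input_meta):
        self.slots = {}

    def add(self, slot, rank, tensor):
        self.slots.setdefault((slot, rank), []).append(tensor)

def route_grad(next_node, node_input_buffers, grad_output_tensor, edge_rank):
    # The buffer is created up front, before grad_output_tensor is validated or summed.
    if next_node not in node_input_buffers:
        node_input_buffers[next_node] = GradTensorHolderStub(next_node.input_meta())
    node_input_buffers[next_node].add(edge_rank[0], edge_rank[1], grad_output_tensor)
```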
2 changes: 1 addition & 1 deletion paddle/phi/kernels/cpu/elementwise_grad_kernel.cc
@@ -63,9 +63,9 @@ void AddGradKernel(const Context& dev_ctx,
template <typename T, typename Context>
void AddDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& y,
const DenseTensor& dout,
paddle::optional<const DenseTensor&> ddx,
paddle::optional<const DenseTensor&> ddy,
const DenseTensor& dout,
int axis,
DenseTensor* ddout) {
phi::AddDoubleGradImpl<T>(dev_ctx, y, ddx, ddy, dout, axis, ddout);
2 changes: 1 addition & 1 deletion paddle/phi/kernels/elementwise_grad_kernel.h
@@ -31,9 +31,9 @@ void AddGradKernel(const Context& dev_ctx,
template <typename T, typename Context>
void AddDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& y,
const DenseTensor& dout,
paddle::optional<const DenseTensor&> ddx,
paddle::optional<const DenseTensor&> ddy,
const DenseTensor& dout,
int axis,
DenseTensor* ddout);

2 changes: 1 addition & 1 deletion paddle/phi/kernels/gpu/elementwise_grad_kernel.cu
@@ -56,9 +56,9 @@ void AddGradKernel(const Context& dev_ctx,
template <typename T, typename Context>
void AddDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& y,
const DenseTensor& dout,
paddle::optional<const DenseTensor&> ddx,
paddle::optional<const DenseTensor&> ddy,
const DenseTensor& dout,
int axis,
DenseTensor* ddout) {
phi::AddDoubleGradImpl<T>(dev_ctx, y, ddx, ddy, dout, axis, ddout);
2 changes: 1 addition & 1 deletion paddle/phi/ops/compat/elementwise_sig.cc
@@ -115,7 +115,7 @@ KernelSignature ElementwiseAddGradOpArgumentMapping(
KernelSignature ElementwiseAddDoubleGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
"add_double_grad", {"Y", "DDX", "DDY", "DOut"}, {"axis"}, {"DDOut"});
"add_double_grad", {"Y", "DOut", "DDX", "DDY"}, {"axis"}, {"DDOut"});
}

KernelSignature ElementwiseAddTripleGradOpArgumentMapping(
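The AddDoubleGradKernel reorder (dout moved ahead of the optional ddx/ddy) has to match the compat argument mapping above, which is why the CPU, GPU, and header declarations change together with elementwise_sig.cc. A tiny consistency sketch (the input names come from the diff; the check itself is illustrative):

```python
# New argument order for add_double_grad and its compat mapping must agree.
kernel_input_order = ["Y", "DOut", "DDX", "DDY"]       # AddDoubleGradKernel: y, dout, ddx, ddy
compat_signature_inputs = ["Y", "DOut", "DDX", "DDY"]  # ElementwiseAddDoubleGradOpArgumentMapping
assert kernel_input_order == compat_signature_inputs, "add_double_grad mapping out of sync"
```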
33 changes: 26 additions & 7 deletions python/paddle/fluid/dygraph/math_op_patch.py
@@ -15,7 +15,7 @@
from __future__ import print_function

from .. import core
from ..framework import Variable, convert_np_dtype_to_dtype_, _varbase_creator
from ..framework import Variable, convert_np_dtype_to_dtype_, _varbase_creator, _in_legacy_dygraph, in_dygraph_mode
from ..layers.layer_function_generator import OpProtoHolder
from . import no_grad
from .. import framework
@@ -62,6 +62,15 @@
_already_patch_varbase = False
_already_patch_eager_tensor = False

# Dispatch to final state Python-C functions
_final_state_op_type_mapping = {
"elementwise_add": "final_state_add",
"elementwise_sub": "final_state_subtract",
"elementwise_div": "final_state_divide",
"elementwise_mul": "final_state_multiply",
"matmul_v2": "final_state_matmul",
}


def monkey_patch_math_varbase():
"""
@@ -105,10 +114,15 @@ def astype(self, dtype):
"""
if not isinstance(dtype, core.VarDesc.VarType):
dtype = convert_np_dtype_to_dtype_(dtype)
return _C_ops.cast(self, 'in_dtype', self.dtype, 'out_dtype', dtype)

if _in_legacy_dygraph():
return _C_ops.cast(self, 'in_dtype', self.dtype, 'out_dtype', dtype)
return _C_ops.final_state_cast(self, dtype)

def _scalar_elementwise_op_(var, scale, bias):
return _C_ops.scale(var, 'scale', scale, 'bias', bias)
if _in_legacy_dygraph():
return _C_ops.scale(var, 'scale', scale, 'bias', bias)
return _C_ops.final_state_scale(var, float(scale), bias, True)

def _neg_(var):
return _scalar_elementwise_op_(var, -1.0, 0.0)
@@ -164,7 +178,10 @@ def _T_(var):
perm = []
for i in range(len(var.shape)):
perm.insert(0, i)
out, _ = _C_ops.transpose2(var, 'axis', perm)
if _in_legacy_dygraph():
out, _ = _C_ops.transpose2(var, 'axis', perm)
else:
out = _C_ops.final_state_transpose(var, perm)
return out

def _scalar_add_(var, value):
@@ -270,11 +287,13 @@ def __impl__(self, other_var):

# 4. calculation
axis = -1
if framework._in_eager_mode_ and op_type == 'elementwise_add':
math_op = getattr(_C_ops, 'final_state_add')
if in_dygraph_mode(
) and op_type in _final_state_op_type_mapping.keys():
math_op = getattr(_C_ops, _final_state_op_type_mapping[op_type])
return math_op(self, other_var)
else:
math_op = getattr(_C_ops, op_type)
return math_op(self, other_var, 'axis', axis)
return math_op(self, other_var, 'axis', axis)

comment = OpProtoHolder.instance().get_op_proto(op_type).comment

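The `_final_state_op_type_mapping` table added above lets the patched tensor operators route to the final-state Python-C bindings when the new dygraph mode is on, falling back to the legacy attr-style call otherwise. A condensed sketch of the branch in `__impl__` (a standalone illustration, not a drop-in replacement):

```python
from paddle import _C_ops
from paddle.fluid.framework import in_dygraph_mode

_final_state_op_type_mapping = {"elementwise_add": "final_state_add"}

def dispatch_binary_op(op_type, x, y):
    if in_dygraph_mode() and op_type in _final_state_op_type_mapping:
        # Final-state ops take plain positional tensor arguments.
        return getattr(_C_ops, _final_state_op_type_mapping[op_type])(x, y)
    # Legacy dygraph ops keep the attr-style ('axis', -1) trailing arguments.
    return getattr(_C_ops, op_type)(x, y, 'axis', -1)
```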
5 changes: 4 additions & 1 deletion python/paddle/fluid/layers/nn.py
@@ -9036,7 +9036,10 @@ def relu(x, name=None):
# [[0. 0. ]
# [1. 2.6]]
"""
if _non_static_mode():

if in_dygraph_mode():
return _C_ops.final_state_relu(x)
if _in_legacy_dygraph():
return _C_ops.relu(x)

check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'relu')
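The relu entry point now distinguishes the two dynamic modes explicitly instead of using the combined `_non_static_mode()` check: eager mode calls `final_state_relu`, legacy dygraph keeps `_C_ops.relu`, and static graph still appends a relu op. User-facing behavior is unchanged; a quick usage check (assuming a Paddle 2.x build where eager mode is enabled):

```python
import numpy as np
import paddle
import paddle.fluid as fluid

x = paddle.to_tensor(np.array([[-1.0, 2.6]], dtype='float32'))
out = fluid.layers.relu(x)   # dispatched to final_state_relu under eager mode
print(out.numpy())           # [[0.  2.6]]
```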
60 changes: 27 additions & 33 deletions python/paddle/fluid/tests/unittests/test_imperative_double_grad.py
@@ -385,26 +385,23 @@ def func_example_with_gradient_accumulation_and_create_graph(self):
(x_np > 0) * 2).astype('float32')
self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected))

if not _in_legacy_dygraph():
pass
else:
loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
loss.backward(retain_graph=True)
loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
loss.backward(retain_graph=True)

x_grad_actual = x.gradient()
x_grad_expected = (2.0 / float(numel) *
(x_np + dx_expected *
(x_np > 0) * 2 / float(numel))).astype('float32')
self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))

for i in range(5):
loss.backward(retain_graph=True)
x_grad_actual = x.gradient()
x_grad_expected = (2.0 / float(numel) * (
x_grad_expected = (i + 2) * (2.0 / float(numel) * (
x_np + dx_expected *
(x_np > 0) * 2 / float(numel))).astype('float32')
self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))

for i in range(5):
loss.backward(retain_graph=True)
x_grad_actual = x.gradient()
x_grad_expected = (i + 2) * (2.0 / float(numel) * (
x_np + dx_expected *
(x_np > 0) * 2 / float(numel))).astype('float32')
self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))

def test_example_with_gradient_accumulation_and_create_graph(self):
with _test_eager_guard():
self.func_example_with_gradient_accumulation_and_create_graph()
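The rewritten test above drops the legacy-only guard and asserts gradient accumulation directly: each extra `backward(retain_graph=True)` adds one more copy of the same gradient, so at loop step `i` (one initial call plus `i + 1` repeats) the expectation is `(i + 2)` times the single-pass value. A self-contained illustration of that scaling, independent of the test:

```python
import numpy as np
import paddle

x = paddle.to_tensor(np.ones((2, 2), dtype='float32'), stop_gradient=False)
loss = paddle.mean(x * x)

loss.backward(retain_graph=True)
base_grad = x.gradient().copy()        # gradient after the first backward

for i in range(5):
    loss.backward(retain_graph=True)   # accumulates into the stored gradient
    assert np.allclose(x.gradient(), (i + 2) * base_grad)
```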
@@ -426,7 +423,10 @@ def func_example_with_gradient_accumulation_and_no_grad_vars(self):
del y1, z, w

dx_actual, = self.grad(
[w_mean], [x], create_graph=True, no_grad_vars=[y2])
[w_mean], [x],
retain_graph=True,
create_graph=True,
no_grad_vars=[y2])

self.assertFalse(y2.stop_gradient)
self.assertFalse(dx_actual.stop_gradient)
Expand All @@ -435,17 +435,14 @@ def func_example_with_gradient_accumulation_and_no_grad_vars(self):
(x_np > 0) * 2).astype('float32')
self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected))

if not _in_legacy_dygraph():
pass
else:
loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
loss.backward()
loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
loss.backward()

x_grad_actual = x.gradient()
x_grad_expected = (2.0 / float(numel) * (
x_np + dx_expected *
(x_np > 0) * 4 / float(numel))).astype('float32')
self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
x_grad_actual = x.gradient()
x_grad_expected = (2.0 / float(numel) *
(x_np + dx_expected *
(x_np > 0) * 4 / float(numel))).astype('float32')
self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))

def test_example_with_gradient_accumulation_and_no_grad_vars(self):
with _test_eager_guard():
@@ -476,15 +473,12 @@ def func_example_with_gradient_accumulation_and_not_create_graph(self):

self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected))

if not _in_legacy_dygraph():
pass
else:
loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
loss.backward()
loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
loss.backward()

x_grad_actual = x.gradient()
x_grad_expected = (2.0 * x_np / float(numel)).astype('float32')
self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
x_grad_actual = x.gradient()
x_grad_expected = (2.0 * x_np / float(numel)).astype('float32')
self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))

def test_example_with_gradient_accumulation_and_not_create_graph(self):
with _test_eager_guard():
Expand Down
