fused_linear_param_grad infer spmd (PaddlePaddle#60144)
* polish

* fused_linear_param_grad_add

* fused_linear_param_grad_add

* polish

* add integration test

* polish

* format

* polish

* polish
liuzhenhai93 authored and HermitSun committed Dec 21, 2023
1 parent 69f11cb commit 03e64a4
Showing 12 changed files with 472 additions and 37 deletions.
77 changes: 77 additions & 0 deletions paddle/phi/infermeta/spmd_rules/fused_linear_param_grad_add.cc
@@ -0,0 +1,77 @@
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/phi/infermeta/spmd_rules/fused_linear_param_grad_add.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/infermeta/spmd_rules/matmul.h"
#include "paddle/phi/infermeta/spmd_rules/utils.h"

namespace phi {
namespace distributed {

SpmdInfo FusedLinearParamGradAddInferSpmd(const DistMetaTensor& x,
                                          const DistMetaTensor& dout,
                                          const DistMetaTensor& dweight,
                                          const DistMetaTensor& dbias,
                                          bool multi_precision,
                                          bool has_bias) {
  auto dy_spmd_info =
      MatmulInferSpmd(x, dout, /*trans_x=*/true, /*trans_y=*/false);
  auto& x_dist_attr = PADDLE_GET_CONST(TensorDistAttr, dy_spmd_info.first[0]);
  auto& dout_dist_attr =
      PADDLE_GET_CONST(TensorDistAttr, dy_spmd_info.first[1]);
  auto weight_grad_dist_attr =
      PADDLE_GET_CONST(TensorDistAttr, dy_spmd_info.second[0]);

  weight_grad_dist_attr = ReduceGradBroadCastDims(2, weight_grad_dist_attr);

  TensorDistAttr dweight_dist_attr = dweight.dist_attr();
  auto dweight_shape = common::vectorize(dweight.dims());
  TensorDistAttr dbias_dist_attr = dbias.dist_attr();
  auto dbias_shape = common::vectorize(dbias.dims());

  TensorDistAttr bias_grad_dist_attr;
  if (has_bias) {
    bias_grad_dist_attr = ReduceGradBroadCastDims(1, dout.dist_attr());
  }

  // check dweight against the inferred weight_grad
  if (!IsEmpty(dweight_shape)) {
    PADDLE_ENFORCE_EQ(dweight_dist_attr,
                      weight_grad_dist_attr,
                      phi::errors::InvalidArgument(
                          "dweight_dist_attr [%s] and weight_grad_dist_attr "
                          "[%s] should be equal",
                          dweight_dist_attr.to_string(),
                          weight_grad_dist_attr.to_string()));
  }
  // check dbias against the inferred bias_grad
  if (!IsEmpty(dbias_shape)) {
    PADDLE_ENFORCE_EQ(
        dbias_dist_attr,
        bias_grad_dist_attr,
        phi::errors::InvalidArgument(
            "dbias_dist_attr [%s] and bias_grad_dist_attr [%s] should be equal",
            dbias_dist_attr.to_string(),
            bias_grad_dist_attr.to_string()));
  }

  return {{x_dist_attr, dout_dist_attr, dweight_dist_attr, dbias_dist_attr},
          {weight_grad_dist_attr, bias_grad_dist_attr}};
}

SpmdInfo FusedLinearParamGradAddInferSpmdFakeReverse() { return SpmdInfo(); }

} // namespace distributed
} // namespace phi
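For intuition, here is a rough, framework-free sketch (plain Python, no Paddle calls) of what this rule produces for the column-parallel case exercised by the integration test added in this commit: x of shape [4, 16, 32] with dims_mapping [0, -1, -1] and dout of shape [4, 16, 64] with dims_mapping [0, -1, 1] on a 2-D mesh. The intermediate mapping attributed to MatmulInferSpmd is a hedged reading of that rule, not output captured from running Paddle.

x_mapping = [0, -1, -1]    # x: [4, 16, 32], Shard(0) on the data-parallel mesh axis
dout_mapping = [0, -1, 1]  # dout: [4, 16, 64], Shard(0) and Shard(2)

# Step 1: MatmulInferSpmd(x, dout, trans_x=True) propagates to a batched weight
# grad of shape [4, 32, 64]: the shared batch mapping is kept, the contracted
# dim (size 16) is replicated on both inputs, and the two output dims take the
# last-dim mappings of x and dout.
batched_dweight = [x_mapping[0], x_mapping[2], dout_mapping[2]]  # [0, -1, 1]

# Step 2: ReduceGradBroadCastDims(2, ...) drops the leading batch dim; because
# that dim is sharded on mesh axis 0, dweight becomes partial (a pending sum)
# along that axis. The bias grad is dout reduced the same way down to rank 1.
dweight_mapping = batched_dweight[1:]
dweight_partial = {m for m in batched_dweight[:1] if m >= 0}
dbias_mapping = dout_mapping[2:]
dbias_partial = {m for m in dout_mapping[:2] if m >= 0}

assert (dweight_mapping, dweight_partial) == ([-1, 1], {0})
assert (dbias_mapping, dbias_partial) == ([1], {0})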
32 changes: 32 additions & 0 deletions paddle/phi/infermeta/spmd_rules/fused_linear_param_grad_add.h
@@ -0,0 +1,32 @@
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <vector>

#include "paddle/phi/common/int_array.h"
#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h"
#include "paddle/phi/core/distributed/type_defs.h"

namespace phi {
namespace distributed {
SpmdInfo FusedLinearParamGradAddInferSpmd(const DistMetaTensor& x,
                                          const DistMetaTensor& dout,
                                          const DistMetaTensor& dweight,
                                          const DistMetaTensor& dbias,
                                          bool multi_precision,
                                          bool has_bias);

SpmdInfo FusedLinearParamGradAddInferSpmdFakeReverse();
} // namespace distributed
} // namespace phi
37 changes: 0 additions & 37 deletions paddle/phi/infermeta/spmd_rules/matmul.cc
@@ -286,43 +286,6 @@ static bool DistAttrsAreBasicallyEqual(
in_dist_attr.partial_status() == out_dist_attr.partial_status());
}

TensorDistAttr ReduceGradBroadCastDims(const TensorDistAttr& input,
                                       const ArgDistAttr& grad) {
  auto& grad_in = PADDLE_GET_CONST(TensorDistAttr, grad);
  auto grad_dim = grad_in.dims_mapping().size();
  auto input_dim = input.dims_mapping().size();
  PADDLE_ENFORCE_GE(
      grad_dim,
      input_dim,
      phi::errors::InvalidArgument("grad dim must ge than input dim, but we "
                                   "got grad_dim [%d], input_dim[%d]",
                                   grad_dim,
                                   input_dim));
  if (grad_dim == input_dim) {
    return grad_in;
  }
  size_t broadcast_dim = grad_dim - input_dim;
  // gather partial status
  auto partial_dims = grad_in.partial_dims();
  auto& grad_dims_mapping = grad_in.dims_mapping();
  auto dims_mapping = input.dims_mapping();
  for (size_t i = 0; i < grad_dim; ++i) {
    auto mapping = grad_dims_mapping[i];
    if (i < broadcast_dim) {
      if (mapping >= 0) {
        partial_dims.insert(mapping);
      }
    } else {
      dims_mapping[i - broadcast_dim] = mapping;
    }
  }
  auto grad_out = CopyTensorDistAttrForOutput(input);
  grad_out.set_dims_mapping(dims_mapping);
  std::vector<int64_t> partial_status(partial_dims.begin(), partial_dims.end());
  grad_out.set_partial_status(partial_status);
  return grad_out;
}

SpmdInfo MatmulGradInferSpmd(const DistMetaTensor& x,
                             const DistMetaTensor& y,
                             const DistMetaTensor& out_grad,
9 changes: 9 additions & 0 deletions paddle/phi/infermeta/spmd_rules/rules.h
@@ -23,6 +23,7 @@ limitations under the License. */
#include "paddle/phi/infermeta/spmd_rules/flash_attention.h"
#include "paddle/phi/infermeta/spmd_rules/flatten.h"
#include "paddle/phi/infermeta/spmd_rules/full_like.h"
#include "paddle/phi/infermeta/spmd_rules/fused_linear_param_grad_add.h"
#include "paddle/phi/infermeta/spmd_rules/fused_rope.h"
#include "paddle/phi/infermeta/spmd_rules/layer_norm.h"
#include "paddle/phi/infermeta/spmd_rules/matmul.h"
@@ -618,10 +619,18 @@ PD_REGISTER_SPMD_RULE(
    cross_entropy_with_softmax,
    PD_INFER_SPMD(phi::distributed::CrossEntropyWithSoftmaxInferSpmd),
    PD_INFER_SPMD(phi::distributed::CrossEntropyWithSoftmaxInferSpmdReverse));

PD_REGISTER_SPMD_RULE(
    softmax_with_cross_entropy,
    PD_INFER_SPMD(phi::distributed::CrossEntropyWithSoftmaxInferSpmd),
    PD_INFER_SPMD(phi::distributed::CrossEntropyWithSoftmaxInferSpmdReverse));

// fused_linear_param_grad_add has no reverse infer spmd rule
PD_REGISTER_SPMD_RULE(
    fused_linear_param_grad_add,
    PD_INFER_SPMD(phi::distributed::FusedLinearParamGradAddInferSpmd),
    PD_INFER_SPMD(
        phi::distributed::FusedLinearParamGradAddInferSpmdFakeReverse));

} // namespace distributed
} // namespace phi
50 changes: 50 additions & 0 deletions paddle/phi/infermeta/spmd_rules/utils.cc
@@ -554,5 +554,55 @@ void DebugInfoForInferSpmd(const std::string& rule_name,
}
}

TensorDistAttr ReduceGradBroadCastDims(const TensorDistAttr& input,
                                       const ArgDistAttr& grad) {
  const auto& grad_in = PADDLE_GET_CONST(TensorDistAttr, grad);
  return ReduceGradBroadCastDims(input, grad_in);
}

TensorDistAttr ReduceGradBroadCastDims(int64_t input_dims,
                                       const TensorDistAttr& grad) {
  TensorDistAttr input;
  std::vector<int64_t> dim_mapping(input_dims, -1);
  input.set_dims_mapping(dim_mapping);
  return ReduceGradBroadCastDims(input, grad);
}

TensorDistAttr ReduceGradBroadCastDims(const TensorDistAttr& input,
                                       const TensorDistAttr& grad) {
  auto grad_dim = grad.dims_mapping().size();
  auto input_dim = input.dims_mapping().size();
  PADDLE_ENFORCE_GE(
      grad_dim,
      input_dim,
      phi::errors::InvalidArgument("grad dim must be greater than or equal to "
                                   "input dim, but got grad_dim [%d], "
                                   "input_dim [%d]",
                                   grad_dim,
                                   input_dim));
  if (grad_dim == input_dim) {
    return grad;
  }
  size_t broadcast_dim = grad_dim - input_dim;
  // gather partial status from the leading broadcast dims
  auto partial_dims = grad.partial_dims();
  auto& grad_dims_mapping = grad.dims_mapping();
  auto dims_mapping = input.dims_mapping();
  for (size_t i = 0; i < grad_dim; ++i) {
    auto mapping = grad_dims_mapping[i];
    if (i < broadcast_dim) {
      if (mapping >= 0) {
        partial_dims.insert(mapping);
      }
    } else {
      dims_mapping[i - broadcast_dim] = mapping;
    }
  }
  auto grad_out = CopyTensorDistAttrForOutput(input);
  grad_out.set_dims_mapping(dims_mapping);
  std::vector<int64_t> partial_status(partial_dims.begin(), partial_dims.end());
  grad_out.set_partial_status(partial_status);
  return grad_out;
}

} // namespace distributed
} // namespace phi
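As a quick illustration of the helper's contract, here is a minimal Python sketch under an assumed name (the real implementation is the C++ above; for brevity it ignores any partial status the grad already carries): every leading "broadcast" dim of the grad that is sharded becomes a partial-sum mesh axis, replicated broadcast dims contribute nothing, and the trailing dims_mapping is copied to the output.

def reduce_grad_broadcast_dims(out_rank, grad_mapping):
    # Python mirror of ReduceGradBroadCastDims, for illustration only.
    broadcast = len(grad_mapping) - out_rank
    assert broadcast >= 0, "grad rank must be >= output rank"
    partial = sorted({m for m in grad_mapping[:broadcast] if m >= 0})
    return grad_mapping[broadcast:], partial

# Rank-4 grad reduced to rank 2: the first broadcast dim is sharded on mesh
# axis 1 (becomes partial); the second is replicated (dropped).
assert reduce_grad_broadcast_dims(2, [1, -1, 0, -1]) == ([0, -1], [1])
# Equal ranks: the mapping passes through unchanged, no new partial axes.
assert reduce_grad_broadcast_dims(2, [0, -1]) == ([0, -1], [])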
9 changes: 9 additions & 0 deletions paddle/phi/infermeta/spmd_rules/utils.h
@@ -205,5 +205,14 @@ std::vector<int64_t> GetDimsMappingForAxes(
void DebugInfoForInferSpmd(const std::string& rule_name,
                           const SpmdInfo& infer_result);

TensorDistAttr ReduceGradBroadCastDims(const TensorDistAttr& input,
                                       const ArgDistAttr& grad);

TensorDistAttr ReduceGradBroadCastDims(const TensorDistAttr& input,
                                       const TensorDistAttr& grad);

TensorDistAttr ReduceGradBroadCastDims(int64_t input_dims,
                                       const TensorDistAttr& grad);

} // namespace distributed
} // namespace phi
@@ -0,0 +1,106 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import numpy as np

import paddle
import paddle.distributed as dist
from paddle import _C_ops


class TestFusedParamGradAddForSemiAutoParallel:
    def __init__(self):
        self._dtype = os.getenv("dtype")
        self._backend = os.getenv("backend")
        self._seed = eval(os.getenv("seed"))
        self._mesh = dist.ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"])

    def check_tensor_eq(self, a, b):
        np1 = a.numpy()
        np2 = b.numpy()
        np.testing.assert_allclose(np1, np2, rtol=1e-05, verbose=True)

    def test_body(self):
        x_shape = [4, 16, 32]
        y_shape = [4, 16, 64]

        paddle.seed(self._seed)
        np.random.seed(self._seed)

        x_np = np.random.random(size=x_shape).astype(self._dtype)
        y_np = np.random.random(size=y_shape).astype(self._dtype)

        def run_acc_step(x, y):
            weight_grad = None
            bias_grad = None
            for _ in range(2):
                weight_grad, bias_grad = _C_ops.fused_linear_param_grad_add(
                    x,
                    y,
                    weight_grad,
                    bias_grad,
                    False,
                    True,
                )
            return weight_grad, bias_grad

        x = paddle.to_tensor(x_np)
        y = paddle.to_tensor(y_np)
        x.stop_gradient = True
        y.stop_gradient = True

        weight_grad, bias_grad = run_acc_step(x, y)

        # test mp col split
        x_placements = [dist.Shard(0), dist.Replicate()]
        y_placements = [dist.Shard(0), dist.Shard(2)]

        dist_x = dist.shard_tensor(x_np, self._mesh, x_placements)
        dist_y = dist.shard_tensor(y_np, self._mesh, y_placements)
        dist_x.stop_gradient = True
        dist_y.stop_gradient = True

        weight_grad_dist, bias_grad_dist = run_acc_step(dist_x, dist_y)
        self.check_tensor_eq(weight_grad, weight_grad_dist)
        self.check_tensor_eq(bias_grad, bias_grad_dist)

        # test mp row split
        x_placements = [dist.Shard(0), dist.Shard(2)]
        y_placements = [dist.Shard(0), dist.Replicate()]
        dist_x = dist.shard_tensor(x_np, self._mesh, x_placements)
        dist_y = dist.shard_tensor(y_np, self._mesh, y_placements)
        dist_x.stop_gradient = True
        dist_y.stop_gradient = True
        weight_grad_dist, bias_grad_dist = run_acc_step(dist_x, dist_y)
        self.check_tensor_eq(weight_grad, weight_grad_dist)
        self.check_tensor_eq(bias_grad, bias_grad_dist)

    def test_fused_linear_param_grad_add(self):
        self.test_body()

    def run_test_case(self):
        if self._backend == "cpu":
            paddle.set_device("cpu")
        elif self._backend == "gpu":
            paddle.set_device("gpu:" + str(dist.get_rank()))
        else:
            raise ValueError("Only support cpu or gpu backend.")

        self.test_fused_linear_param_grad_add()


if __name__ == '__main__':
    TestFusedParamGradAddForSemiAutoParallel().run_test_case()
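If you want to poke at the script above outside the CI harness, something like the following driver works in principle; the environment variables mirror the os.getenv calls in __init__, while the concrete values and the four-rank launch (e.g. python -m paddle.distributed.launch across four GPUs for the 2x2 mesh) are assumptions, since the suite normally injects them through test_base.gen_product_envs_list.

# Hypothetical standalone driver for the test class defined above; the env
# values here are placeholders, not the ones used by the CI matrix.
import os

os.environ.setdefault("dtype", "float32")
os.environ.setdefault("backend", "gpu")
os.environ.setdefault("seed", "2023")  # read back via eval() in __init__

TestFusedParamGradAddForSemiAutoParallel().run_test_case()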
@@ -42,6 +42,16 @@ def test_simple_net_hybrid_strategy(self):
        )
        ckpt_path.cleanup()

    def test_fused_linear_param_grad_add(self):
        envs_list = test_base.gen_product_envs_list(
            self._default_envs, self._changeable_envs
        )
        for envs in envs_list:
            self.run_test_case(
                "semi_auto_parallel_for_fused_linear_param_grad_add.py",
                user_defined_envs=envs,
            )


class TestSemiAutoParallelHybridStrategy(test_base.CommunicationTestDistBase):
    def setUp(self):
2 changes: 2 additions & 0 deletions test/auto_parallel/spmd_rules/CMakeLists.txt
@@ -27,6 +27,8 @@ if(WITH_DISTRIBUTE)
  py_test_modules(test_triu_rule MODULES test_triu_rule)
  py_test_modules(test_flash_attention_rule MODULES test_flash_attention_rule)
  py_test_modules(test_tile_rule MODULES test_tile_rule)
  py_test_modules(test_fused_linear_param_grad_add_rule MODULES
                  test_fused_linear_param_grad_add_rule)
  # End of unittests WITH single card WITHOUT timeout

endif()