Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[PHI decoupling] move softmax from fluid to phi and remove cpu_vec.h in fluid #48970

Merged
merged 16 commits into from
Dec 15, 2022
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,15 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h"
#include "paddle/fluid/operators/math/softmax_impl.h"

#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#include "paddle/fluid/string/string_helper.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/cross_entropy.h"
#include "paddle/phi/kernels/funcs/softmax_impl.h"

namespace paddle {
namespace operators {
Expand Down Expand Up @@ -129,15 +131,15 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
softmax_2d.ShareDataWith(*softmax).Resize({N, D});
loss_2d.ShareDataWith(*loss).Resize({N, 1});

auto eigen_logits = math::EigenMatrix<T>::From(logits_2d);
auto eigen_softmax = math::EigenMatrix<T>::From(softmax_2d);
auto eigen_logits = phi::funcs::EigenMatrix<T>::From(logits_2d);
auto eigen_softmax = phi::funcs::EigenMatrix<T>::From(softmax_2d);

// step 1, obtain logit_max
phi::DenseTensor logits_max;
logits_max = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
void* logits_max_buff = logits_max.mutable_data<T>(place);

auto eigen_logits_max = math::EigenMatrix<T>::From(logits_max);
auto eigen_logits_max = phi::funcs::EigenMatrix<T>::From(logits_max);
Eigen::DSizes<int, 1> along_axis(1);
eigen_logits_max.device(*dev_ctx.eigen_device()) =
eigen_logits.maximum(along_axis);
Expand All @@ -158,7 +160,7 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
eigen_softmax.device(*dev_ctx.eigen_device()) =
(eigen_logits -
eigen_logits_max.reshape(batch_by_one).broadcast(one_by_class))
.unaryExpr(math::ValueClip<T>());
.unaryExpr(phi::funcs::ValueClip<T>());

// step 3, obtain predict target
phi::DenseTensor predicted_logits;
Expand Down Expand Up @@ -217,7 +219,8 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
sum_exp_logits = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
void* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);

auto eigen_sum_exp_logits = math::EigenMatrix<T>::From(sum_exp_logits);
auto eigen_sum_exp_logits =
phi::funcs::EigenMatrix<T>::From(sum_exp_logits);
eigen_sum_exp_logits.device(*dev_ctx.eigen_device()) =
eigen_softmax.sum(along_axis);

Expand All @@ -231,8 +234,9 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
comm->comm(),
stream));

auto eigen_loss = math::EigenMatrix<T>::From(loss_2d);
auto eigen_predicted_logits = math::EigenMatrix<T>::From(predicted_logits);
auto eigen_loss = phi::funcs::EigenMatrix<T>::From(loss_2d);
auto eigen_predicted_logits =
phi::funcs::EigenMatrix<T>::From(predicted_logits);

eigen_loss.device(*dev_ctx.eigen_device()) =
(eigen_sum_exp_logits.log().unaryExpr(phi::funcs::TolerableValue<T>()) -
Expand Down Expand Up @@ -281,14 +285,14 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
softmax_2d.ShareDataWith(*softmax).Resize({N, D});
loss_2d.ShareDataWith(*loss).Resize({N, 1});

auto eigen_logits = math::EigenMatrix<T>::From(logits_2d);
auto eigen_softmax = math::EigenMatrix<T>::From(softmax_2d);
auto eigen_logits = phi::funcs::EigenMatrix<T>::From(logits_2d);
auto eigen_softmax = phi::funcs::EigenMatrix<T>::From(softmax_2d);

// step 1, obtain logit_max
phi::DenseTensor logits_max;
logits_max = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);

auto eigen_logits_max = math::EigenMatrix<T>::From(logits_max);
auto eigen_logits_max = phi::funcs::EigenMatrix<T>::From(logits_max);
Eigen::DSizes<int, 1> along_axis(1);
eigen_logits_max.device(*dev_ctx.eigen_device()) =
eigen_logits.maximum(along_axis);
Expand All @@ -304,7 +308,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
eigen_softmax.device(*dev_ctx.eigen_device()) =
(eigen_logits -
eigen_logits_max.reshape(batch_by_one).broadcast(one_by_class))
.unaryExpr(math::ValueClip<T>());
.unaryExpr(phi::funcs::ValueClip<T>());

// step 3, obtain predict target
phi::DenseTensor predicted_logits;
Expand Down Expand Up @@ -357,7 +361,8 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
sum_exp_logits = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
void* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);

auto eigen_sum_exp_logits = math::EigenMatrix<T>::From(sum_exp_logits);
auto eigen_sum_exp_logits =
phi::funcs::EigenMatrix<T>::From(sum_exp_logits);
eigen_sum_exp_logits.device(*dev_ctx.eigen_device()) =
eigen_softmax.sum(along_axis);

Expand All @@ -366,8 +371,9 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
opts.reduce_op = distributed::ReduceOp::SUM;
pg->AllReduce(in_out, in_out, opts)->Synchronize();

auto eigen_loss = math::EigenMatrix<T>::From(loss_2d);
auto eigen_predicted_logits = math::EigenMatrix<T>::From(predicted_logits);
auto eigen_loss = phi::funcs::EigenMatrix<T>::From(loss_2d);
auto eigen_predicted_logits =
phi::funcs::EigenMatrix<T>::From(predicted_logits);

eigen_loss.device(*dev_ctx.eigen_device()) =
(eigen_sum_exp_logits.log().unaryExpr(phi::funcs::TolerableValue<T>()) -
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ limitations under the License. */
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/kernels/funcs/cross_entropy.h"
#include "paddle/phi/kernels/funcs/softmax.h"

namespace paddle {
namespace operators {
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/operators/jit/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/helper.h\"\n")
file(APPEND ${jit_file}
"\#include \"paddle/fluid/operators/jit/registry.h\"\n\n")

-set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place xxhash)
+set(JIT_KERNEL_DEPS device_context cblas gflags enforce place xxhash)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这里的 device_context 能否改成 phi_backends?

Copy link
Member Author

@huangjiyi huangjiyi Dec 14, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@YuanRisheng 这里我最开始用的是 phi_backends,但编译时报错,提示找不到 device_context 中的一些定义,所以改成了 device_context(device_context 本身也依赖 phi_backends)。


file(
GLOB jit_kernel_cc_srcs
Expand Down
1 change: 0 additions & 1 deletion paddle/fluid/operators/math/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ math_library(maxouting)
math_library(sequence_padding)
math_library(sequence_pooling DEPS math_function jit_kernel_helper)
math_library(sequence_scale)
math_library(softmax DEPS math_function jit_kernel_helper)
if(WITH_ASCEND_CL)
math_library(beam_search DEPS math_function beam_search_npu)
elseif(WITH_XPU)
Expand Down
Loading