Commit

merge develop into branch
csy0225 committed Feb 22, 2023
2 parents 2c0f308 + 433c2ff commit 76c8ae1
Showing 249 changed files with 6,063 additions and 9,537 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -26,7 +26,8 @@ paddle/phi/api/lib/tensor_operants.cc
paddle/phi/extension.h
paddle/phi/include/*
paddle/phi/infermeta/generated.*

paddle/fluid/prim/api/generated_prim/*.cc
paddle/fluid/prim/api/generated_prim/*.h
*.DS_Store
*.vs
build/
2 changes: 1 addition & 1 deletion cmake/external/xpu.cmake
@@ -8,7 +8,7 @@ set(XPU_API_LIB_NAME "libxpuapi.so")
set(XPU_RT_LIB_NAME "libxpurt.so")

set(XPU_BASE_DATE "20230220")
set(XPU_XCCL_BASE_VERSION "1.0.8")
set(XPU_XCCL_BASE_VERSION "1.0.9")

if(NOT DEFINED XPU_BASE_URL)
set(XPU_BASE_URL_WITHOUT_DATE
2 changes: 1 addition & 1 deletion paddle/CMakeLists.txt
@@ -33,7 +33,7 @@ add_subdirectory(ir)

# Next, (to be discusssed)
# (1) move all source files to same folder,
# (2) naturally, and and configure tests in only one `CMakeLists.txt`,
# (2) naturally, and configure tests in only one `CMakeLists.txt`,
# (3) cc tests support linking pre-built dynamic libraries. For example, use the dynamic
# library in the installed paddle by `pip`.

2 changes: 1 addition & 1 deletion paddle/fluid/eager/CMakeLists.txt
@@ -77,6 +77,6 @@ cc_library(
op_registry
variable_helper
memcpy
scale_op
generated_op
autograd_meta
hook_utils)
2 changes: 1 addition & 1 deletion paddle/fluid/eager/tests/performance_tests/CMakeLists.txt
@@ -7,7 +7,7 @@ if(NOT (NOT WITH_PYTHON AND ON_INFER))
${generated_deps}
eager_scale
scale_node
scale_op
generated_op
matmul_v2_op
dygraph_function
eager_prim_api)
4 changes: 2 additions & 2 deletions paddle/fluid/framework/CMakeLists.txt
@@ -1051,7 +1051,7 @@ if(WITH_PSCORE)
heter_pipeline_trainer_test
SRCS heter_pipeline_trainer_test.cc
DEPS conditional_block_op
scale_op
generated_op
heter_listen_and_serv_op
executor
heter_server
@@ -1068,7 +1068,7 @@ if(WITH_PSCORE)
heter_pipeline_trainer_test
SRCS heter_pipeline_trainer_test.cc
DEPS conditional_block_op
scale_op
generated_op
heter_listen_and_serv_op
executor
heter_server
2 changes: 1 addition & 1 deletion paddle/fluid/framework/data_set.h
@@ -156,7 +156,7 @@ class Dataset {
virtual void DestroyPreLoadReaders() = 0;
// set preload thread num
virtual void SetPreLoadThreadNum(int thread_num) = 0;
// seperate train thread and dataset thread
// separate train thread and dataset thread
virtual void DynamicAdjustChannelNum(int channel_num,
bool discard_remaining_ins = false) = 0;
virtual void DynamicAdjustReadersNum(int thread_num) = 0;
1 change: 1 addition & 0 deletions paddle/fluid/framework/ir/CMakeLists.txt
@@ -227,6 +227,7 @@ if(WITH_XPU)
${XPU_PASS_DEPS})
pass_library(multi_encoder_xpu_slice_fuse_pass inference DIR xpu)
pass_library(generate_sequence_xpu_fuse_pass inference DIR xpu)
pass_library(link_xpu_op_max_pass inference DIR xpu)
endif()

cc_library(
@@ -76,5 +76,5 @@ cc_library(
cc_test(
test_reference_count_pass_last_lived_ops
SRCS test_reference_count_pass_last_lived_ops.cc
DEPS parallel_executor elementwise_mul_op elementwise_add_op scale_op
DEPS parallel_executor elementwise_mul_op elementwise_add_op generated_op
eigen_function)
41 changes: 25 additions & 16 deletions paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc
@@ -159,9 +159,9 @@ Fused subgraph:
\ | / |
\ | / |
fc_xpu-----------
|
|
act_out
| \
| \
act_out out_max
*/
class FcXPUFusePass : public FusePassBase {
protected:
@@ -185,6 +185,7 @@ void FcXPUFusePass::ApplyImpl(ir::Graph* graph) const {
for (auto act_type : {
"relu",
"gelu",
"tanh",
"",
}) {
ApplyImpl(graph, mul_type, with_bias, act_type);
@@ -244,6 +245,18 @@ void FcXPUFusePass::ApplyImpl(ir::Graph* graph,
QuantWeight<int16_t>(mul_w_tensor, mul_w_max_tensor, !transpose_w);
}

std::string fc_out_name;
if (act_out) {
fc_out_name = act_out->Name();
} else if (add_out) {
fc_out_name = add_out->Name();
} else {
fc_out_name = mul_out->Name();
}
std::string fc_out_max_name = fc_out_name + "_max";
VarDesc fc_out_max_desc(fc_out_max_name);
Node* fc_out_max = graph->CreateVarNode(&fc_out_max_desc);

// Generate fc_xpu op
framework::OpDesc fc_xpu_op_desc(block);
fc_xpu_op_desc.SetType("fc_xpu");
@@ -282,25 +295,21 @@ void FcXPUFusePass::ApplyImpl(ir::Graph* graph,
"act_alpha", PADDLE_GET_CONST(float, act->Op()->GetAttr("slope")));
}
}
if (act_out) {
fc_xpu_op_desc.SetOutput("out", {act_out->Name()});
} else if (add_out) {
fc_xpu_op_desc.SetOutput("out", {add_out->Name()});
} else {
fc_xpu_op_desc.SetOutput("out", {mul_out->Name()});
}
fc_xpu_op_desc.SetOutput("out", {fc_out_name});
fc_xpu_op_desc.SetOutput("out_max", {fc_out_max_name});
auto* fc_xpu = graph->CreateOpNode(&fc_xpu_op_desc);
SAFE_IR_NODE_LINK_TO(mul_x, fc_xpu);
SAFE_IR_NODE_LINK_TO(mul_w, fc_xpu);
SAFE_IR_NODE_LINK_TO(mul_w_max, fc_xpu);
IR_NODE_LINK_TO(mul_x, fc_xpu);
IR_NODE_LINK_TO(mul_w, fc_xpu);
IR_NODE_LINK_TO(mul_w_max, fc_xpu);
SAFE_IR_NODE_LINK_TO(bias, fc_xpu);
if (act_out) {
SAFE_IR_NODE_LINK_TO(fc_xpu, act_out);
IR_NODE_LINK_TO(fc_xpu, act_out);
} else if (add_out) {
SAFE_IR_NODE_LINK_TO(fc_xpu, add_out);
IR_NODE_LINK_TO(fc_xpu, add_out);
} else {
SAFE_IR_NODE_LINK_TO(fc_xpu, mul_out);
IR_NODE_LINK_TO(fc_xpu, mul_out);
}
IR_NODE_LINK_TO(fc_xpu, fc_out_max);

// delete useless node
std::unordered_set<const Node*> delete_nodes;
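Editor's note (not part of the commit): the hunk above gives the fused fc_xpu op a companion max tensor, picking the op's final output name from act_out, add_out, or mul_out and suffixing it with "_max". Below is a minimal standalone sketch of that selection and naming convention, with plain strings standing in for the IR nodes (all names hypothetical).

// Editor's sketch only; mirrors the fallback order used by the pass above.
#include <iostream>
#include <string>

std::string PickFcOutName(const std::string& act_out,
                          const std::string& add_out,
                          const std::string& mul_out) {
  // Prefer the activation output, then the bias-add output, then the raw
  // matmul output -- the same fallback order as fc_xpu_fuse_pass.
  if (!act_out.empty()) return act_out;
  if (!add_out.empty()) return add_out;
  return mul_out;
}

int main() {
  std::string fc_out = PickFcOutName("relu_0.tmp_0", "", "mul_0.tmp_0");
  std::string fc_out_max = fc_out + "_max";  // companion max tensor name
  std::cout << fc_out << " / " << fc_out_max << std::endl;
  return 0;
}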
145 changes: 145 additions & 0 deletions paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc
@@ -0,0 +1,145 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/ir/xpu/pass_utils.h"
#include "paddle/fluid/framework/ir/xpu/quant_utils.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/enforce.h"

namespace phi {
class DenseTensor;
} // namespace phi

namespace paddle {
namespace framework {
class Scope;
} // namespace framework
} // namespace paddle

namespace paddle {
namespace framework {
namespace ir {
namespace patterns {

struct FusionXPUOpPattern : public PatternBase {
FusionXPUOpPattern(PDPattern* pattern,
const std::string& name_scope,
const std::string& op_type);

// declare operator node's name
PATTERN_DECL_NODE(fusion_op);
// declare variable node's name
PATTERN_DECL_NODE(out);
PATTERN_DECL_NODE(out_max);

private:
std::string op_type_;
};

FusionXPUOpPattern::FusionXPUOpPattern(PDPattern* pattern,
const std::string& name_scope,
const std::string& op_type)
: PatternBase(pattern, name_scope, name_scope), op_type_(op_type) {
auto* fusion_op = pattern->NewNode(fusion_op_repr())->assert_is_op(op_type_);
auto* out = pattern->NewNode(out_repr())
->assert_is_op_output(op_type_, "out")
->assert_var_not_persistable();
auto* out_max = pattern->NewNode(out_max_repr())
->assert_is_op_output(op_type_, "out_max")
->assert_var_not_persistable();
fusion_op->LinksTo({out, out_max});
}

} // namespace patterns

class LinkXPUOpMaxPass : public FusePassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;

private:
void ApplyImpl(ir::Graph* graph, const std::string& op_type) const;

const std::string name_scope_{"multi_encoder_xpu_slice_fuse_pass"};
// ops with x_max/out_max
std::set<std::string> op_types_{"fc_xpu", "conv2d_xpu"};
};

/*
Origin subgraph:
fusion_xpu_op0
/ \
| |
out0 out0_max
|
\
fusion_xpu_op1
Fused subgraph:
fusion_xpu_op0
/ \
| |
out0 out0_max
| |
\ /
fusion_xpu_op1
*/
void LinkXPUOpMaxPass::ApplyImpl(ir::Graph* graph) const {
Init(name_scope_, graph);
for (auto op_type : op_types_) {
ApplyImpl(graph, op_type);
}
}

void LinkXPUOpMaxPass::ApplyImpl(ir::Graph* graph,
const std::string& op_type) const {
PADDLE_ENFORCE_NOT_NULL(
graph, platform::errors::PreconditionNotMet("graph should not be null."));
GraphPatternDetector gpd;
patterns::FusionXPUOpPattern pattern(
gpd.mutable_pattern(), name_scope_, op_type);

int found_subgraph_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* graph) {
VLOG(4) << "handle LinkXPUOpMaxPass fuse";
GET_IR_NODE(fusion_op);
GET_IR_NODE(out);
GET_IR_NODE(out_max);
for (auto next_op : out->outputs) {
auto* next_op_desc = next_op->Op();
if (op_types_.count(next_op_desc->Type()) == 0) continue;
next_op_desc->SetInput("x_max", {out_max->Name()});
IR_NODE_LINK_TO(out_max, next_op);
found_subgraph_count++;
}
};

gpd(graph, handler);
AddStatis(found_subgraph_count);
}

} // namespace ir
} // namespace framework
} // namespace paddle

REGISTER_PASS(link_xpu_op_max_pass, paddle::framework::ir::LinkXPUOpMaxPass);

REGISTER_PASS_CAPABILITY(link_xpu_op_max_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination().EQ(
"fc_xpu", 0));
@@ -759,12 +759,22 @@ bool BuildOpFuncList(const platform::Place& place,
op_with_kernel->Type())) {
auto phi_kernel_key = op_with_kernel->ChoosePhiKernel(exec_ctx);
auto phi_kernel_name = op_with_kernel->PhiKernelSignature()->name;

if (op_with_kernel->PhiKernel()->IsValid()) {
bool in_custom_back_list = false;
#ifdef PADDLE_WITH_CUSTOM_DEVICE
in_custom_back_list =
phi::backends::custom_device::is_in_custom_black_list(
phi_kernel_name);
#endif
if (op_with_kernel->PhiKernel()->IsValid() && !in_custom_back_list) {
run_phi_kernel = true;
} else {
if (!op_with_kernel->SupportsKernelType(expected_kernel_key,
exec_ctx)) {
if ((!op_with_kernel->SupportsKernelType(expected_kernel_key,
exec_ctx)) ||
in_custom_back_list) {
std::string info = in_custom_back_list ? "fluid in black list "
: "fluid missing ";
VLOG(3) << info << phi_kernel_key
<< " kernel: " << phi_kernel_name;
auto phi_cpu_kernel_key =
FallBackToCpu(phi_kernel_key, *op_with_kernel);
op_with_kernel->ResetPhiKernel(
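Editor's note (not part of the commit): the hunk above skips an otherwise valid phi kernel and falls back to CPU when the kernel name appears in the custom-device black list. A self-contained sketch of the check's semantics follows; the real helper is phi::backends::custom_device::is_in_custom_black_list(), and the CUSTOM_DEVICE_BLACK_LIST environment variable name is an assumption used here only for illustration.

// Editor's sketch: re-implementation of the black-list membership check.
#include <cstdlib>
#include <sstream>
#include <string>
#include <unordered_set>

static std::unordered_set<std::string> LoadBlackList() {
  std::unordered_set<std::string> ops;
  if (const char* env = std::getenv("CUSTOM_DEVICE_BLACK_LIST")) {
    std::stringstream ss(env);
    std::string item;
    while (std::getline(ss, item, ',')) {
      if (!item.empty()) ops.insert(item);  // e.g. "matmul_v2,softmax"
    }
  }
  return ops;
}

bool IsInCustomBlackList(const std::string& kernel_name) {
  static const auto kBlackList = LoadBlackList();
  return kBlackList.count(kernel_name) > 0;
}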
2 changes: 1 addition & 1 deletion paddle/fluid/framework/op_desc.h
@@ -219,7 +219,7 @@ class OpDesc {
return ret_val;
}

// it it really needed? or just maintain a ptr from block?
// Is it really needed? Or just maintain a ptr from the block?
proto::OpDesc desc_;
BlockDesc *block_{nullptr}; // not_own
// input arg name => input variable names
16 changes: 13 additions & 3 deletions paddle/fluid/framework/operator.cc
@@ -1888,7 +1888,12 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug);
#endif

if (phi_kernel_->IsValid()
bool in_custom_back_list = false;
#if defined(PADDLE_WITH_CUSTOM_DEVICE)
in_custom_back_list =
phi::backends::custom_device::is_in_custom_black_list(phi_kernel_name);
#endif
if (phi_kernel_->IsValid() && !in_custom_back_list
#if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP)
&& !is_xpu_unsupport
#endif
@@ -1909,7 +1914,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
kernel_type_->library_type_ = LibraryType::kKP;
}
#endif

if (kernels_iter == all_op_kernels.end() ||
kernels_iter->second.find(*kernel_type_.get()) ==
kernels_iter->second.end()
@@ -1918,9 +1922,15 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
#endif
#if defined(PADDLE_WITH_XPU_KP)
|| (is_xpu_unsupport && !is_xpu_kp_support)
#endif
#if defined(PADDLE_WITH_CUSTOM_DEVICE)
|| in_custom_back_list
#endif
) {
fallback_to_cpu = true;
if (in_custom_back_list) {
VLOG(3) << "fluid in black list: " << phi_kernel_name;
}
auto phi_cpu_kernel_key = FallBackToCpu(phi_kernel_key, *this);
phi_kernel_.reset(
new phi::Kernel(phi::KernelFactory::Instance().SelectKernel(
@@ -3492,7 +3502,7 @@ void OperatorWithKernel::BuildPhiKernelContext(
// we try to add these Attrs to the RuntimeAttrs, but these OpDesc will lose
// the RuntimeAttrs information in the process of converting the Graph to
// the Program, so additional record configuration will be introduced,
// which increases the The cost of development and understanding, so we
// which increases the cost of development and understanding, so we
// still use Attrs to get and the attributes set by these passes from Attrs
// for the time being. In the future, it is necessary to clarify the
// positioning of RuntimeAttrs and expand related functions.
2 changes: 1 addition & 1 deletion paddle/fluid/framework/var_desc.h
@@ -210,7 +210,7 @@ class VarDesc {
proto::VarType::TensorDesc *mutable_tensor_desc();
std::vector<proto::VarType::TensorDesc *> mutable_tensor_descs();

// it it really needed? or just mantain a ptr from block?
// Is it really needed? Or just mantain a ptr from the block?
proto::VarDesc desc_;
AttributeMap attrs_;

(Diffs for the remaining changed files are not shown.)
