diff --git a/.clang-tidy b/.clang-tidy index 9462687d6e7dd..1045c62b1b9a3 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -70,7 +70,7 @@ clang-analyzer-core.uninitialized.Assign, clang-analyzer-cplusplus.InnerPointer, -clang-analyzer-cplusplus.Move, -clang-analyzer-cplusplus.NewDelete, --clang-analyzer-cplusplus.NewDeleteLeaks, +clang-analyzer-cplusplus.NewDeleteLeaks, -clang-analyzer-cplusplus.PureVirtualCall, -clang-analyzer-cplusplus.SelfAssignment, -clang-analyzer-cplusplus.SmartPtr, diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc index a7db7bcd03104..6e03a261003e6 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc +++ b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc @@ -101,6 +101,14 @@ std::shared_ptr<GroupInfo> OpLowererImpl::GetGroupInfo( for (auto& val : group->output_values()) { group_info->direct_output_var_names.insert(ValueName(val)); } + + group->WalkOps([&group_info](::pir::Operation* op) { + if (CompatibleInfo::OpKind(*op) == OpPatternKind::kReduction) { + group_info->raw_reduce_axis = cinn::fusion::GetReduceAxisIdx(op); + group_info->raw_data_rank = + cinn::fusion::GetCompitableRank(op->operand_source(0)); + } + }); return group_info; } diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_impl.h b/paddle/cinn/hlir/framework/pir/op_lowering_impl.h index e8c2d468347af..838b70da20fa5 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_impl.h +++ b/paddle/cinn/hlir/framework/pir/op_lowering_impl.h @@ -51,6 +51,8 @@ typedef bool (OpLowererImpl::*ScheduleDetermineFunction)(::pir::Operation*); struct GroupInfo { std::vector<int64_t> data_space; std::vector<int64_t> reduce_axis; + int64_t raw_data_rank; + std::vector<int64_t> raw_reduce_axis; std::set<std::string> reduce_var_names; std::set<std::string> shared_var_names; std::set<std::string> direct_output_var_names; diff --git a/paddle/cinn/ir/group_schedule/config/filedatabase.cc b/paddle/cinn/ir/group_schedule/config/filedatabase.cc index 1c963c754217f..64741521802e9 100644 --- a/paddle/cinn/ir/group_schedule/config/filedatabase.cc +++ b/paddle/cinn/ir/group_schedule/config/filedatabase.cc @@ -14,13 +14,25 @@ #include "paddle/cinn/ir/group_schedule/config/filedatabase.h" +#include <sys/stat.h> + #include #include #include #include "paddle/cinn/utils/multi_threading.h" +#define MKDIR(path) mkdir(path, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH) PD_DECLARE_string(cinn_tile_config_filename_label); +static bool PathExists(const std::string& path) { + struct stat statbuf; + if (stat(path.c_str(), &statbuf) != -1) { + if (S_ISDIR(statbuf.st_mode)) { + return true; + } + } + return false; +} namespace cinn { namespace ir { diff --git a/paddle/cinn/ir/group_schedule/config/filedatabase.h b/paddle/cinn/ir/group_schedule/config/filedatabase.h index 3eb12c3aa2b25..19758dc828c18 100644 --- a/paddle/cinn/ir/group_schedule/config/filedatabase.h +++ b/paddle/cinn/ir/group_schedule/config/filedatabase.h @@ -16,7 +16,6 @@ #include "paddle/cinn/ir/group_schedule/config/database.h" #include "paddle/cinn/ir/group_schedule/config/tileconfig_desc.pb.h" -#include "paddle/fluid/inference/analysis/helper.h" namespace cinn { namespace ir { diff --git a/paddle/cinn/ir/group_schedule/config/group_tile_config.cc b/paddle/cinn/ir/group_schedule/config/group_tile_config.cc index 3beaba6df036c..40c1d134ac642 100644 --- a/paddle/cinn/ir/group_schedule/config/group_tile_config.cc +++ b/paddle/cinn/ir/group_schedule/config/group_tile_config.cc @@ -37,9 +37,10 @@ std::shared_ptr<ScheduleConfig::BaseInfo> InitBasicInfo( base_info->broadcast_info = group_info->broadcast_info;
base_info->broadcast_to_elementwise = group_info->broadcast_to_elementwise; base_info->data_rank = group_info->data_space.size(); + base_info->raw_data_rank = group_info->raw_data_rank; std::set<int64_t> reduce_dim_loc; - for (auto dim : group_info->reduce_axis) { + for (int64_t dim : group_info->reduce_axis) { if (dim < 0) { dim += base_info->data_rank; } @@ -47,6 +48,13 @@ std::shared_ptr<ScheduleConfig::BaseInfo> InitBasicInfo( reduce_dim_loc.insert(dim); } + for (int64_t dim : group_info->raw_reduce_axis) { + if (dim < 0) { + dim += base_info->data_rank; + } + base_info->raw_reduce_axis.push_back(dim); + } + base_info->spatial_numel = 1; base_info->reduce_numel = 1; for (int64_t i = 0; i < base_info->data_rank; ++i) { diff --git a/paddle/cinn/ir/group_schedule/config/group_tile_config.h b/paddle/cinn/ir/group_schedule/config/group_tile_config.h index c48ea32f17c95..a62d9dd84fb59 100644 --- a/paddle/cinn/ir/group_schedule/config/group_tile_config.h +++ b/paddle/cinn/ir/group_schedule/config/group_tile_config.h @@ -29,7 +29,9 @@ namespace ir { struct ScheduleConfig { struct BaseInfo { std::vector<int64_t> reduce_axis; + std::vector<int64_t> raw_reduce_axis; int64_t data_rank; + int64_t raw_data_rank; int64_t reduce_numel; int64_t spatial_numel; bool has_dynamic_spatial{false}; diff --git a/paddle/common/flags.cc b/paddle/common/flags.cc index ae302963ca1ac..ed5cd0500f5d5 100644 --- a/paddle/common/flags.cc +++ b/paddle/common/flags.cc @@ -1724,3 +1724,7 @@ PHI_DEFINE_EXPORTED_string(cusolver_dir, // NOLINT PHI_DEFINE_EXPORTED_string(cusparse_dir, // NOLINT "", "Specify path for loading libcusparse.so.*."); +PHI_DEFINE_EXPORTED_string( + win_cuda_bin_dir, // NOLINT + "", + "Specify the path for loading CUDA *.dll files on Windows."); diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.cc b/paddle/fluid/distributed/ps/table/common_graph_table.cc index 860f2fa54887e..c7ff94500ac62 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.cc +++ b/paddle/fluid/distributed/ps/table/common_graph_table.cc @@ -14,7 +14,7 @@ #include "paddle/fluid/distributed/ps/table/common_graph_table.h" -#include +#include #include #include diff --git a/paddle/fluid/distributed/ps/table/graph/graph_node.cc b/paddle/fluid/distributed/ps/table/graph/graph_node.cc index 8be6734992ee9..31c098c49fba2 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_node.cc +++ b/paddle/fluid/distributed/ps/table/graph/graph_node.cc @@ -74,7 +74,12 @@ void GraphNode::build_sampler(std::string sample_type) { } else if (sample_type == "weighted") { sampler = new WeightedSampler(); } - sampler->build(edges); + if (sampler != nullptr) { + sampler->build(edges); + } else { + throw std::runtime_error("Failed to create a sampler of type: " + + sample_type); + } } void FeatureNode::to_buffer(char* buffer, bool need_feature) { memcpy(buffer, &id, id_size); diff --git a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc index 1485df091f948..11f29ba5c5a45 100644 --- a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc +++ b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc @@ -144,6 +144,10 @@ void DependencyBuilder::ShareDependencyFrom(const DependencyBuilder& src) { is_build_ = true; } +const std::string& DependencyBuilder::GetInstructionName(size_t op_idx) const { + return (*instructions_)[op_idx].OpBase()->Type(); +} + const std::map<size_t, std::set<size_t>>& DependencyBuilder::OpDownstreamMap() const { PADDLE_ENFORCE_EQ( @@ -340,6 +344,13 @@ void
DependencyBuilder::AddDependencyForReadOp() { void DependencyBuilder::AddDependencyForSequentialRun() { size_t dependence_op_idx = ULLONG_MAX; for (size_t op_idx = 0; op_idx < op_num_; ++op_idx) { + if (this->GetInstructionName(op_idx) == "pd_op.full_int_array") { + if (dependence_op_idx != ULLONG_MAX) { + VLOG(8) << "Skip adding dependency for sequential run: " + << dependence_op_idx << "->" << op_idx << " " + << this->GetInstructionName(dependence_op_idx) << "->" + << this->GetInstructionName(op_idx); + } + continue; + } if (dependence_op_idx != ULLONG_MAX) { AddDownstreamOp(dependence_op_idx, op_idx); } @@ -571,6 +582,11 @@ PirDependencyBuilder::PirDependencyBuilder() : instructions_() { op_happens_before_ = std::make_shared<std::vector<std::vector<bool>>>(); } +const std::string& PirDependencyBuilder::GetInstructionName( + size_t op_idx) const { + return (instructions_)[op_idx]->Name(); +} + void PirDependencyBuilder::AddDependencyForCommunicationOp() { size_t dependence_op_idx = ULLONG_MAX; for (size_t op_idx = 0; op_idx < op_num_; ++op_idx) { diff --git a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h index bef81c3c627d4..f16190a9ae9cd 100644 --- a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h +++ b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h @@ -63,6 +63,8 @@ class DependencyBuilder { &((*instructions_)[op2].DeviceContext()); } + virtual const std::string& GetInstructionName(size_t op_idx) const; + protected: void AddDependencyForCoalesceTensorOp(); virtual void AddDependencyForCommunicationOp(); @@ -127,6 +129,8 @@ class PirDependencyBuilder : public DependencyBuilder { &((instructions_)[op2]->DeviceContext()); } + const std::string& GetInstructionName(size_t op_idx) const override; + private: void AddDependencyForCommunicationOp() override; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index cdf6c51c7d078..adb7021633b8e 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -114,6 +114,7 @@ #include "paddle/common/flags.h" #include "paddle/fluid/ir_adaptor/translator/translate.h" +#include "paddle/fluid/pir/transforms/general/common_subexpression_elimination_pass.h" #include "paddle/fluid/pir/transforms/general/constant_folding_pass.h" #include "paddle/fluid/pir/transforms/general/dead_code_elimination_pass.h" #include "paddle/fluid/pir/transforms/general/inplace_pass.h" @@ -906,7 +907,7 @@ bool AnalysisPredictor::PrepareExecutor() { ctx->GetOrRegisterDialect(); ctx->GetOrRegisterDialect(); auto pass_manager = std::make_shared<::pir::PassManager>( - ::pir::IrContext::Instance(), 2); + ::pir::IrContext::Instance(), config_.pm_opt_level_); if (!config_.glog_info_disabled()) { pass_manager->EnablePrintStatistics(); } @@ -999,7 +1000,7 @@ bool AnalysisPredictor::PrepareExecutor() { // Apply some basic passes required by the framework ::pir::PassManager basic_pass_pm(::pir::IrContext::Instance(), config_.pm_opt_level_); - + basic_pass_pm.AddPass(::pir::CreateCommonSubexpressionEliminationPass()); auto params_sync_among_devices_pass = ::pir::CreateParamsSyncAmongDevicesPass(); params_sync_among_devices_pass->SetNotOwned(pir::Pass::kPlaceAttr, diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index c6b83175d21d2..a296074f9d6cf 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ 
b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -610,6 +610,7 @@ const std::vector<std::string> kPirGpuPasses{ "embedding_eltwise_layernorm_fuse_pass", "fused_flash_attn_pass", "multihead_matmul_fuse_pass", + "fused_weight_only_linear_pass", "matmul_add_act_fuse_pass", "fc_elementwise_layernorm_fuse_pass", "matmul_scale_fuse_pass", diff --git a/paddle/fluid/inference/tensorrt/convert/bilinear_interp_v2_op.cc b/paddle/fluid/inference/tensorrt/convert/bilinear_interp_v2_op.cc index 854eaa4cc0b7e..a81dc2c2d6b73 100644 --- a/paddle/fluid/inference/tensorrt/convert/bilinear_interp_v2_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/bilinear_interp_v2_op.cc @@ -49,7 +49,11 @@ class BilinearInterpolateV2OpConverter : public OpConverter { auto layer = TRT_ENGINE_ADD_LAYER(engine_, Resize, *input); if (align_mode == 0) { +#if IS_TRT_VERSION_GE(8600) + layer->setResizeMode(nvinfer1::InterpolationMode::kLINEAR); +#else layer->setResizeMode(nvinfer1::ResizeMode::kLINEAR); +#endif } #if IS_TRT_VERSION_GE(8000) if (align_corners == true) { diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index c3ff1e703c1bb..05f80e168a6f8 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -221,7 +221,7 @@ class Conv2dOpConverter : public OpConverter { return layer; }, [](nvinfer1::IConvolutionLayer* layer, nvinfer1::DimsHW& dilations) { - layer->setDilation(dilations); + layer->setDilationNd(dilations); }, "conv2d"); } @@ -245,7 +245,7 @@ class Deconv2dOpConverter : public OpConverter { TensorRTEngine::Weight& weight, TensorRTEngine::Weight& bias) -> nvinfer1::IDeconvolutionLayer* { auto* layer = TRT_ENGINE_ADD_LAYER(engine_, - Deconvolution, + DeconvolutionNd, *inputs, n_output, ksize, diff --git a/paddle/fluid/inference/tensorrt/convert/cross_multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/cross_multihead_matmul_op.cc index 33c151a24f7b2..2fbeacbc0f2af 100644 --- a/paddle/fluid/inference/tensorrt/convert/cross_multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/cross_multihead_matmul_op.cc @@ -83,10 +83,21 @@ class CrossMultiheadMatMulOpConverter : public OpConverter { nvinfer1::Weights weight_q{nvinfer1::DataType::kFLOAT, static_cast<void*>(weight_q_data), static_cast<int64_t>(weight_q_t->numel())}; + nvinfer1::ITensor* input_q_shape_tensor = Shape(input_q); +#if IS_TRT_VERSION_GE(8600) + auto* fc_q_weight_layer = TRT_ENGINE_ADD_LAYER( + engine_, Constant, nvinfer1::Dims3(1, n_q, hidden_in_q), weight_q); + auto* fc_q_layer = + TRT_ENGINE_ADD_LAYER(engine_, + MatrixMultiply, + *input_q, + nvinfer1::MatrixOperation::kNONE, + *fc_q_weight_layer->getOutput(0), + nvinfer1::MatrixOperation::kTRANSPOSE); +#else nvinfer1::Weights bias_q{}; // add shuffle for FullyConnected layer std::vector<nvinfer1::ITensor*> reshape_before_fc_q_shape_tensor; - nvinfer1::ITensor* input_q_shape_tensor = Shape(input_q); for (int i = 0; i < 5; i++) { reshape_before_fc_q_shape_tensor.push_back(Add1DConstantLayer(1)); } @@ -109,6 +120,7 @@ class CrossMultiheadMatMulOpConverter : public OpConverter { n_q, weight_q, bias_q); +#endif fc_q_layer->setName( ("multihead_matmul_fc_q(Output: " + output_name + ")").c_str()); @@ -184,11 +196,22 @@ class CrossMultiheadMatMulOpConverter : public OpConverter { nvinfer1::Weights weight_kv{nvinfer1::DataType::kFLOAT, static_cast<void*>(weight_kv_data), static_cast<int64_t>(weight_kv_t->numel())}; - nvinfer1::Weights bias_kv{}; + nvinfer1::ITensor* input_shape_tensor = Shape(input_kv);
+#if IS_TRT_VERSION_GE(8600) + auto* fc_weight_layer = TRT_ENGINE_ADD_LAYER( + engine_, Constant, nvinfer1::Dims3(1, n, hidden_in), weight_kv); + auto* fc_layer = + TRT_ENGINE_ADD_LAYER(engine_, + MatrixMultiply, + *input_kv, + nvinfer1::MatrixOperation::kNONE, + *fc_weight_layer->getOutput(0), + nvinfer1::MatrixOperation::kTRANSPOSE); +#else + nvinfer1::Weights bias_kv{}; // add shuffle for FullyConnected layer std::vector<nvinfer1::ITensor*> reshape_before_fc_shape_tensor; - nvinfer1::ITensor* input_shape_tensor = Shape(input_kv); for (int i = 0; i < 5; i++) { reshape_before_fc_shape_tensor.push_back(Add1DConstantLayer(1)); } @@ -211,6 +234,7 @@ class CrossMultiheadMatMulOpConverter : public OpConverter { n, weight_kv, bias_kv); +#endif fc_layer->setName( ("multihead_matmul_fc(Output: " + output_name + ")").c_str()); diff --git a/paddle/fluid/inference/tensorrt/convert/flash_multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/flash_multihead_matmul_op.cc index 757f90181cd0d..70109a6a7f4f6 100644 --- a/paddle/fluid/inference/tensorrt/convert/flash_multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/flash_multihead_matmul_op.cc @@ -110,6 +110,17 @@ class FlashMultiheadMatMulOpConverter : public OpConverter { nvinfer1::Weights weight{nvinfer1::DataType::kFLOAT, static_cast<void*>(weight_data), static_cast<int64_t>(weight_t->numel())}; +#if IS_TRT_VERSION_GE(8600) + auto* fc_weight_layer = TRT_ENGINE_ADD_LAYER( + engine_, Constant, nvinfer1::Dims3(1, n, hidden_in), weight); + auto* fc_layer = + TRT_ENGINE_ADD_LAYER(engine_, + MatrixMultiply, + *input, + nvinfer1::MatrixOperation::kNONE, + *fc_weight_layer->getOutput(0), + nvinfer1::MatrixOperation::kTRANSPOSE); +#else nvinfer1::Weights bias{}; // add shuffle for FullyConnected layer std::vector<nvinfer1::ITensor*> reshape_before_fc_shape_tensor; for (int i = 0; i < 5; i++) { reshape_before_fc_shape_tensor.push_back(Add1DConstantLayer(1)); } @@ -138,6 +149,7 @@ class FlashMultiheadMatMulOpConverter : public OpConverter { n, weight, bias); +#endif fc_layer->setName( ("multihead_matmul_fc(Output: " + output_name + ")").c_str()); // add shuffle for fc layer @@ -299,6 +311,20 @@ class FlashMultiheadMatMulOpConverter : public OpConverter { nvinfer1::Weights weight{nvinfer1::DataType::kFLOAT, static_cast<void*>(weight_data), static_cast<int64_t>(weight_tensor->numel())}; +#if IS_TRT_VERSION_GE(8600) + auto* qkv_fc_weight_layer = + TRT_ENGINE_ADD_LAYER(engine_, + Constant, + nvinfer1::Dims3(1, hidden_out, hidden_out), + weight); + qkv_fc_layers[i] = + TRT_ENGINE_ADD_LAYER(engine_, + MatrixMultiply, + *input, + nvinfer1::MatrixOperation::kNONE, + *qkv_fc_weight_layer->getOutput(0), + nvinfer1::MatrixOperation::kTRANSPOSE); +#else nvinfer1::Weights bias{}; qkv_fc_layers[i] = TRT_ENGINE_ADD_LAYER(engine_, @@ -307,6 +333,7 @@ hidden_out, weight, bias); +#endif qkv_fc_layers[i]->setName(("multihead_matmul_fc_" + std::to_string(i) + "_(Output: " + output_name + ")") .c_str()); diff --git a/paddle/fluid/inference/tensorrt/convert/grid_sampler_op.cc b/paddle/fluid/inference/tensorrt/convert/grid_sampler_op.cc index 48bcdddcba875..a2fe27590df02 100644 --- a/paddle/fluid/inference/tensorrt/convert/grid_sampler_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/grid_sampler_op.cc @@ -48,9 +48,17 @@ class GridSamplerOpConverter : public OpConverter { nvinfer1::InterpolationMode interpolationMode{ nvinfer1::InterpolationMode::kNEAREST}; if (mode == "nearest") { +#if IS_TRT_VERSION_GE(8600) + interpolationMode = nvinfer1::InterpolationMode::kNEAREST; +#else interpolationMode = nvinfer1::ResizeMode::kNEAREST; +#endif } else if (mode ==
"bilinear") { +#if IS_TRT_VERSION_GE(8600) + interpolationMode = nvinfer1::InterpolationMode::kLINEAR; +#else interpolationMode = nvinfer1::ResizeMode::kLINEAR; +#endif } nvinfer1::SampleMode sampleMode{nvinfer1::SampleMode::kFILL}; diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc index a092b3215502e..68f18bd6e7472 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc @@ -199,7 +199,7 @@ class MultiheadMatMulOpConverter : public OpConverter { float dp_probs = 1.0 / 127.0; nvinfer1::DimsHW nv_ksize(1, 1); fc_layer = TRT_ENGINE_ADD_LAYER( - engine_, Convolution, *input, n, nv_ksize, weight, bias); + engine_, ConvolutionNd, *input, n, nv_ksize, weight, bias); fc_layer->setName( ("Multihead: Convolution/FullyConnected: (Output: " + output_name + ")") @@ -567,10 +567,46 @@ class MultiheadMatMulOpConverter : public OpConverter { bias_data_tmp.data(), bias_data, bias_t->numel() * sizeof(float)); transpose_bias_v2( bias_data_tmp.data(), bias_data, head_number, head_size); - + nvinfer1::ITensor* input_shape_tensor = Shape(input); +#if IS_TRT_VERSION_GE(8600) + // add matmul and elementwise layer + auto* fc_weight_layer = TRT_ENGINE_ADD_LAYER( + engine_, Constant, nvinfer1::Dims3(1, n, hidden_in), weight); + auto* fc_matmul_layer = + TRT_ENGINE_ADD_LAYER(engine_, + MatrixMultiply, + *input, + nvinfer1::MatrixOperation::kNONE, + *fc_weight_layer->getOutput(0), + nvinfer1::MatrixOperation::kTRANSPOSE); + fc_matmul_layer->setName( + ("multihead_matmul_fc_matmul(Output: " + output_name + ")") + .c_str()); + auto* fc_bias_layer = TRT_ENGINE_ADD_LAYER( + engine_, Constant, nvinfer1::Dims3(1, 1, n), bias); + auto* fc_add_layer = + TRT_ENGINE_ADD_LAYER(engine_, + ElementWise, + *fc_matmul_layer->getOutput(0), + *fc_bias_layer->getOutput(0), + nvinfer1::ElementWiseOperation::kSUM); + fc_add_layer->setName( + ("multihead_matmul_fc_add(Output: " + output_name + ")").c_str()); + if (op_desc.HasAttr("Input_scale")) { + PADDLE_ENFORCE_EQ(op_desc.HasAttr("fc_out_threshold"), + true, + platform::errors::InvalidArgument( + "must have out threshold in multihead layers " + "in int8 mode")); + float out_scale = + PADDLE_GET_CONST(float, op_desc.GetAttr("fc_out_threshold")); + engine_->SetTensorDynamicRange(fc_add_layer->getOutput(0), + out_scale); + } + auto* fc_layer = fc_add_layer; +#else // add shuffle for FullyConnected layer std::vector reshape_before_fc_shape_tensor; - nvinfer1::ITensor* input_shape_tensor = Shape(input); for (int i = 0; i < 5; i++) { reshape_before_fc_shape_tensor.push_back(Add1DConstantLayer(1)); } @@ -595,7 +631,7 @@ class MultiheadMatMulOpConverter : public OpConverter { nvinfer1::DimsHW nv_ksize(1, 1); fc_layer = TRT_ENGINE_ADD_LAYER(engine_, - Convolution, + ConvolutionNd, *reshape_before_fc_layer->getOutput(0), n, nv_ksize, @@ -620,6 +656,7 @@ class MultiheadMatMulOpConverter : public OpConverter { } fc_layer->setName( ("multihead_matmul_fc(Output: " + output_name + ")").c_str()); +#endif // add shuffle for CustomQKVToContextPluginDynamic layer auto* reshape_after_fc_layer = @@ -769,6 +806,56 @@ class MultiheadMatMulOpConverter : public OpConverter { static_cast(bias_data), static_cast(bias_t->numel())}; +#if IS_TRT_VERSION_GE(10000) + auto* fc_weight_layer = + TRT_ENGINE_ADD_LAYER(engine_, + Constant, + nvinfer1::Dims3(1, n, hidden_in), + weight.get()); + auto* fc_matmul_layer = + 
TRT_ENGINE_ADD_LAYER(engine_, + MatrixMultiply, + *input, + nvinfer1::MatrixOperation::kNONE, + *fc_weight_layer->getOutput(0), + nvinfer1::MatrixOperation::kTRANSPOSE); + fc_matmul_layer->setName( + ("multihead_matmul_fc_matmul(Output: " + output_name + ")") + .c_str()); + auto* fc_bias_layer = TRT_ENGINE_ADD_LAYER( + engine_, Constant, nvinfer1::Dims3(1, 1, n), bias.get()); + auto* fc_add_layer = + TRT_ENGINE_ADD_LAYER(engine_, + ElementWise, + *fc_matmul_layer->getOutput(0), + *fc_bias_layer->getOutput(0), + nvinfer1::ElementWiseOperation::kSUM); + fc_add_layer->setName( + ("multihead_matmul_fc_add(Output: " + output_name + ")").c_str()); + if (op_desc.HasAttr("Input_scale")) { + PADDLE_ENFORCE_EQ(op_desc.HasAttr("fc_out_threshold"), + true, + platform::errors::InvalidArgument( + "must have out threshold in multihead layers " + "in int8 mode")); + float out_scale = + PADDLE_GET_CONST(float, op_desc.GetAttr("fc_out_threshold")); + engine_->SetTensorDynamicRange(fc_add_layer->getOutput(0), + out_scale); + } + auto* reshape_after_fc_layer = TRT_ENGINE_ADD_LAYER( + engine_, Shuffle, *fc_add_layer->getOutput(0)); + nvinfer1::Dims reshape_after_fc_layer_dim{}; + reshape_after_fc_layer_dim.nbDims = 5; + reshape_after_fc_layer_dim.d[3] = 1; + reshape_after_fc_layer_dim.d[4] = 1; + reshape_after_fc_layer->setReshapeDimensions( + reshape_after_fc_layer_dim); + reshape_after_fc_layer->setName( + ("shuffle_after_multihead_matmul(Output: " + output_name + ")") + .c_str()); + auto* fc_layer = reshape_after_fc_layer; +#else // add shuffle before fc std::vector reshape_before_fc_shape_tensor; nvinfer1::ITensor* input_shape_tensor = Shape(input); @@ -798,7 +885,7 @@ class MultiheadMatMulOpConverter : public OpConverter { nvinfer1::DimsHW nv_ksize(1, 1); fc_layer = TRT_ENGINE_ADD_LAYER(engine_, - Convolution, + ConvolutionNd, *reshape_before_fc_layer->getOutput(0), n, nv_ksize, @@ -828,6 +915,7 @@ class MultiheadMatMulOpConverter : public OpConverter { ("multihead_matmul_fc(Output: " + output_name + ")").c_str()); // no need to add shuffle after fc, just change it in +#endif // QkvToContextPluginDynamic // add qkv to context diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_roformer_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_roformer_op.cc index f849fff7ab1f2..e65dbd4832ff5 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_roformer_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_roformer_op.cc @@ -108,7 +108,53 @@ class MultiheadMatMulRoformerOpConverter : public OpConverter { TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, static_cast(bias_data), static_cast(bias_t->numel())}; - +#if IS_TRT_VERSION_GE(8600) + auto* weight_layer = TRT_ENGINE_ADD_LAYER( + engine_, Constant, nvinfer1::Dims3(1, n, m), weight.get()); + auto* bias_layer = TRT_ENGINE_ADD_LAYER( + engine_, Constant, nvinfer1::Dims3(1, 1, n), bias.get()); + auto* matmul_layer = + TRT_ENGINE_ADD_LAYER(engine_, + MatrixMultiply, + *input, + nvinfer1::MatrixOperation::kNONE, + *weight_layer->getOutput(0), + nvinfer1::MatrixOperation::kTRANSPOSE); + matmul_layer->setName( + ("multihead_matmul_matmul(Output: " + output_name + ")").c_str()); + auto* add_layer = + TRT_ENGINE_ADD_LAYER(engine_, + ElementWise, + *matmul_layer->getOutput(0), + *bias_layer->getOutput(0), + nvinfer1::ElementWiseOperation::kSUM); + add_layer->setName( + ("multihead_matmul_add(Output: " + output_name + ")").c_str()); + if (op_desc.HasAttr("fc_out_threshold")) { + PADDLE_ENFORCE_EQ( + 
op_desc.HasAttr("fc_out_threshold"), + true, + platform::errors::InvalidArgument( + "must have out threshold in multihead layers in int8 mode")); + float out_scale = + PADDLE_GET_CONST(float, op_desc.GetAttr("fc_out_threshold")); + engine_->SetTensorDynamicRange(add_layer->getOutput(0), out_scale); + } + auto* reshape_after_fc_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *add_layer->getOutput(0)); + nvinfer1::Dims reshape_after_fc_dim; + reshape_after_fc_dim.nbDims = 5; + reshape_after_fc_dim.d[0] = 0; + reshape_after_fc_dim.d[1] = 0; + reshape_after_fc_dim.d[2] = 0; + reshape_after_fc_dim.d[3] = 1; + reshape_after_fc_dim.d[4] = 1; + reshape_after_fc_layer->setReshapeDimensions(reshape_after_fc_dim); + reshape_after_fc_layer->setName( + ("shuffle_after_multihead_matmul(Output: " + output_name + ")") + .c_str()); + auto* fc_layer = reshape_after_fc_layer; +#else // add shuffle before fc nvinfer1::Dims reshape_before_fc_dim; reshape_before_fc_dim.nbDims = 5; @@ -134,7 +180,7 @@ class MultiheadMatMulRoformerOpConverter : public OpConverter { nvinfer1::DimsHW nv_ksize(1, 1); fc_layer = TRT_ENGINE_ADD_LAYER(engine_, - Convolution, + ConvolutionNd, *reshape_before_fc_layer->getOutput(0), n, nv_ksize, @@ -164,6 +210,7 @@ class MultiheadMatMulRoformerOpConverter : public OpConverter { ("multihead_matmul_fc(Output: " + output_name + ")").c_str()); // no need to add shuffle after fc, just change it in +#endif // QkvToContextPluginDynamic // add qkv to context diff --git a/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc b/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc index d03940dd40aa5..2f67bd6523cf0 100644 --- a/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc @@ -46,7 +46,14 @@ class NearestInterpolateOpConverter : public OpConverter { auto out_w = PADDLE_GET_CONST(int, op_desc.GetAttr("out_w")); auto layer = TRT_ENGINE_ADD_LAYER(engine_, Resize, *input); +#if IS_TRT_VERSION_GE(8600) + if (align_corners) { + layer->setCoordinateTransformation( + nvinfer1::ResizeCoordinateTransformation::kALIGN_CORNERS); + } +#else layer->setAlignCorners(align_corners); +#endif auto in_dim = input->getDimensions(); diff --git a/paddle/fluid/inference/tensorrt/convert/nearest_interp_v2_op.cc b/paddle/fluid/inference/tensorrt/convert/nearest_interp_v2_op.cc index feac662ff0441..56d08821487e7 100644 --- a/paddle/fluid/inference/tensorrt/convert/nearest_interp_v2_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/nearest_interp_v2_op.cc @@ -44,7 +44,14 @@ class NearestInterpolateV2OpConverter : public OpConverter { auto out_w = PADDLE_GET_CONST(int, op_desc.GetAttr("out_w")); auto layer = TRT_ENGINE_ADD_LAYER(engine_, Resize, *input); +#if IS_TRT_VERSION_GE(8600) + if (align_corners) { + layer->setCoordinateTransformation( + nvinfer1::ResizeCoordinateTransformation::kALIGN_CORNERS); + } +#else layer->setAlignCorners(align_corners); +#endif auto in_dim = input->getDimensions(); diff --git a/paddle/fluid/inference/tensorrt/convert/pad_op.cc b/paddle/fluid/inference/tensorrt/convert/pad_op.cc index 56a662a7254ee..12b86bf96b06d 100644 --- a/paddle/fluid/inference/tensorrt/convert/pad_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pad_op.cc @@ -41,7 +41,7 @@ class PadOpConverter : public OpConverter { nvinfer1::DimsHW post_pad(paddings[pad_size - 3], paddings[pad_size - 1]); auto* layer = TRT_ENGINE_ADD_LAYER(engine_, - Padding, + PaddingNd, *const_cast(input), pre_pad, post_pad); diff --git 
a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 317a519b0cc44..d38945a733b8f 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -146,8 +146,9 @@ class Pool2dOpConverter : public OpConverter { // compute for (int i = 0; i < 2; ++i) { int out_size = (input_shape.d[2 + i] + strides[i] - 1) / strides[i]; - int pad_sum = std::max( - (out_size - 1) * strides[i] + ksize[i] - input_shape.d[2 + i], 0); + int pad_sum = std::max((out_size - 1) * strides[i] + ksize[i] - + static_cast(input_shape.d[2 + i]), + 0); int pad_0 = pad_sum / 2; int pad_1 = pad_sum - pad_0; paddings[i * 2] = pad_0; @@ -179,7 +180,7 @@ class Pool2dOpConverter : public OpConverter { ((g_post_pad.w() > 0 && input_shape.d[input_dims - 2] > 0) || (g_post_pad.h() > 0 && input_shape.d[input_dims - 1] > 0))) { auto *pad_layer = TRT_ENGINE_ADD_LAYER( - engine_, Padding, *input1, g_pre_pad, g_post_pad); + engine_, PaddingNd, *input1, g_pre_pad, g_post_pad); PADDLE_ENFORCE_NOT_NULL( pad_layer, platform::errors::Fatal( @@ -189,9 +190,9 @@ class Pool2dOpConverter : public OpConverter { } auto *pool_layer = TRT_ENGINE_ADD_LAYER( - engine_, Pooling, *input1, nv_pool_type, nv_ksize); - pool_layer->setStride(nv_strides); - pool_layer->setPadding(nv_paddings); + engine_, PoolingNd, *input1, nv_pool_type, nv_ksize); + pool_layer->setStrideNd(nv_strides); + pool_layer->setPaddingNd(nv_paddings); pool_layer->setAverageCountExcludesPadding(exclusive); if (padding_algorithm == "SAME") { pool_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); @@ -199,9 +200,9 @@ class Pool2dOpConverter : public OpConverter { layer = pool_layer; } else if (!adaptive && !global_pooling && ceil_mode) { auto *pool_layer = TRT_ENGINE_ADD_LAYER( - engine_, Pooling, *input1, nv_pool_type, nv_ksize); - pool_layer->setStride(nv_strides); - pool_layer->setPadding(nv_paddings); + engine_, PoolingNd, *input1, nv_pool_type, nv_ksize); + pool_layer->setStrideNd(nv_strides); + pool_layer->setPaddingNd(nv_paddings); pool_layer->setAverageCountExcludesPadding(exclusive); if (padding_algorithm == "SAME") { pool_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); @@ -239,7 +240,7 @@ class Pool2dOpConverter : public OpConverter { if (global_pooling == true && adaptive == false) { auto *pool_layer = TRT_ENGINE_ADD_LAYER( - engine_, Pooling, *input1, nv_pool_type, nv_ksize); + engine_, PoolingNd, *input1, nv_pool_type, nv_ksize); PADDLE_ENFORCE_NOT_NULL( pool_layer, platform::errors::Fatal( @@ -271,7 +272,7 @@ class Pool2dOpConverter : public OpConverter { &post_pad, input_dims); auto *pad_layer = TRT_ENGINE_ADD_LAYER( - engine_, Padding, *input1, pre_pad, post_pad); + engine_, PaddingNd, *input1, pre_pad, post_pad); PADDLE_ENFORCE_NOT_NULL( pad_layer, @@ -281,13 +282,13 @@ class Pool2dOpConverter : public OpConverter { input1 = pad_layer->getOutput(0); auto *pool_layer = TRT_ENGINE_ADD_LAYER( - engine_, Pooling, *input1, nv_pool_type, nv_ksize); + engine_, PoolingNd, *input1, nv_pool_type, nv_ksize); PADDLE_ENFORCE_NOT_NULL( pool_layer, platform::errors::Fatal( "trt pool layer in converter could not be created.")); - pool_layer->setStride(nv_strides); - pool_layer->setPadding(nv_paddings); + pool_layer->setStrideNd(nv_strides); + pool_layer->setPaddingNd(nv_paddings); if (padding_algorithm == "SAME") { pool_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); } @@ -322,7 +323,7 @@ class Pool2dOpConverter : public OpConverter 
{ if ((g_post_pad.w() > 0 || g_post_pad.h() > 0) && (padding_algorithm != "SAME") && !ceil_mode) { auto *pad_layer = TRT_ENGINE_ADD_LAYER( - engine_, Padding, *input1, g_pre_pad, g_post_pad); + engine_, PaddingNd, *input1, g_pre_pad, g_post_pad); PADDLE_ENFORCE_NOT_NULL( pad_layer, platform::errors::Fatal( @@ -332,13 +333,13 @@ class Pool2dOpConverter : public OpConverter { } #endif auto *pool_layer = TRT_ENGINE_ADD_LAYER( - engine_, Pooling, *input1, nv_pool_type, nv_ksize); + engine_, PoolingNd, *input1, nv_pool_type, nv_ksize); PADDLE_ENFORCE_NOT_NULL( pool_layer, platform::errors::Fatal( "trt pool layer in converter could not be created.")); - pool_layer->setStride(nv_strides); - pool_layer->setPadding(nv_paddings); + pool_layer->setStrideNd(nv_strides); + pool_layer->setPaddingNd(nv_paddings); if (padding_algorithm == "SAME") { pool_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); } diff --git a/paddle/fluid/inference/tensorrt/convert/slice_op.cc b/paddle/fluid/inference/tensorrt/convert/slice_op.cc index 0e2382a2d3fa6..4dbeff4761401 100644 --- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc @@ -161,12 +161,14 @@ class SliceOpConverter : public OpConverter { input_dims.d[0] = 1; // fake batchsize, not useful here for (size_t i = 0; i < axes.size(); i++) { if (starts[i] < 0) { - starts[i] = std::max(starts[i] + input_dims.d[axes[i]], 0); + starts[i] = + std::max(starts[i] + static_cast<int>(input_dims.d[axes[i]]), 0); } if (ends[i] < 0) { - ends[i] = std::max(ends[i] + input_dims.d[axes[i]], 0); + ends[i] = + std::max(ends[i] + static_cast<int>(input_dims.d[axes[i]]), 0); } - ends[i] = std::min(ends[i], input_dims.d[axes[i]]); + ends[i] = std::min(ends[i], static_cast<int>(input_dims.d[axes[i]])); PADDLE_ENFORCE_GT( ends[i], starts[i], diff --git a/paddle/fluid/inference/tensorrt/convert/strided_slice_op.cc b/paddle/fluid/inference/tensorrt/convert/strided_slice_op.cc index 6b721d37d205f..e7842904a8b91 100644 --- a/paddle/fluid/inference/tensorrt/convert/strided_slice_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/strided_slice_op.cc @@ -52,12 +52,14 @@ class StridedSliceOpConverter : public OpConverter { input_dims.d[0] = 1; // fake batchsize, not useful here for (size_t i = 0; i < axes.size(); i++) { if (starts[i] < 0) { - starts[i] = std::max(starts[i] + input_dims.d[axes[i]], 0); + starts[i] = + std::max(starts[i] + static_cast<int>(input_dims.d[axes[i]]), 0); } if (ends[i] < 0) { - ends[i] = std::max(ends[i] + input_dims.d[axes[i]], 0); + ends[i] = + std::max(ends[i] + static_cast<int>(input_dims.d[axes[i]]), 0); } - ends[i] = std::min(ends[i], input_dims.d[axes[i]]); + ends[i] = std::min(ends[i], static_cast<int>(input_dims.d[axes[i]])); PADDLE_ENFORCE_GT( ends[i], starts[i], diff --git a/paddle/fluid/inference/tensorrt/convert/tile_op.cc b/paddle/fluid/inference/tensorrt/convert/tile_op.cc index c02fe619aa30d..75c3e696d0989 100644 --- a/paddle/fluid/inference/tensorrt/convert/tile_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/tile_op.cc @@ -101,7 +101,11 @@ class TileOpConverter : public OpConverter { layer->setInput(1, *start_tensor); layer->setInput(2, *output_shape_tensor); layer->setInput(3, *stride_tensor); +#if IS_TRT_VERSION_GE(8600) + layer->setMode(nvinfer1::SampleMode::kWRAP); +#else layer->setMode(nvinfer1::SliceMode::kWRAP); +#endif ReplenishLayerAndOutput(layer, "tile", {output_name}, test_mode); } else { @@ -132,7 +136,11 @@ class TileOpConverter : public OpConverter { } auto layer = TRT_ENGINE_ADD_LAYER(
engine_, Slice, *input, input_shape, output_dim, output_stride); +#if IS_TRT_VERSION_GE(8600) + layer->setMode(nvinfer1::SampleMode::kWRAP); +#else layer->setMode(nvinfer1::SliceMode::kWRAP); +#endif ReplenishLayerAndOutput(layer, "tile", {output_name}, test_mode); } diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index 347f6f500c7c8..d04ae0c778e78 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -201,11 +201,21 @@ class TRTConvertValidation { // Bind input and output for TRT. const int num_bindings = input_output_names.size(); std::vector<void*> buffers(num_bindings); - +#if IS_TRT_VERSION_GE(10000) + std::unordered_map<std::string, int> tensor_index; + for (int i = 0; i < engine_->engine()->getNbIOTensors(); ++i) { + auto tensor_name = engine_->engine()->getIOTensorName(i); + tensor_index[std::string(tensor_name)] = i; + } +#endif for (const std::string& name : input_output_names) { auto* var = scope_.FindVar(name); auto* tensor = var->GetMutable<phi::DenseTensor>(); +#if IS_TRT_VERSION_GE(10000) + const int bind_index = tensor_index[std::string(name.c_str())]; +#else const int bind_index = engine_->engine()->getBindingIndex(name.c_str()); +#endif buffers[bind_index] = static_cast<void*>(tensor->mutable_data(place_)); } diff --git a/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta.cc b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta.cc index ea333754f894b..e6bc25af044dc 100644 --- a/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta.cc +++ b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta.cc @@ -42,12 +42,15 @@ class ExprWrapper { const ExprWrapper& b, nvinfer1::DimensionOperation op) { ExprWrapper result = {}; + assert(a.expr); + assert(b.expr); if (a.expr_builder) { result.expr_builder = a.expr_builder; } if (b.expr_builder) { result.expr_builder = b.expr_builder; } + assert(result.expr_builder); assert(result.expr); result.expr = result.expr_builder->operation(op, *a.expr, *b.expr); return result; } diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 2a14702b59d81..41db064cb0c9c 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -101,7 +101,11 @@ nvinfer1::IExecutionContext *TensorRTEngine::context() { if (with_dynamic_shape()) { // need new profile if it's not the first if (cur_profile_num_ > 0) { +#if IS_TRT_VERSION_GE(8600) + infer_context->setOptimizationProfileAsync(cur_profile_num_, nullptr); +#else infer_context->setOptimizationProfile(cur_profile_num_); +#endif } profile_index_[predictor_id_per_thread] = cur_profile_num_; ++cur_profile_num_; @@ -174,9 +178,10 @@ bool TensorRTEngine::Enqueue(nvinfer1::IExecutionContext *context, #if IS_TRT_VERSION_GE(8500) for (size_t j = 0; j < buffers->size(); ++j) { - auto name = context->getEngine().getBindingName(j); - if (context->getEngine().isShapeBinding(j) && - context->getEngine().bindingIsInput(j)) { + auto name = context->getEngine().getIOTensorName(j); + if (context->getEngine().isShapeInferenceIO(name) && + context->getEngine().getTensorIOMode(name) == + nvinfer1::TensorIOMode::kINPUT) { continue; } else { context->setTensorAddress(name, (*buffers)[j]); } @@ -425,7 +430,11 @@ void TensorRTEngine::FreezeNetwork() { "Build TensorRT cuda engine failed! Please recheck 
Please recheck " "you configurations related to paddle-TensorRT.")); +#if IS_TRT_VERSION_GE(10000) + binding_num_ = infer_engine_->getNbIOTensors(); +#else binding_num_ = infer_engine_->getNbBindings(); +#endif // reset status for dynamic shape clone if (max_profile_num_ > 1) { infer_context_.clear(); @@ -647,7 +656,11 @@ void TensorRTEngine::Deserialize(const std::string &engine_serialized_data) { "generating serialization file and doing inference are " "consistent.")); +#if IS_TRT_VERSION_GE(10000) + binding_num_ = infer_engine_->getNbIOTensors(); +#else binding_num_ = infer_engine_->getNbBindings(); +#endif // for engine context memory sharing if (params_.context_memory_sharing) { inference::Singleton::Global() diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h index 3f0152b2909c0..e895d1e943c7d 100644 --- a/paddle/fluid/inference/tensorrt/helper.h +++ b/paddle/fluid/inference/tensorrt/helper.h @@ -104,7 +104,7 @@ template struct Destroyer { void operator()(T* x) { if (x) { - x->destroy(); + delete x; } } }; diff --git a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc index 357d04c3c6cc6..d87c9af8cfa67 100644 --- a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc @@ -124,7 +124,11 @@ TEST_F(TensorRTDynamicShapeValueEngineTest, test_trt_dynamic_shape_value) { platform::errors::InvalidArgument("TRT shuffle layer building failed.")); engine_->DeclareOutput(layer, 0, "y"); engine_->FreezeNetwork(); +#if IS_TRT_VERSION_GE(8600) + ASSERT_EQ(engine_->engine()->getNbIOTensors(), 3); +#else ASSERT_EQ(engine_->engine()->getNbBindings(), 3); +#endif std::vector x_v(8 * 32); for (int i = 0; i < 8 * 32; i++) { @@ -150,9 +154,10 @@ TEST_F(TensorRTDynamicShapeValueEngineTest, test_trt_dynamic_shape_value) { buffers[2] = reinterpret_cast(y_gpu_data); #if IS_TRT_VERSION_GE(8500) for (size_t i = 0; i < buffers.size(); i++) { - auto name = engine_->engine()->getBindingName(i); - if (engine_->engine()->isShapeBinding(i) && - engine_->engine()->bindingIsInput(i)) { + auto name = engine_->engine()->getIOTensorName(i); + if (engine_->engine()->isShapeInferenceIO(name) && + engine_->engine()->getTensorIOMode(name) == + nvinfer1::TensorIOMode::kINPUT) { engine_->context()->setTensorAddress(name, shape_v.data()); } else { engine_->context()->setTensorAddress(name, buffers[i]); @@ -168,7 +173,7 @@ TEST_F(TensorRTDynamicShapeValueEngineTest, test_trt_dynamic_shape_value) { ASSERT_EQ(y_cpu[0], 0); ASSERT_EQ(y_cpu[1], 1); #if IS_TRT_VERSION_GE(8500) - const char *name1 = engine_->engine()->getBindingName(2); + const char *name1 = engine_->engine()->getIOTensorName(2); auto dims = engine_->context()->getTensorShape(name1); #else auto dims = engine_->context()->getBindingDimensions(2); @@ -447,7 +452,11 @@ TEST_F(TensorRTDynamicTestFusedTokenPrune, test_fused_token_prune) { } engine_->FreezeNetwork(); +#if IS_TRT_VERSION_GE(8600) + ASSERT_EQ(engine_->engine()->getNbIOTensors(), 6); +#else ASSERT_EQ(engine_->engine()->getNbBindings(), 6); +#endif LOG(INFO) << "create input"; std::vector attn_v(16); for (int j = 0; j < 4; ++j) { @@ -645,7 +654,11 @@ TEST_F(TensorRTDynamicTestFusedTokenPruneHalf, test_fused_token_prune) { } engine_->FreezeNetwork(); +#if IS_TRT_VERSION_GE(8600) + ASSERT_EQ(engine_->engine()->getNbIOTensors(), 6); +#else ASSERT_EQ(engine_->engine()->getNbBindings(), 6); +#endif LOG(INFO) << "create input"; std::vector attn_v(16); 
for (int j = 0; j < 4; ++j) { diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index cce1e2888a391..4c08da6d060eb 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -52,6 +52,7 @@ class TensorRTEngineTest : public ::testing::Test { TensorRTEngine::ConstructionParams params; params.max_batch_size = 10; params.max_workspace_size = 1 << 10; + params.with_dynamic_shape = true; engine_ = std::make_unique<TensorRTEngine>(params); engine_->InitNetwork(); } @@ -88,16 +89,41 @@ TEST_F(TensorRTEngineTest, add_layer) { nvinfer1::DataType::kFLOAT, raw_bias.data(), size); auto *x = engine_->DeclareInput( "x", nvinfer1::DataType::kFLOAT, nvinfer1::Dims3{1, 1, 1}); - auto *fc_layer = TRT_ENGINE_ADD_LAYER( - engine_, FullyConnected, *x, size, weight.get(), bias.get()); - PADDLE_ENFORCE_NOT_NULL(fc_layer, - platform::errors::InvalidArgument( - "TRT fully connected layer building failed.")); + auto *weight_layer = TRT_ENGINE_ADD_LAYER( + engine_, Constant, nvinfer1::Dims3{1, 1, 1}, weight.get()); + auto *bias_layer = TRT_ENGINE_ADD_LAYER( + engine_, Constant, nvinfer1::Dims3{1, 1, 1}, bias.get()); + auto *matmul_layer = + TRT_ENGINE_ADD_LAYER(engine_, + MatrixMultiply, + *x, + nvinfer1::MatrixOperation::kNONE, + *weight_layer->getOutput(0), + nvinfer1::MatrixOperation::kTRANSPOSE); + PADDLE_ENFORCE_NOT_NULL( + matmul_layer, + platform::errors::InvalidArgument( + "The TRT MatrixMultiply layer cannot be null. There is something " + "wrong with the TRT network building and layer creation.")); + auto *add_layer = TRT_ENGINE_ADD_LAYER(engine_, + ElementWise, + *matmul_layer->getOutput(0), + *bias_layer->getOutput(0), + nvinfer1::ElementWiseOperation::kSUM); + PADDLE_ENFORCE_NOT_NULL( + add_layer, + platform::errors::InvalidArgument( + "The TRT elementwise layer cannot be null. There is something wrong " + "with the TRT network building and layer creation.")); - engine_->DeclareOutput(fc_layer, 0, "y"); + engine_->DeclareOutput(add_layer, 0, "y"); LOG(INFO) << "freeze network"; engine_->FreezeNetwork(); +#if IS_TRT_VERSION_GE(8600) + ASSERT_EQ(engine_->engine()->getNbIOTensors(), 2); +#else ASSERT_EQ(engine_->engine()->getNbBindings(), 2); +#endif // fill in real data std::vector<float> x_v = {1234}; @@ -132,16 +158,41 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) { nvinfer1::DataType::kFLOAT, raw_weight.data(), 4); TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias.data(), 2); auto *x = engine_->DeclareInput( - "x", nvinfer1::DataType::kFLOAT, nvinfer1::Dims3{1, 2, 1}); - auto *fc_layer = TRT_ENGINE_ADD_LAYER( - engine_, FullyConnected, *x, 2, weight.get(), bias.get()); - PADDLE_ENFORCE_NOT_NULL(fc_layer, - platform::errors::InvalidArgument( - "TRT fully connected layer building failed.")); + "x", nvinfer1::DataType::kFLOAT, nvinfer1::Dims3{1, 1, 2}); + auto *weight_layer = TRT_ENGINE_ADD_LAYER( + engine_, Constant, nvinfer1::Dims3{1, 2, 2}, weight.get()); + auto *bias_layer = TRT_ENGINE_ADD_LAYER( + engine_, Constant, nvinfer1::Dims3{1, 1, 2}, bias.get()); + auto *matmul_layer = + TRT_ENGINE_ADD_LAYER(engine_, + MatrixMultiply, + *x, + nvinfer1::MatrixOperation::kNONE, + *weight_layer->getOutput(0), + nvinfer1::MatrixOperation::kTRANSPOSE); + PADDLE_ENFORCE_NOT_NULL( + matmul_layer, + platform::errors::InvalidArgument( + "The TRT MatrixMultiply layer cannot be null. 
There is something " + "wrong with the TRT network building and layer creation.")); + auto *add_layer = TRT_ENGINE_ADD_LAYER(engine_, + ElementWise, + *matmul_layer->getOutput(0), + *bias_layer->getOutput(0), + nvinfer1::ElementWiseOperation::kSUM); + PADDLE_ENFORCE_NOT_NULL( + add_layer, + platform::errors::InvalidArgument( + "The TRT elementwise layer cannot be null. There is something wrong " + "with the TRT network building and layer creation.")); - engine_->DeclareOutput(fc_layer, 0, "y"); + engine_->DeclareOutput(add_layer, 0, "y"); engine_->FreezeNetwork(); +#if IS_TRT_VERSION_GE(8600) + ASSERT_EQ(engine_->engine()->getNbIOTensors(), 2); +#else ASSERT_EQ(engine_->engine()->getNbBindings(), 2); +#endif // fill in real data std::vector x_v = {1.0, 2.0}; @@ -161,8 +212,9 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) { auto dims = engine_->GetITensor("y")->getDimensions(); ASSERT_EQ(dims.nbDims, 3); - ASSERT_EQ(dims.d[0], 2); + ASSERT_EQ(dims.d[0], 1); ASSERT_EQ(dims.d[1], 1); + ASSERT_EQ(dims.d[2], 2); ASSERT_EQ(y_cpu[0], 4.5); ASSERT_EQ(y_cpu[1], 14.5); @@ -178,9 +230,9 @@ TEST_F(TensorRTEngineTest, test_conv2d) { nvinfer1::DataType::kFLOAT, raw_weight.data(), 9); TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias.data(), 1); auto *x = engine_->DeclareInput( - "x", nvinfer1::DataType::kFLOAT, nvinfer1::Dims3{1, 3, 3}); + "x", nvinfer1::DataType::kFLOAT, nvinfer1::Dims4{2, 1, 3, 3}); auto *conv_layer = TRT_ENGINE_ADD_LAYER(engine_, - Convolution, + ConvolutionNd, *x, 1, nvinfer1::DimsHW{3, 3}, @@ -189,12 +241,16 @@ TEST_F(TensorRTEngineTest, test_conv2d) { PADDLE_ENFORCE_NOT_NULL(conv_layer, platform::errors::InvalidArgument( "TRT convolution layer building failed.")); - conv_layer->setStride(nvinfer1::DimsHW{1, 1}); - conv_layer->setPadding(nvinfer1::DimsHW{1, 1}); + conv_layer->setStrideNd(nvinfer1::Dims2{1, 1}); + conv_layer->setPaddingNd(nvinfer1::Dims2{1, 1}); engine_->DeclareOutput(conv_layer, 0, "y"); engine_->FreezeNetwork(); +#if IS_TRT_VERSION_GE(8600) + ASSERT_EQ(engine_->engine()->getNbIOTensors(), 2); +#else ASSERT_EQ(engine_->engine()->getNbBindings(), 2); +#endif // fill in real data std::vector x_v = {1.0, @@ -236,22 +292,26 @@ TEST_F(TensorRTEngineTest, test_conv2d) { TEST_F(TensorRTEngineTest, test_pool2d) { // Weight in CPU memory. 
auto *x = engine_->DeclareInput( - "x", nvinfer1::DataType::kFLOAT, nvinfer1::Dims3{1, 2, 2}); + "x", nvinfer1::DataType::kFLOAT, nvinfer1::Dims4{2, 1, 2, 2}); std::vector<void*> buffers(2); // TRT binded inputs nvinfer1::PoolingType pool_t = nvinfer1::PoolingType::kAVERAGE; auto *pool_layer = TRT_ENGINE_ADD_LAYER( - engine_, Pooling, *x, pool_t, nvinfer1::DimsHW{2, 2}); + engine_, PoolingNd, *x, pool_t, nvinfer1::DimsHW{2, 2}); PADDLE_ENFORCE_NOT_NULL( pool_layer, platform::errors::InvalidArgument("TRT pooling layer building failed.")); - pool_layer->setStride(nvinfer1::DimsHW{1, 1}); - pool_layer->setPadding(nvinfer1::DimsHW{0, 0}); + pool_layer->setStrideNd(nvinfer1::Dims2{1, 1}); + pool_layer->setPaddingNd(nvinfer1::Dims2{0, 0}); engine_->DeclareOutput(pool_layer, 0, "y"); engine_->FreezeNetwork(); +#if IS_TRT_VERSION_GE(8600) + ASSERT_EQ(engine_->engine()->getNbIOTensors(), 2); +#else ASSERT_EQ(engine_->engine()->getNbBindings(), 2); +#endif // fill in real data std::vector<float> x_v = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0}; diff --git a/paddle/fluid/inference/tensorrt/test_tensorrt.cc b/paddle/fluid/inference/tensorrt/test_tensorrt.cc index 4949421db5f60..dc0b78b7495b2 100644 --- a/paddle/fluid/inference/tensorrt/test_tensorrt.cc +++ b/paddle/fluid/inference/tensorrt/test_tensorrt.cc @@ -78,32 +78,53 @@ nvinfer1::IHostMemory* CreateNetwork() { ScopedWeights weights(2.); ScopedWeights bias(3.); - nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); + nvinfer1::INetworkDefinition* network = builder->createNetworkV2( + 1U << static_cast<uint32_t>( + nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); // Add the input auto input = network->addInput( kInputTensor, nvinfer1::DataType::kFLOAT, nvinfer1::Dims3{1, 1, 1}); EXPECT_NE(input, nullptr); + // Add the constant layer for weight + auto weight_tensor = + network->addConstant(nvinfer1::Dims3{1, 1, 1}, weights.get()) + ->getOutput(0); + // Add the constant layer for bias + auto bias_tensor = + network->addConstant(nvinfer1::Dims3{1, 1, 1}, bias.get())->getOutput(0); // Add the hidden layer. - auto layer = network->addFullyConnected(*input, 1, weights.get(), bias.get()); - EXPECT_NE(layer, nullptr); + auto matmul_layer = + network->addMatrixMultiply(*input, + nvinfer1::MatrixOperation::kNONE, + *weight_tensor, + nvinfer1::MatrixOperation::kTRANSPOSE); + auto add_layer = + network->addElementWise(*matmul_layer->getOutput(0), + *bias_tensor, + nvinfer1::ElementWiseOperation::kSUM); + EXPECT_NE(add_layer, nullptr); // Mark the output. - auto output = layer->getOutput(0); + auto output = add_layer->getOutput(0); output->setName(kOutputTensor); network->markOutput(*output); - // Build the engine. - builder->setMaxBatchSize(1); #if IS_TRT_VERSION_GE(8300) config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 1 << 10); #else config->setMaxWorkspaceSize(1 << 10); #endif - auto engine = builder->buildEngineWithConfig(*network, *config); +#if IS_TRT_VERSION_GE(8600) + nvinfer1::IHostMemory* model = + builder->buildSerializedNetwork(*network, *config); + EXPECT_NE(model, nullptr); +#else + auto* engine = builder->buildEngineWithConfig(*network, *config); EXPECT_NE(engine, nullptr); // Serialize the engine to create a model, then close. 
nvinfer1::IHostMemory* model = engine->serialize(); - network->destroy(); - engine->destroy(); - builder->destroy(); + delete engine; +#endif + delete network; + delete builder; return model; } @@ -112,6 +133,30 @@ void Execute(nvinfer1::IExecutionContext* context, float* output) { const nvinfer1::ICudaEngine& engine = context->getEngine(); // Two binds, input and output + cudaStream_t stream; + ASSERT_EQ(0, cudaStreamCreate(&stream)); +#if IS_TRT_VERSION_GE(8600) + ASSERT_EQ(engine.getNbIOTensors(), 2); + void* buffers[2]; + for (int i = 0; i < 2; ++i) { + ASSERT_EQ(0, cudaMalloc(&buffers[i], sizeof(float))); + auto tensor_name = engine.getIOTensorName(i); + context->setTensorAddress(tensor_name, buffers[i]); + } + ASSERT_EQ( + 0, + cudaMemcpyAsync( + buffers[0], input, sizeof(float), cudaMemcpyHostToDevice, stream)); + context->enqueueV3(stream); + ASSERT_EQ( + 0, + cudaMemcpyAsync( + output, buffers[1], sizeof(float), cudaMemcpyDeviceToHost, stream)); + cudaStreamSynchronize(stream); + cudaStreamDestroy(stream); + ASSERT_EQ(0, cudaFree(buffers[0])); + ASSERT_EQ(0, cudaFree(buffers[1])); +#else ASSERT_EQ(engine.getNbBindings(), 2); const int input_index = engine.getBindingIndex(kInputTensor); const int output_index = engine.getBindingIndex(kOutputTensor); std::vector<void*> buffers(2); ASSERT_EQ(0, cudaMalloc(&buffers[input_index], sizeof(float))); ASSERT_EQ(0, cudaMalloc(&buffers[output_index], sizeof(float))); - cudaStream_t stream; ASSERT_EQ(0, cudaStreamCreate(&stream)); // Copy the input to the GPU, execute the network, and copy the output back. ASSERT_EQ(0, @@ -141,6 +185,7 @@ void Execute(nvinfer1::IExecutionContext* context, cudaStreamDestroy(stream); ASSERT_EQ(0, cudaFree(buffers[input_index])); ASSERT_EQ(0, cudaFree(buffers[output_index])); +#endif } TEST(TensorrtTest, BasicFunction) { @@ -151,8 +196,8 @@ TEST(TensorrtTest, BasicFunction) { Logger logger; nvinfer1::IRuntime* runtime = createInferRuntime(&logger); nvinfer1::ICudaEngine* engine = - runtime->deserializeCudaEngine(model->data(), model->size(), nullptr); - model->destroy(); + runtime->deserializeCudaEngine(model->data(), model->size()); + delete model; nvinfer1::IExecutionContext* context = engine->createExecutionContext(); // Execute the network. @@ -162,7 +207,7 @@ TEST(TensorrtTest, BasicFunction) { EXPECT_EQ(output, input * 2 + 3); // Destroy the engine. - context->destroy(); - engine->destroy(); - runtime->destroy(); + delete context; + delete engine; + delete runtime; } diff --git a/paddle/fluid/ir_adaptor/translator/op_translator.cc b/paddle/fluid/ir_adaptor/translator/op_translator.cc index a06de446ec9e1..005f73e7b6427 100644 --- a/paddle/fluid/ir_adaptor/translator/op_translator.cc +++ b/paddle/fluid/ir_adaptor/translator/op_translator.cc @@ -2673,6 +2673,7 @@ struct ElementwiseTranscriber : public OpTranscriber { pir::Value y_new; if (std::find(y_shape.begin(), y_shape.end(), -1) == y_shape.end()) { std::vector<int64_t> y_new_shape(y_shape); + y_new_shape.insert(y_new_shape.begin(), axis, 1); for (int i = 0; i < append_size; i++) { y_new_shape.push_back(1); } diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc index 788eb31977f9b..137c6b9d89a75 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/distributed/nccl_comm_context.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -39,6 +40,17 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel<T> { int root = ctx.Attr<int>("root"); + auto map = distributed::ProcessGroupMapFromGid::getInstance(); + if (map->has(rid)) { + distributed::ProcessGroup* pg = map->get(rid); + auto b_opts = distributed::BroadcastOptions(); + b_opts.source_rank = rid; + b_opts.source_root = root; + auto task = pg->Broadcast(out, *x, b_opts, false); + task->Wait(); + return; + } + gpuStream_t stream = ctx.cuda_device_context().stream(); const auto& comm_context_manager = phi::distributed::CommContextManager::GetInstance(); diff --git a/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc b/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc index 4b5afa4079c9c..5b4be73c80f3a 100644 --- a/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc @@ -17,7 +17,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_RCCL) #include #endif -#include +#include #include #include diff --git a/paddle/fluid/operators/controlflow/pylayer_op.cc b/paddle/fluid/operators/controlflow/pylayer_op.cc index 08ac56226b17f..0a6cb8aac83c0 100644 --- a/paddle/fluid/operators/controlflow/pylayer_op.cc +++ b/paddle/fluid/operators/controlflow/pylayer_op.cc @@ -91,7 +91,7 @@ class PyLayerForwardOp : public PyLayerOp { private: void RunImpl(const framework::Scope &scope, - const phi::Place &dev_place) const { + const phi::Place &dev_place) const override { auto *scope_var = scope.FindVar(Output(kScope)); PADDLE_ENFORCE_NOT_NULL( scope_var, diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu deleted file mode 100644 index a489454ff12a9..0000000000000 --- a/paddle/fluid/operators/optimizers/sgd_op.cu +++ /dev/null @@ -1,191 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/operators/optimizers/sgd_op.h" -#include "paddle/phi/backends/gpu/gpu_primitives.h" -#include "paddle/phi/common/amp_type_traits.h" - -namespace paddle { -namespace operators { - -namespace { - -template <typename T, typename MT> -__global__ void SGDKernelMT(const T* param, - const T* grad, - const T* learning_rate, - const int num, - T* param_out, - const MT* master_param, - MT* master_param_out) { - MT lr = static_cast<MT>(learning_rate[0]); - CUDA_KERNEL_LOOP(i, num) { - MT p_data = master_param ? 
master_param[i] : static_cast(param[i]); - MT g_data = static_cast(grad[i]); - p_data = p_data - lr * g_data; - param_out[i] = static_cast(p_data); - if (master_param_out) { - master_param_out[i] = p_data; - } - } -} - -template -__global__ void SparseSGDFunctorKernel(const T* selected_rows, - const int64_t* rows, - const T* learning_rate, - T* tensor_out, - int64_t row_numel, - int64_t limit) { - for (int64_t i = blockIdx.x; i < limit; i += gridDim.x) { - const T* selected_rows_ptr = selected_rows + i * row_numel; - T* tensor_out_ptr = tensor_out + rows[i] * row_numel; - for (int64_t index = threadIdx.x; index < row_numel; index += blockDim.x) { - // Since index in rows of SelectedRows can be duplicate, we have to use - // Atomic Operation to avoid concurrent write error. - phi::CudaAtomicAdd( - tensor_out_ptr + index, - -static_cast(1.0) * learning_rate[0] * selected_rows_ptr[index]); - } - } -} -} // namespace - -template -class SGDOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE_EQ(param_var->IsType(), - true, - phi::errors::InvalidArgument( - "The Var(%s)'s type should be phi::DenseTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - paddle::framework::ToTypeName(param_var->Type()))); - - using MPDType = typename phi::dtype::MPTypeTrait::Type; - - auto* param = ctx.Input("Param"); - auto* param_out = ctx.Output("ParamOut"); - auto* learning_rate = ctx.Input("LearningRate"); - - auto* grad_var = ctx.InputVar("Grad"); - - const bool multi_precision = ctx.Attr("multi_precision"); - const phi::DenseTensor* master_param = nullptr; - phi::DenseTensor* master_param_out = nullptr; - if (multi_precision) { - bool has_master = - ctx.HasInput("MasterParam") && ctx.HasOutput("MasterParamOut"); - PADDLE_ENFORCE_EQ(has_master, - true, - phi::errors::InvalidArgument( - "The Input(MasterParam) and Output(MasterParamOut) " - "should not be null when " - "the attr `multi_precision` is true")); - master_param = ctx.Input("MasterParam"); - master_param_out = ctx.Output("MasterParamOut"); - } - const MPDType* master_in_data = - multi_precision ? master_param->data() : nullptr; - MPDType* master_out_data = - multi_precision - ? master_param_out->mutable_data(ctx.GetPlace()) - : nullptr; - - // Actually, all tensors are phi::DenseTensor except SelectedRows. - if (grad_var->IsType()) { - auto* grad = ctx.Input("Grad"); - - int block = 512; - int grid = (param->numel() + block - 1) / block; - - SGDKernelMT - <<>>( - param->data(), - grad->data(), - learning_rate->data(), - param->numel(), - param_out->mutable_data(ctx.GetPlace()), - master_in_data, - master_out_data); - - } else if (grad_var->IsType()) { - // TODO(qijun): In Sparse SGD operator, in-place update is enforced. - // This manual optimization brings difficulty to track data dependency. - // It's better to find a more elegant solution. - PADDLE_ENFORCE_EQ( - param, - param_out, - phi::errors::InvalidArgument( - "The input tensor Param of SgdOp should be equal with ParamOut " - "if variable's type is SelectedRows.")); - auto* grad = ctx.Input("Grad"); - - auto in_height = grad->height(); - auto out_dims = param_out->dims(); - PADDLE_ENFORCE_EQ(in_height, - out_dims[0], - phi::errors::InvalidArgument( - "The input tensor Grad's height of SgdOp should be " - "equal with ParamOut's dims. 
But received Grad's " - "height [%s] and ParamOut's dims [%s]", - in_height, - out_dims[0])); - - auto& in_value = grad->value(); - auto& in_rows = grad->rows(); - - int64_t in_row_numel = in_value.numel() / in_rows.size(); - PADDLE_ENFORCE_EQ(in_row_numel, - param_out->numel() / in_height, - phi::errors::InvalidArgument( - "The in_row_numel of SgdOp should be equal with " - "param_out's numel / in_height.")); - - auto* in_data = in_value.data(); - auto* out_data = param_out->data(); - - const int kThreadsPerBlock = 256; - int thread_x = kThreadsPerBlock; - int max_threads = ctx.cuda_device_context().GetMaxPhysicalThreadCount(); - int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); - phi::MixVector mixv_in_rows(&in_rows); - SparseSGDFunctorKernel<<>>( - in_data, - mixv_in_rows.CUDAData(ctx.GetPlace()), - learning_rate->data(), - out_data, - in_row_numel, - in_rows.size()); - - } else { - PADDLE_ENFORCE_EQ(false, - true, - phi::errors::PermissionDenied( - "Unsupported Variable Type of Grad " - "in SgdOp. Excepted LodTensor or " - "SelectedRows, But received [%s]", - paddle::framework::ToTypeName(grad_var->Type()))); - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h deleted file mode 100644 index 98d59b7ba1d38..0000000000000 --- a/paddle/fluid/operators/optimizers/sgd_op.h +++ /dev/null @@ -1,335 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
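The sgd_op.cu and sgd_op.h files deleted here carried the legacy fluid SGD kernels. The multi-precision variant (SGDKernelMT above) keeps an FP32 master copy of the parameters so that repeated rounding of low-precision weights does not corrupt the optimizer state. A host-side sketch of that update, with simplified types (the real kernel templates over T and MT):

    // Host-side sketch of the multi-precision SGD update; simplified types.
    #include <cstddef>
    #include <vector>

    void SgdUpdate(const std::vector<float>& grad, float lr,
                   std::vector<float>* param,           // storage dtype in real code
                   std::vector<float>* master_param) {  // optional FP32 shadow
      for (std::size_t i = 0; i < param->size(); ++i) {
        // Read from the master copy when present, so rounding error in
        // `param` never feeds back into the update.
        float p = master_param ? (*master_param)[i] : (*param)[i];
        p -= lr * grad[i];
        (*param)[i] = p;  // cast back to the storage dtype in the real kernel
        if (master_param) (*master_param)[i] = p;
      }
    }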
*/ - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/selected_rows_utils.h" -#include "paddle/fluid/framework/var_type_traits.h" -#include "paddle/phi/common/bfloat16.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/phi/kernels/funcs/jit/kernels.h" - -namespace paddle { -namespace operators { - -namespace detail { - -template -struct sgd_dense_param_kernel { - void operator()() const {} -}; - -// LodTensor -template -struct sgd_dense_param_kernel::kId> { - void operator()(const framework::ExecutionContext &ctx) const { - VLOG(4) << "[CPU]: sgd_dense_param_kernel"; - const auto *learning_rate = ctx.Input("LearningRate"); - const auto *param = ctx.Input("Param"); - auto *param_out = ctx.Output("ParamOut"); - const auto *grad = ctx.Input("Grad"); - - const auto sz = param_out->numel(); - phi::jit::sgd_attr_t attr(1, sz, 1, sz, 1); - const T *lr = learning_rate->data(); - const T *param_data = param->data(); - const T *grad_data = grad->data(); - int64_t rows_idx = 0; - T *out_data = param_out->mutable_data(ctx.GetPlace()); - - auto sgd = - phi::jit::KernelFuncs, phi::CPUPlace>::Cache().At( - attr); - sgd(lr, param_data, grad_data, &rows_idx, out_data, &attr); - } -}; - -// SelectedRows -template -struct sgd_dense_param_kernel::kId> { - void operator()(const framework::ExecutionContext &ctx) const { - VLOG(4) << "[CPU]: sgd_dense_param_kernel"; - const auto *learning_rate = ctx.Input("LearningRate"); - const auto *param = ctx.Input("Param"); - auto *param_out = ctx.Output("ParamOut"); - const auto *grad = ctx.Input("Grad"); - - const auto &grad_value = grad->value(); - const auto &grad_rows = grad->rows(); - const T *param_data = param->data(); - const T *grad_data = grad_value.data(); - const T *lr = learning_rate->data(); - const int64_t *rows_data = grad_rows.data(); - T *out_data = param_out->mutable_data(ctx.GetPlace()); - - phi::jit::sgd_attr_t attr; - attr.param_height = param_out->dims()[0]; - attr.param_width = param_out->numel() / attr.param_height; - attr.grad_height = grad_rows.size(); // note: it is not grad->height() - attr.grad_width = grad_value.numel() / attr.grad_height; - attr.selected_rows_size = grad_rows.size(); - - auto sgd = - phi::jit::KernelFuncs, phi::CPUPlace>::Cache().At( - attr); - sgd(lr, param_data, grad_data, rows_data, out_data, &attr); - } -}; - -// LodTensor -template <> -struct sgd_dense_param_kernel::kId> { - void operator()(const framework::ExecutionContext &ctx) const { - VLOG(4) << "[CPU]: sgd_dense_param_kernel"; - const auto *learning_rate = ctx.Input("LearningRate"); - const auto *param = ctx.Input("Param"); - auto *param_out = ctx.Output("ParamOut"); - const auto *grad = ctx.Input("Grad"); - param_out->mutable_data(ctx.GetPlace()); - - auto p = phi::EigenVector::Flatten(*param); - auto g = phi::EigenVector::Flatten(*grad); - auto o = phi::EigenVector::Flatten(*param_out); - const auto *lr = learning_rate->data(); - - o = p - lr[0] * g; - } -}; - -// SelectedRows -template <> -struct sgd_dense_param_kernel::kId> { - void operator()(const framework::ExecutionContext &ctx) const { - VLOG(4) << "[CPU]: sgd_dense_param_kernel"; - const auto *learning_rate = ctx.Input("LearningRate"); - auto *param_out = ctx.Output("ParamOut"); - const auto *grad = ctx.Input("Grad"); - - const auto &grad_value = grad->value(); - const auto &grad_rows = grad->rows(); - const auto grad_height = grad->height(); - const int64_t grad_val_height = static_cast(grad_rows.size()); - const auto grad_width = 
grad_value.numel() / grad_val_height; - - const auto *grad_data = grad_value.data(); - auto *out_data = param_out->data(); - const auto *lr = learning_rate->data(); - - for (size_t i = 0; i < grad_rows.size(); ++i) { - PADDLE_ENFORCE_LT( - grad_rows[i], - grad_height, - phi::errors::OutOfRange( - "Grad rows index value should be less than grad height." - "Got [%s], but expected less than [%s]", - grad_rows[i], - grad_height)); - const int64_t row = grad_rows[i]; - for (int64_t j = 0; j < grad_width; ++j) { - out_data[row * grad_width + j] -= lr[0] * grad_data[i * grad_width + j]; - } - } - } -}; - -} // namespace detail - -template -class SGDOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override; -}; - -template -class SGDOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *param_var = ctx.InputVar("Param"); - - if (param_var->IsType()) { - invoke_dense_param_kernel(ctx); - } else if (param_var->IsType()) { - sparse_param_and_grad_kernel(ctx); - } else { - PADDLE_ENFORCE_EQ( - false, - true, - phi::errors::PermissionDenied( - "Unsupported Variable Type of Parameter in SgdOp. Excepted " - "LodTensor or SelectedRows, But received [%s]", - paddle::framework::ToTypeName(param_var->Type()))); - } - } - - protected: - void invoke_dense_param_kernel(const framework::ExecutionContext &ctx) const { - const auto *param = ctx.Input("Param"); - auto *param_out = ctx.Output("ParamOut"); - const auto *grad_var = ctx.InputVar("Grad"); - - if (grad_var->IsType()) { - const auto *grad = ctx.Input("Grad"); - const auto sz = param_out->numel(); - PADDLE_ENFORCE_EQ(param->numel(), - sz, - phi::errors::InvalidArgument( - "The input tensor Param's numel of SgdOp " - "should be equal with ParamOut's numel. " - "But received Param's " - "numel = [%s], ParamOut's numel = [%s]", - param->numel(), - sz)); - PADDLE_ENFORCE_EQ( - grad->numel(), - sz, - phi::errors::InvalidArgument("The input tensor Grad's numel of SgdOp " - "should be equal with ParamOut's numel. " - "But received Grad's " - "numel = [%s], ParamOut's numel = [%s]", - grad->numel(), - sz)); - - dense_param_and_grad_kernel(ctx); - } else if (grad_var->IsType()) { - // TODO(qijun): In Sparse SGD operator, in-place update is enforced. - // This manual optimization brings difficulty to track data dependency. - // It's better to find a more elegant solution. - PADDLE_ENFORCE_EQ(param, - param_out, - phi::errors::InvalidArgument( - "The input tensor Param of SgdOp " - "should be equal with ParamOut if variable's " - "type is SelectedRows. ")); - const auto *grad = ctx.Input("Grad"); - - // for distributed training, a sparse var may be empty, - // just skip updating. - if (grad->rows().size() == 0) { - return; - } - - auto out_dims = param_out->dims(); - PADDLE_ENFORCE_EQ( - grad->height(), - out_dims[0], - phi::errors::InvalidArgument( - "The input tensor Grad's height of SgdOp " - "should be equal with ParamOut's dims. 
But received Grad's " - "height [%s] and ParamOut's dims [%s]", - grad->height(), - out_dims[0])); - - auto &grad_value = grad->value(); - auto &grad_rows = grad->rows(); - const auto param_height = param_out->dims()[0]; - const auto param_width = param_out->numel() / param_height; - // note: it is not grad->height() - const auto grad_height = static_cast(grad_rows.size()); - const auto grad_width = grad_value.numel() / grad_height; - - PADDLE_ENFORCE_EQ( - grad_width, - param_width, - phi::errors::InvalidArgument( - "The grad_value's numel of SgdOp " - "should be equal with param_out's numel. But received " - "grad_value's numel [%s] and param_out's numel [%s]", - grad_width, - param_width)); - - dense_param_sparse_grad_kernel(ctx); - } else { - PADDLE_ENFORCE_EQ( - false, - true, - phi::errors::PermissionDenied( - "Unsupported Variable Type of Grad in SgdOp. Excepted " - "LodTensor or SelectedRows, But received [%s]", - paddle::framework::ToTypeName(grad_var->Type()))); - } - } - - void sparse_param_and_grad_kernel( - const framework::ExecutionContext &ctx) const { - const auto *learning_rate = ctx.Input("LearningRate"); - const auto *param_var = ctx.InputVar("Param"); - const auto *grad_var = ctx.InputVar("Grad"); - - PADDLE_ENFORCE_EQ(grad_var->IsType(), - true, - phi::errors::InvalidArgument( - "When param is SelectedRows, gradient should also " - "be SelectedRows")); - const auto ¶m = param_var->Get(); - auto *param_out = ctx.Output("ParamOut"); - const auto &grad = grad_var->Get(); - - // for distributed training, a sparse var may be empty, - // just skip updating. - if (grad.rows().size() == 0) { - return; - } - - auto param_row_width = param.value().dims()[1]; - auto grad_row_width = grad.value().dims()[1]; - PADDLE_ENFORCE_EQ( - param_row_width, - grad_row_width, - phi::errors::InvalidArgument( - "The param_row in SgdOP should have the same size with grad_row. " - "But received param_row's width is [%s], and grad_row's width is " - "[%s]", - param_row_width, - grad_row_width)); - - const auto *lr = learning_rate->data(); - const auto *grad_data = grad.value().data(); - auto *out_data = param_out->mutable_value()->data(); - for (size_t i = 0; i < grad.rows().size(); i++) { - int64_t id_index = param_out->AutoGrownIndex(grad.rows()[i], false); - PADDLE_ENFORCE_GE( - id_index, - static_cast(0), - phi::errors::InvalidArgument( - "The id in SgdOp should be >= 0. 
But received id_index is [%s]", - id_index)); - for (int64_t j = 0; j < grad_row_width; j++) { - out_data[id_index * grad_row_width + j] -= - lr[0] * grad_data[i * grad_row_width + j]; - } - } - } - - virtual void dense_param_and_grad_kernel( - const framework::ExecutionContext &ctx) const { - detail::sgd_dense_param_kernel< - T, - framework::VarTypeTrait::kId>()(ctx); - } - - virtual void dense_param_sparse_grad_kernel( - const framework::ExecutionContext &ctx) const { - detail::sgd_dense_param_kernel< - T, - framework::VarTypeTrait::kId>()(ctx); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 74701dc01f90f..3107d4c1cc720 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -297,8 +297,12 @@ class TensorRTEngineOp : public framework::OperatorBase { auto is_shape_tensor = true; if (trt_engine->engine()) { auto *engine = trt_engine->engine(); +#if IS_TRT_VERSION_GE(8600) + is_shape_tensor = engine->isShapeInferenceIO(name.c_str()); +#else is_shape_tensor = engine->isShapeBinding(engine->getBindingIndex(name.c_str())); +#endif if (!is_shape_tensor) { runtime_shape_tensor.erase(name); VLOG(4) << "trt engine runtime delete shape name(" << name @@ -517,6 +521,13 @@ class TensorRTEngineOp : public framework::OperatorBase { binding_offset = engine->GetBindingsOffset(); } // Bind input tensor to TRT. +#if IS_TRT_VERSION_GE(8600) + std::unordered_map tensor_index; + for (int i = 0; i < engine->engine()->getNbIOTensors(); ++i) { + auto tensor_name = engine->engine()->getIOTensorName(i); + tensor_index[std::string(tensor_name)] = i; + } +#endif for (auto x : runtime_input_names_) { // NOTE(liuyuanle): It is a trick. If you need a [x], then you need // to use [x.substr(0, idx)]. 
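The tensorrt_engine_op.h hunks above (and the ones that follow) replace every getBindingIndex/getBindingName call, which TRT 8.6 removes, with a name-to-index map built once from getNbIOTensors/getIOTensorName. A hedged sketch of the equivalent lookup as a standalone helper (the function name is illustrative, not from the patch):

    // Emulates the removed getBindingIndex() on TRT >= 8.6.
    #include <cstring>
    #include <NvInfer.h>

    int GetIOTensorIndex(const nvinfer1::ICudaEngine& engine,
                         const char* name) {
      for (int i = 0; i < engine.getNbIOTensors(); ++i) {
        if (std::strcmp(engine.getIOTensorName(i), name) == 0) return i;
      }
      return -1;  // getBindingIndex() also returned -1 for unknown names
    }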
@@ -566,8 +577,13 @@ class TensorRTEngineOp : public framework::OperatorBase { } // Get index of profile 0 first, then plus binding offset +#if IS_TRT_VERSION_GE(8600) + const int bind_index = + tensor_index[std::string(x.c_str())] + binding_offset; +#else const int bind_index = engine->engine()->getBindingIndex(x.c_str()) + binding_offset; +#endif PADDLE_ENFORCE_LT( bind_index, num_bindings, @@ -615,8 +631,9 @@ class TensorRTEngineOp : public framework::OperatorBase { } else { #if IS_TRT_VERSION_GE(6000) #if IS_TRT_VERSION_GE(8500) - if (engine->engine()->isShapeBinding(bind_index) && - engine->engine()->bindingIsInput(bind_index)) { + if (engine->engine()->isShapeInferenceIO(x.c_str()) && + engine->engine()->getTensorIOMode(x.c_str()) == + nvinfer1::TensorIOMode::kINPUT) { std::vector shape_v(t.numel()); if (t.dtype() == phi::DataType::INT32) { phi::memory_utils::Copy(phi::CPUPlace(), @@ -689,8 +706,12 @@ class TensorRTEngineOp : public framework::OperatorBase { VLOG(1) << "trt input [" << x << "] dtype is " << t.dtype(); auto indata_type = inference::tensorrt::PhiType2NvType(t.dtype()); +#if IS_TRT_VERSION_GE(8600) + auto intrt_type = engine->engine()->getTensorDataType(x.c_str()); +#else auto intrt_index = engine->engine()->getBindingIndex(x.c_str()); auto intrt_type = engine->engine()->getBindingDataType(intrt_index); +#endif PADDLE_ENFORCE_EQ(indata_type, intrt_type, phi::errors::InvalidArgument( @@ -745,20 +766,31 @@ class TensorRTEngineOp : public framework::OperatorBase { Attr>("origin_output_rank"); VLOG(4) << "TensorRT Engine Op Outputs:"; for (const auto &y : Outputs("Ys")) { +#if IS_TRT_VERSION_GE(8600) + const int bind_index = + tensor_index[std::string(output_maps[output_index].c_str())] + + binding_offset; +#else const int bind_index = engine->engine()->getBindingIndex(output_maps[output_index].c_str()) + binding_offset; +#endif std::vector ddim; if (!engine->with_dynamic_shape()) { +#if IS_TRT_VERSION_GE(8600) + auto dims = + engine->engine()->getTensorShape(output_maps[output_index].c_str()); +#else auto dims = engine->engine()->getBindingDimensions(bind_index); +#endif ddim.push_back(runtime_batch); for (int i = 0; i < dims.nbDims; i++) { ddim.push_back(dims.d[i]); } } else { #if IS_TRT_VERSION_GE(8500) - auto x_name = engine->engine()->getBindingName(bind_index); + auto x_name = engine->engine()->getIOTensorName(bind_index); auto dims = trt_context->getTensorShape(x_name); int nb_dims = dims.nbDims; for (; nb_dims > 0; nb_dims--) { @@ -800,7 +832,12 @@ class TensorRTEngineOp : public framework::OperatorBase { "index = %d, number of bindings = %d.", bind_index, num_bindings)); +#if IS_TRT_VERSION_GE(8600) + auto trt_tensor_name = engine->engine()->getIOTensorName(bind_index); + auto trt_type = engine->engine()->getTensorDataType(trt_tensor_name); +#else auto trt_type = engine->engine()->getBindingDataType(bind_index); +#endif // get adr and set type VLOG(1) << "trt output [" << y << "] dtype is " << TRT2FluidDataType(trt_type); diff --git a/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py b/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py index d91284f29c2dc..9ee350475428d 100644 --- a/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py +++ b/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py @@ -134,6 +134,17 @@ 'reduce_as_grad', ] +OTHER_PRIM_VJP_OPS = [ + 'sum_grad', + 'reshape_grad', + 'roll_grad', + 'transpose_grad', + 'max_grad', + 'squeeze_grad', + 'unsqueeze_grad', +] + + CUSTOM_VJP = 
[
    'gelu_grad',
    'hardswish_grad',
@@ -152,9 +163,11 @@
 ]  # custom vjp list of composite op

 # declare belongs to codegen, but implementation not
-OTHER_VJP = ["stack_grad"]
+OTHER_VJP = ["concat_grad", "stack_grad"]

-vjp_list = UNARY_PRIM_VJP_OPS + BINARY_PRIM_VJP_OPS + CUSTOM_VJP
+vjp_list = (
+    UNARY_PRIM_VJP_OPS + BINARY_PRIM_VJP_OPS + CUSTOM_VJP + OTHER_PRIM_VJP_OPS
+)

 decomp_vjp_interface_declare_gen_op_list = vjp_list + OTHER_VJP
diff --git a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
index 79e9651f1ff1b..090aab4e3c4ed 100644
--- a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
+++ b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
@@ -189,7 +189,6 @@
     'sparse_momentum',
     'tdm_sampler',
     'soft_relu',
-    'uniform_random_batch_size_like',
     'match_matrix_tensor',
     'c_reduce_max',
     'c_reduce_max_',
diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op_decomp_vjp.cc b/paddle/fluid/pir/dialect/operator/ir/manual_op_decomp_vjp.cc
index 8c1d0f743b7ce..c1aa3d776b67e 100644
--- a/paddle/fluid/pir/dialect/operator/ir/manual_op_decomp_vjp.cc
+++ b/paddle/fluid/pir/dialect/operator/ir/manual_op_decomp_vjp.cc
@@ -106,5 +106,95 @@ std::vector<std::vector<pir::Value>> StackGradOp::DecompVjp(
   return res;
 }

+std::vector<std::vector<pir::Value>> ConcatGradOp::DecompVjp(
+    pir::Operation* op) {
+  VLOG(4) << "Decomp call concat_grad's decomp interface begin";
+
+  ConcatGradOp op_obj = op->dyn_cast<ConcatGradOp>();
+  (void)op_obj;
+
+  FLAGS_tensor_operants_mode = "static";
+
+  VLOG(6) << "Decomp Prepare inputs of concat_grad";
+
+  pir::CombineOp combine_op_obj_x =
+      op_obj.x().defining_op()->dyn_cast<pir::CombineOp>();
+  std::vector<Tensor> x;
+  for (size_t idx = 0; idx < combine_op_obj_x.inputs().size(); idx++) {
+    x.emplace_back(std::make_shared<primitive::LazyTensor>(
+        combine_op_obj_x.inputs()[idx]));
+  }
+  Tensor out_grad(std::make_shared<primitive::LazyTensor>(op_obj.out_grad()));
+
+  VLOG(6) << "Decomp prepare attributes of concat_grad";
+
+  Tensor axis_(std::make_shared<primitive::LazyTensor>(op_obj.axis()));
+
+  auto* axis_define_op =
+      std::static_pointer_cast<primitive::LazyTensor>(axis_.impl())
+          ->value()
+          .defining_op();
+  if (axis_define_op->name() != "pd_op.full") {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "We don't support dynamic tensors "
+        "attribute axis for concat_grad decomposition "
+        "for now. "));
+  }
+  Scalar axis = axis_define_op->attribute("value")
+                    .dyn_cast<paddle::dialect::ScalarAttribute>()
+                    .data();
+
+  VLOG(6) << "Decomp call concat_grad's backward composite rule prepare";
+
+  std::vector<std::vector<bool>> stop_gradients(op->results().size());
+  if (op->HasAttribute(kAttrStopGradients)) {
+    auto stop_gradients_attr = op->attribute(kAttrStopGradients)
+                                   .dyn_cast<pir::ArrayAttribute>()
+                                   .AsVector();
+    for (size_t i = 0; i < stop_gradients_attr.size(); ++i) {
+      stop_gradients[0].push_back(
+          stop_gradients_attr[i].dyn_cast<pir::BoolAttribute>().data());
+    }
+
+    VLOG(4) << " stop_gradients is set ";
+  } else {
+    std::vector<bool> x_grad_stop_gradient(combine_op_obj_x.inputs().size(),
+                                           false);
+    stop_gradients[0] = x_grad_stop_gradient;
+    VLOG(4) << " stop_gradients is not set ";
+  }
+
+  std::vector<std::vector<Tensor>> tensor_res;
+  for (auto arg : stop_gradients) {
+    tensor_res.push_back(std::vector<Tensor>(arg.size()));
+  }
+  std::string op_name = "concat_grad";
+  FLAGS_tensor_operants_mode = "static";
+  VLOG(4) << "Call Pir Decomposed backward op concat_grad";
+
+  std::vector<Tensor*> x_grad(stop_gradients[0].size(), nullptr);
+  for (size_t i = 0; i < stop_gradients[0].size(); i++) {
+    x_grad[i] = !stop_gradients[0][i] ?
&tensor_res[0][i] : nullptr; + } + + paddle::primitive::details::concat_grad( + x, out_grad, axis, x_grad); + std::vector> res(tensor_res.size()); + + for (size_t i = 0; i < tensor_res.size(); ++i) { + res[i].resize(tensor_res[i].size()); + for (size_t j = 0; j < tensor_res[i].size(); ++j) { + if (tensor_res[i][j].defined()) { + res[i][j] = std::static_pointer_cast( + tensor_res[i][j].impl()) + ->value(); + } + } + } + + VLOG(4) << "Decomp call concat_grad's decomp interface end"; + return res; +} + } // namespace dialect } // namespace paddle diff --git a/paddle/fluid/pir/transforms/general/transfer_layout_pass.cc b/paddle/fluid/pir/transforms/general/transfer_layout_pass.cc index e87690b6021ba..fcbfcbb910e1e 100644 --- a/paddle/fluid/pir/transforms/general/transfer_layout_pass.cc +++ b/paddle/fluid/pir/transforms/general/transfer_layout_pass.cc @@ -322,17 +322,14 @@ struct FlowGraph { auto judge_dense_tensor_type = [](paddle::dialect::DenseTensorType t) { if (t.dims().size() == 4) { - return false; + return true; } - return true; + return false; }; bool should_interrupt = std::visit( overloaded{ [&](const pir::Operation* op) { - // TODO(lyk): These conditions may be too loose, - // we should make a white list here. - pir::Operation* fop = const_cast(op); auto layout_transform_iface = fop->dyn_cast< @@ -347,20 +344,34 @@ struct FlowGraph { auto vt = v.type(); if (!vt) return true; // maybe not DenseTensor, but we can handle other types later + bool can_be_transformed = false; if (auto vdt = vt.dyn_cast()) { VLOG(10) << "judging var: " << v.defining_op() << " " - << v.type() << " " << vdt.dims() << " " - << (vdt.dims().size() == 4); - return judge_dense_tensor_type(vdt); + << v.type() << " " << vdt.dims(); + can_be_transformed = judge_dense_tensor_type(vdt); } else if (auto vdt = vt.dyn_cast()) { if (vdt.size() == 0) return false; auto vt_elem = vdt[0]; if (auto vdt_elem = vt_elem.dyn_cast()) - return judge_dense_tensor_type(vdt_elem); + can_be_transformed = judge_dense_tensor_type(vdt_elem); } - return true; + if (!can_be_transformed) { + // when the rank of value is not 4, we can't allow it to be + // a point of cut edge. So we set its outputs and inputs to + // immutable. 
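For the ConcatGradOp::DecompVjp added above: the composite rule it dispatches to (paddle::primitive::details::concat_grad) gives each input a slice of out_grad along the concat axis. A minimal numeric sketch of that rule for the 1-D case (plain C++, no Paddle APIs):

    // concat backward, 1-D case: split out_grad by each input's length.
    #include <cstddef>
    #include <vector>

    std::vector<std::vector<float>> ConcatGrad1D(
        const std::vector<float>& out_grad,
        const std::vector<std::size_t>& input_sizes) {
      std::vector<std::vector<float>> x_grads;
      std::size_t offset = 0;
      for (std::size_t n : input_sizes) {
        // Input i's gradient is the contiguous chunk it contributed to out.
        x_grads.emplace_back(out_grad.begin() + offset,
                             out_grad.begin() + offset + n);
        offset += n;
      }
      return x_grads;
    }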
+ Node in_node = Node(v.defining_op()); + nhwc_nodes.erase(in_node); + VLOG(10) << "erase node: " << in_node << " from nhwc set"; + + for (auto it = v.use_begin(); it != v.use_end(); ++it) { + Node out_node(it->owner()); + nhwc_nodes.erase(out_node); + VLOG(10) << "erase node: " << out_node << " from nhwc set"; + } + } + return !can_be_transformed; }, [](const auto&) { return true; }, }, @@ -531,7 +542,7 @@ using Edge = FlowGraph::Edge; class TransferLayoutPass : public pir::Pass { public: - TransferLayoutPass() : pir::Pass("transfer_layout_pass", 4) {} + TransferLayoutPass() : pir::Pass("transfer_layout_pass", 3) {} bool CanApplyOn(pir::Operation* op) const override { if (!op->isa()) { @@ -636,7 +647,8 @@ class TransferLayoutPass : public pir::Pass { VLOG(10) << "-----------------------[rewrite begin]------------------------"; - + int64_t num_of_layout_changed_ops{0}; + int64_t num_of_transpose_ops{0}; while (!q.empty()) { auto node = q.front(); q.pop_front(); @@ -653,6 +665,7 @@ class TransferLayoutPass : public pir::Pass { if (layout_transformation_iface) { layout_transformation_iface.RewriteByLayout( op, common::DataLayout::NHWC); + num_of_layout_changed_ops++; } else { PADDLE_THROW(common::errors::Unimplemented( "Op %s should have a specialized RewriteByLayout function", @@ -684,6 +697,7 @@ class TransferLayoutPass : public pir::Pass { ((src_set.count(node) > 0) ? common::DataLayout::NHWC : common::DataLayout::NCHW); builder.SetInsertionPointAfter(dst_value.defining_op()); + num_of_transpose_ops++; auto transpose_op = builder.Build(dst_value, perm); transpose_op->set_attribute( @@ -724,6 +738,7 @@ class TransferLayoutPass : public pir::Pass { ((src_set.count(node) > 0) ? common::DataLayout::NHWC : common::DataLayout::NCHW); builder.SetInsertionPointAfter(value.defining_op()); + num_of_transpose_ops++; auto transpose_op = builder.Build(value, perm); transpose_op->set_attribute( @@ -738,6 +753,7 @@ class TransferLayoutPass : public pir::Pass { value.ReplaceUsesWithIf(transpose_op.out(), replace_uses_in_cut_set); } } + AddStatistics(num_of_transpose_ops, num_of_layout_changed_ops); } }; diff --git a/paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc b/paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc index 8604f890a49a6..96851cfeac559 100644 --- a/paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc +++ b/paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc @@ -40,7 +40,8 @@ class Conv2dAddActFusePassDrrPattern : public paddle::drr::DrrPatternBase { public: static const int CUTLASS_NHWC_ALIGNMENT = 8; - Conv2dAddActFusePassDrrPattern(std::string act_name, bool cutlass_pattern) + Conv2dAddActFusePassDrrPattern(const std::string &act_name, + bool cutlass_pattern) : act_name_(act_name), cutlass_pattern_(cutlass_pattern) {} std::string name() const override { return "Conv2dAddActFusePassDrrPattern"; } uint32_t benefit() const override { return cutlass_pattern_ ? 
3 : 2; } diff --git a/paddle/fluid/pir/transforms/gpu/fused_flash_attn_pass.cc b/paddle/fluid/pir/transforms/gpu/fused_flash_attn_pass.cc index 0cbc5e0bd93bc..be8202356036f 100644 --- a/paddle/fluid/pir/transforms/gpu/fused_flash_attn_pass.cc +++ b/paddle/fluid/pir/transforms/gpu/fused_flash_attn_pass.cc @@ -139,7 +139,7 @@ class FlashAttnPatternQscaleWithMask : public paddle::drr::DrrPatternBase { } // mask's shape [bs, 1, seq_len, seq_len] auto mask_add = pir::GetShapeFromValue(match_ctx.Tensor("mask")); - if (mask_add.size() != 4 || mask_add.at(1) != 1) { + if (mask_add.size() != 4 || mask_add.at(1) != 1 || mask_add.at(0) != -1) { return false; } @@ -285,7 +285,7 @@ class FlashAttnPatternOutscaleWithMask : public paddle::drr::DrrPatternBase { } // mask's shape [bs, 1, seq_len, seq_len] auto mask_add = pir::GetShapeFromValue(match_ctx.Tensor("mask")); - if (mask_add.size() != 4 || mask_add.at(1) != 1) { + if (mask_add.size() != 4 || mask_add.at(1) != 1 || mask_add.at(0) != -1) { return false; } @@ -556,7 +556,7 @@ class TransposeSliceFlashAttnPattern : public paddle::drr::DrrPatternBase { } // mask's shape [bs, 1, seq_len, seq_len] auto mask_add = pir::GetShapeFromValue(match_ctx.Tensor("mask")); - if (mask_add.size() != 4 || mask_add.at(1) != 1) { + if (mask_add.size() != 4 || mask_add.at(1) != 1 || mask_add.at(0) != -1) { return false; } diff --git a/paddle/fluid/pir/transforms/gpu/fused_weight_only_linear_pass.cc b/paddle/fluid/pir/transforms/gpu/fused_weight_only_linear_pass.cc index efe0053b5bee3..5babd4072a7b0 100644 --- a/paddle/fluid/pir/transforms/gpu/fused_weight_only_linear_pass.cc +++ b/paddle/fluid/pir/transforms/gpu/fused_weight_only_linear_pass.cc @@ -32,9 +32,6 @@ int getSMVersion() { #if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_CUTLASS) sm_version = paddle::platform::GetGPUComputeCapability( paddle::platform::GetCurrentDeviceId()); -#else - PADDLE_THROW(common::errors::Unavailable( - "fused_weight_only_linear_pass needs paddle compiled with CUDA.")); #endif return sm_version; } @@ -280,7 +277,7 @@ class FusedWeightOnlyLinearPass : public pir::PatternRewritePass { sm_version_(getSMVersion()) {} pir::RewritePatternSet InitializePatterns(pir::IrContext *context) override { - std::string algo = "weight_only_int4"; + std::string algo = "weight_only_int8"; if (Has("weight_only_algo")) { algo = Get("weight_only_algo"); } diff --git a/paddle/fluid/pir/transforms/gpu/matmul_add_act_fuse_pass.cc b/paddle/fluid/pir/transforms/gpu/matmul_add_act_fuse_pass.cc index 639448e5f5936..0da1499a730c5 100644 --- a/paddle/fluid/pir/transforms/gpu/matmul_add_act_fuse_pass.cc +++ b/paddle/fluid/pir/transforms/gpu/matmul_add_act_fuse_pass.cc @@ -42,6 +42,9 @@ class MatmulAddPattern : public paddle::drr::DrrPatternBase { const bool reverse_add) : fused_op_name_(fused_op_name), reverse_add_(reverse_add) {} + uint32_t benefit() const override { + return fused_op_name_ == paddle::dialect::GemmEpilogueOp::name() ? 
2 : 1; + } std::string name() const override { return "MatmulAddPattern"; } void operator()(paddle::drr::DrrPatternContext *ctx) const override { @@ -137,6 +140,7 @@ class MatmulAddActPattern : public paddle::drr::DrrPatternBase { explicit MatmulAddActPattern(const std::string &act_type, const std::string &fused_op_name) : act_type_(act_type), fused_op_name_(fused_op_name) {} + uint32_t benefit() const override { return 3; } std::string name() const override { return "MatmulAddActPattern"; } diff --git a/paddle/fluid/pir/transforms/onednn/self_attention_fuse_pass.cc b/paddle/fluid/pir/transforms/onednn/self_attention_fuse_pass.cc index bbdab7ff7a4b6..276c8871d88ee 100644 --- a/paddle/fluid/pir/transforms/onednn/self_attention_fuse_pass.cc +++ b/paddle/fluid/pir/transforms/onednn/self_attention_fuse_pass.cc @@ -169,7 +169,7 @@ class SelfAttentionFusePass : public pir::PatternRewritePass { return ps; } - bool CanApplyOn(pir::Operation *op) const { + bool CanApplyOn(pir::Operation *op) const override { #if !defined(PADDLE_WITH_AVX512F) || !defined(PADDLE_WITH_MKLML) || \ !defined(PADDLE_WITH_DNNL) LOG(WARNING) << "No-avx512 or MKL or oneDNN supported!"; diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index e6838746fd6ac..44bd7de5ff967 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -596,6 +596,7 @@ TEST(enforce, cannot_to_string_type) { TEST(GET_DATA_SAFELY_MACRO, SUCCESS) { int* a = new int(10); // NOLINT GET_DATA_SAFELY(a, "Input", "X", "dummy"); + delete a; } TEST(GET_DATA_SAFELY_MACRO, FAIL) { diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc index b7b94835929f9..612a959fc307b 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.cc +++ b/paddle/phi/backends/dynload/dynamic_loader.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "paddle/phi/backends/dynload/dynamic_loader.h" #include +#include #include #include #include @@ -45,6 +46,7 @@ COMMON_DECLARE_string(cusparselt_dir); COMMON_DECLARE_string(curand_dir); COMMON_DECLARE_string(cusolver_dir); COMMON_DECLARE_string(cusparse_dir); +COMMON_DECLARE_string(win_cuda_bin_dir); #ifdef PADDLE_WITH_HIP PHI_DEFINE_string(miopen_dir, @@ -132,8 +134,12 @@ static constexpr char* win_cufft_lib = static inline std::string join(const std::string& part1, const std::string& part2) { - // directory separator +// directory separator +#if defined(_WIN32) + const char sep = '\\'; +#else const char sep = '/'; +#endif if (!part2.empty() && part2.front() == sep) { return part2; } @@ -263,6 +269,26 @@ static inline void* GetDsoHandleFromSearchPath( #else int dynload_flags = 0; #endif // !_WIN32 +#if defined(_WIN32) + std::vector cuda_bin_search_path = { + L"cublas", + L"cuda_nvrtc", + L"cuda_runtime", + L"cudnn", + L"cufft", + L"curand", + L"cusolver", + L"cusparse", + L"nvjitlink", + }; + for (auto search_path : cuda_bin_search_path) { + std::wstring_convert> converter; + std::wstring win_path_wstring = + converter.from_bytes(FLAGS_win_cuda_bin_dir); + search_path = win_path_wstring + L"\\" + search_path + L"\\bin"; + AddDllDirectory(search_path.c_str()); + } +#endif std::vector dso_names = split(dso_name, ";"); void* dso_handle = nullptr; for (auto const& dso : dso_names) { @@ -324,8 +350,26 @@ void* GetCublasDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib"); #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path}); + if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { +#ifdef WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublas64_11.dll"); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path}); +#endif + } else if (CUDA_VERSION >= 12000 && CUDA_VERSION <= 12030) { +#ifdef WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublas64_12.dll"); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path}); +#endif + } else { + std::string warning_msg( + "Your CUDA_VERSION is less than 11 or greater than 12, paddle " + "temporarily no longer supports"); + return nullptr; + } #elif defined(__linux__) && defined(PADDLE_WITH_CUDA) if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { #ifdef WITH_PIP_CUDA_LIBRARIES @@ -403,8 +447,23 @@ void* GetCUDNNDsoHandle() { "Toolkit\\CUDA\\v10.0\n" "You should do this according to your CUDA installation directory and " "CUDNN version."); - return GetDsoHandleFromSearchPath( - FLAGS_cudnn_dir, win_cudnn_lib, true, {cuda_lib_path}, win_warn_meg); + if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12030) { +#ifdef WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, "cudnn64_8.dll", true, {cuda_lib_path}, win_warn_meg); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, win_cudnn_lib, true, {cuda_lib_path}, win_warn_meg); +#endif + } else if (CUDA_VERSION >= 12030) { +#ifdef WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, "cudnn64_9.dll", true, {cuda_lib_path}, win_warn_meg); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, win_cudnn_lib, true, {cuda_lib_path}, win_warn_meg); +#endif + } #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, 
"libMIOpen.so", false); #else @@ -461,8 +520,13 @@ void* GetCurandDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib"); #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) +#ifdef WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, "curand64_10.dll", true, {cuda_lib_path}); +#else return GetDsoHandleFromSearchPath( FLAGS_cuda_dir, win_curand_lib, true, {cuda_lib_path}); +#endif #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so"); #else @@ -500,8 +564,13 @@ void* GetCusolverDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.dylib"); #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) +#ifdef WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, "cusolver64_11.dll", true, {cuda_lib_path}); +#else return GetDsoHandleFromSearchPath( FLAGS_cuda_dir, win_cusolver_lib, true, {cuda_lib_path}); +#endif #else #ifdef WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so.11"); @@ -515,8 +584,26 @@ void* GetCusparseDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.dylib"); #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, win_cusparse_lib, true, {cuda_lib_path}); + if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { +#ifdef WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cusparse64_11.dll"); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, win_cusparse_lib, true, {cuda_lib_path}); +#endif + } else if (CUDA_VERSION >= 12000 && CUDA_VERSION <= 12030) { +#ifdef WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cusparse64_12.dll"); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, win_cusparse_lib, true, {cuda_lib_path}); +#endif + } else { + std::string warning_msg( + "Your CUDA_VERSION is less than 11 or greater than 12, paddle " + "temporarily no longer supports"); + return nullptr; + } #elif defined(__linux__) && defined(PADDLE_WITH_CUDA) if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { #ifdef WITH_PIP_CUDA_LIBRARIES @@ -709,8 +796,26 @@ void* GetCUFFTDsoHandle() { return nullptr; } #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, win_cufft_lib, true, {cuda_lib_path}); + if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { +#ifdef WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cufft64_10.dll"); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, win_cufft_lib, true, {cuda_lib_path}); +#endif + } else if (CUDA_VERSION >= 12000 && CUDA_VERSION <= 12030) { +#ifdef WITH_PIP_CUDA_LIBRARIES + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cufft64_11.dll"); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cuda_dir, win_cufft_lib, true, {cuda_lib_path}); +#endif + } else { + std::string warning_msg( + "Your CUDA_VERSION is less than 11 or greater than 12, paddle " + "temporarily no longer supports"); + return nullptr; + } #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so"); #endif diff --git a/paddle/phi/common/port.cc b/paddle/phi/common/port.cc index 513088decf497..dff40f91fa7bc 100644 --- a/paddle/phi/common/port.cc +++ b/paddle/phi/common/port.cc @@ -18,7 +18,6 @@ #include #include #include - #include 
"glog/logging.h" #if !defined(_WIN32) @@ -42,7 +41,14 @@ void *dlsym(void *handle, const char *symbol_name) { void *dlopen(const char *filename, int flag) { std::string file_name(filename); - HMODULE hModule = LoadLibrary(file_name.c_str()); + HMODULE hModule = nullptr; +#ifdef WITH_PIP_CUDA_LIBRARIES + hModule = + LoadLibraryEx(file_name.c_str(), NULL, LOAD_LIBRARY_SEARCH_DEFAULT_DIRS); +#endif + if (!hModule) { + hModule = LoadLibrary(file_name.c_str()); + } if (!hModule) { if (flag) { throw std::runtime_error(file_name + " not found."); @@ -72,7 +78,7 @@ int gettimeofday(struct timeval *tp, void *tzp) { return (0); } -#endif // !_WIN32 +#endif // !_WIN32 void ExecShellCommand(const std::string &cmd, std::string *message) { std::array buffer = {}; diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index f12b14e842062..3c3ef874854ab 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -4085,6 +4085,16 @@ void WeightDequantizeInferMeta(const MetaTensor& x, phi::errors::InvalidArgument("group_size must be -1, 64 or 128.")); auto dim_scale = scale.dims(); + int64_t real_channel_shape = -1; + if (algo == "weight_only_int8") { + real_channel_shape = x.dims()[0]; + } else if (algo == "weight_only_int4") { + real_channel_shape = x.dims()[0] * 2; + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "Currently, we only support weight_only_int8" + " and weight_only_int4 algo.")); + } // per-channel dequantization if (group_size == -1) { @@ -4095,7 +4105,7 @@ void WeightDequantizeInferMeta(const MetaTensor& x, "be 1D in per-channel mode, but got[%d]", scale.dims().size())); PADDLE_ENFORCE_EQ(dim_scale[0], - x.dims()[0], + real_channel_shape, phi::errors::InvalidArgument( "The scale tensor's shape must be equal to the x " "tensor's shape, but got [%d] not equal to [%d]", @@ -4117,9 +4127,16 @@ void WeightDequantizeInferMeta(const MetaTensor& x, "But receive %d and %d", dim_scale[0], (x.dims()[1] + (group_size - 1)) / group_size)); + PADDLE_ENFORCE_EQ(dim_scale[1], + real_channel_shape, + phi::errors::InvalidArgument( + "The scale tensor's shape must be equal to the real " + "channel size, but got [%d] not equal to [%d]", + scale.dims()[0], + real_channel_shape)); } int n = static_cast(x.dims()[1]); - int k = static_cast(x.dims()[0]); + int k = static_cast(real_channel_shape); out->set_dims(common::make_ddim({n, k})); out->set_dtype(out_dtype); } diff --git a/paddle/phi/kernels/cpu/graph_khop_sampler_kernel.cc b/paddle/phi/kernels/cpu/graph_khop_sampler_kernel.cc index 5df537a5b6340..58af31976402a 100644 --- a/paddle/phi/kernels/cpu/graph_khop_sampler_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_khop_sampler_kernel.cc @@ -14,7 +14,7 @@ #include "paddle/phi/kernels/graph_khop_sampler_kernel.h" -#include +#include #include #include diff --git a/paddle/phi/ops/yaml/backward.yaml b/paddle/phi/ops/yaml/backward.yaml index 3e96fa7bc4772..95e8e8917bae4 100644 --- a/paddle/phi/ops/yaml/backward.yaml +++ b/paddle/phi/ops/yaml/backward.yaml @@ -273,6 +273,14 @@ data_type : out_grad no_need_buffer : input +- backward_op : cast_grad + forward : cast (Tensor x, DataType dtype) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + invoke : cast (out_grad, x.dtype()) + composite: cast_grad(x, out_grad, x_grad) + no_need_buffer : x + - backward_op : ceil_grad forward : ceil(Tensor x) -> Tensor(out) args : (Tensor out_grad) @@ -1807,6 +1815,16 @@ func : logsigmoid_grad inplace : (out_grad -> x_grad) +- backward_op : logsumexp_grad 
+ forward : logsumexp(Tensor x, int64_t[] axis, bool keepdim, bool reduce_all) -> Tensor(out) + args : (Tensor x, Tensor out, Tensor out_grad, int64_t[] axis, bool keepdim, bool reduce_all) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param: [x] + kernel : + func : logsumexp_grad + - backward_op : lu_grad forward : lu (Tensor x, bool pivot = true) -> Tensor(out), Tensor(pivots), Tensor(infos) args : (Tensor x, Tensor out, Tensor pivots, Tensor out_grad, bool pivot) @@ -1859,6 +1877,18 @@ kernel : func : matrix_power_grad +- backward_op : max_grad + forward: max (Tensor x, IntArray axis={}, bool keepdim=false) -> Tensor(out) + args : (Tensor x, Tensor out, Tensor out_grad, IntArray axis={}, bool keepdim=false, bool reduce_all=false) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param: [x] + spmd_rule : ReductionGradInferSpmd + kernel : + func : max_grad + composite : max_grad(x, out, out_grad, axis, keepdim, reduce_all, x_grad) + - backward_op : max_pool2d_with_index_grad forward : max_pool2d_with_index(Tensor x, int[] kernel_size, int[] strides = {1, 1}, int[] paddings = {0, 0}, bool global_pooling = false, bool adaptive = false) -> Tensor(out), Tensor(mask) args : (Tensor x, Tensor mask, Tensor out_grad, int[] kernel_size, int[] strides, int[] paddings, bool global_pooling, bool adaptive) @@ -2079,6 +2109,29 @@ no_need_buffer : x backward : pad3d_double_grad +- backward_op : pad_double_grad + forward : pad_grad(Tensor x, Tensor grad_out, int[] paddings, Scalar pad_value) -> Tensor(grad_x) + args : (Tensor grad_x_grad, int[] paddings, Scalar pad_value) + output : Tensor(grad_out_grad) + infer_meta : + func : PadInferMeta + kernel : + func : pad + +- backward_op : pad_grad + forward : pad(Tensor x, int[] paddings, Scalar pad_value) -> Tensor(out) + args : (Tensor x, Tensor out_grad, int[] paddings, Scalar pad_value) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param: [x] + kernel : + func : pad_grad + param: [out_grad, paddings, pad_value] + no_need_buffer : x + composite : pad_grad(x, out_grad, paddings, pad_value, x_grad) + backward : pad_double_grad + - backward_op : partial_concat_grad forward : partial_concat (Tensor[] x, int start_index = 0, int length = -1) -> Tensor(out) args : (Tensor[] x, Tensor out_grad, int start_index, int length) @@ -2224,6 +2277,17 @@ func : prelu_grad data_type : x +- backward_op : prod_grad + forward : prod (Tensor x, IntArray dims, bool keep_dim, bool reduce_all) -> Tensor(out) + args : (Tensor x, Tensor out, Tensor out_grad, IntArray dims, bool keep_dim, bool reduce_all) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : prod_grad + composite: prod_grad(x, out, out_grad, dims, keep_dim, reduce_all, x_grad) + - backward_op : psroi_pool_grad forward : psroi_pool (Tensor x, Tensor boxes, Tensor boxes_num, int pooled_height=1, int pooled_width=1, int output_channels=1, float spatial_scale=1.0) -> Tensor(out) args : (Tensor x, Tensor boxes, Tensor boxes_num, Tensor out_grad, int pooled_height, int pooled_width, int output_channels, float spatial_scale) @@ -2360,6 +2424,16 @@ kernel : func : renorm_grad +- backward_op : repeat_interleave_grad + forward : repeat_interleave(Tensor x, int repeats, int axis) -> Tensor(out) + args : (Tensor x, Tensor out_grad, int repeats, int axis) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : repeat_interleave_grad + - backward_op : 
repeat_interleave_with_tensor_index_grad forward : repeat_interleave_with_tensor_index(Tensor x, Tensor repeats, int axis) -> Tensor(out) args : (Tensor x, Tensor repeats, Tensor out_grad, int axis) @@ -2727,6 +2801,26 @@ func : sinh_grad inplace : (out_grad -> x_grad) +- backward_op : slice_double_grad + forward : slice_grad (Tensor input, Tensor grad_out, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) -> Tensor(grad_input) + args : (Tensor grad_input_grad, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) + output : Tensor(grad_out_grad) + invoke : slice(grad_input_grad, axes, starts, ends, infer_flags, decrease_axis) + +- backward_op : slice_grad + forward : slice (Tensor input, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) -> Tensor(out) + args : (Tensor input, Tensor out_grad, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) + output : Tensor(input_grad) + infer_meta : + func : UnchangedInferMeta + param : [input] + spmd_rule: SliceGradInferSpmdDynamic + kernel : + func : slice_grad + composite: slice_grad(input, out_grad, axes, starts, ends, infer_flags, decrease_axis, input_grad) + backward : slice_double_grad + no_need_buffer : input + - backward_op : slogdet_grad forward : slogdet (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out, Tensor out_grad) @@ -2803,6 +2897,20 @@ func : spectral_norm_grad data_type : weight +- backward_op : split_grad + forward : split (Tensor x, IntArray num_or_sections, Scalar axis) -> Tensor[](out) + args : (Tensor[] out_grad, Scalar axis = -1) + output : Tensor(x_grad) + invoke : concat( out_grad, axis) + composite : split_grad(out_grad, axis, x_grad) + +- backward_op : split_with_num_grad + forward : split_with_num (Tensor x, int num, Scalar axis) -> Tensor[](out) + args : (Tensor[] out_grad, Scalar axis = -1) + output : Tensor(x_grad) + invoke : concat( out_grad, axis) + composite : split_grad(out_grad, axis, x_grad) + - backward_op : sqrt_double_grad forward : sqrt_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x) args : (Tensor out, Tensor grad_x, Tensor grad_x_grad) @@ -2926,6 +3034,26 @@ func : strided_slice_grad no_need_buffer : x +- backward_op : sum_double_grad + forward : sum_grad (Tensor x, Tensor grad_out, IntArray axis, bool keepdim, bool reduce_all=false) -> Tensor(grad_x) + args : (Tensor grad_x_grad, IntArray axis={}, bool keepdim=false) + output : Tensor(grad_out_grad) + invoke : sum(grad_x_grad, axis, grad_x_grad.dtype(), keepdim) + +- backward_op : sum_grad + forward : sum (Tensor x, IntArray axis={}, DataType dtype=DataType::UNDEFINED, bool keepdim=false) -> Tensor(out) + args : (Tensor x, Tensor out_grad, IntArray axis, bool keepdim, bool reduce_all=false) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + spmd_rule : ReductionGradInferSpmd + kernel : + func : sum_grad + composite : sum_grad(x, out_grad, axis, keepdim, reduce_all, x_grad) + no_need_buffer : x + backward : sum_double_grad + - backward_op : svd_grad forward : svd (Tensor x, bool full_matrices = false) -> Tensor(u), Tensor(s), Tensor(vh) args : (Tensor x, Tensor u, Tensor vh, Tensor s, Tensor u_grad, Tensor vh_grad, Tensor s_grad, bool full_matrices) @@ -3108,6 +3236,25 @@ kernel : func : trans_layout_grad +- backward_op : transpose_double_grad + forward : transpose_grad (Tensor grad_out, int[] perm) -> Tensor(grad_x) + args : (Tensor 
grad_x_grad, int[] perm) + output : Tensor(grad_out_grad) + invoke : transpose(grad_x_grad, perm) + +- backward_op : transpose_grad + forward : transpose (Tensor x, int[] perm) -> Tensor(out) + args : (Tensor out_grad, int[] perm) + output : Tensor(x_grad) + infer_meta : + func : TransposeGradInferMeta + param : [out_grad, perm] + spmd_rule: TransposeGradInferSpmd + kernel : + func : transpose_grad + backward : transpose_double_grad + composite: transpose_grad(out_grad, perm, x_grad) + - backward_op : triangular_solve_grad forward : triangular_solve (Tensor x, Tensor y, bool upper=true, bool transpose=false, bool unitriangular=false) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out, Tensor out_grad, bool upper, bool transpose, bool unitriangular) @@ -3118,6 +3265,16 @@ kernel : func : triangular_solve_grad +- backward_op : tril_grad + forward : tril(Tensor x, int diagonal) -> Tensor(out) + args : (Tensor out_grad, int diagonal) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [out_grad] + kernel : + func : tril_grad + - backward_op : trilinear_interp_grad forward : trilinear_interp (Tensor x, Tensor out_size, Tensor[] size_tensor, Tensor scale_tensor, str data_format="NCHW", int out_d=0, int out_h=0, int out_w=0, float[] scale={}, str interp_method="bilinear", bool align_corners=true, int align_mode=1) -> Tensor(output) args : (Tensor x, Tensor out_size, Tensor[] size_tensor, Tensor scale_tensor, Tensor output_grad, str data_format, int out_d, int out_h, int out_w, float[] scale, str interp_method, bool align_corners, int align_mode) @@ -3133,6 +3290,17 @@ data_transform : skip_transform : out_size, size_tensor, scale_tensor +- backward_op : triu_grad + forward : triu(Tensor x, int diagonal) -> Tensor(out) + args : (Tensor out_grad, int diagonal) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [out_grad] + spmd_rule : TriuGradInferSpmd + kernel : + func : triu_grad + - backward_op : trunc_grad forward : trunc (Tensor input) -> Tensor(out) args : (Tensor out_grad) diff --git a/paddle/phi/ops/yaml/inconsistent/dygraph_backward.yaml b/paddle/phi/ops/yaml/inconsistent/dygraph_backward.yaml index 1a48324de7ac2..8a49493b7eb19 100755 --- a/paddle/phi/ops/yaml/inconsistent/dygraph_backward.yaml +++ b/paddle/phi/ops/yaml/inconsistent/dygraph_backward.yaml @@ -94,14 +94,6 @@ func : c_embedding_grad no_need_buffer : weight -- backward_op : cast_grad - forward : cast (Tensor x, DataType dtype) -> Tensor(out) - args : (Tensor x, Tensor out_grad) - output : Tensor(x_grad) - invoke : cast (out_grad, x.dtype()) - composite: cast_grad(x, out_grad, x_grad) - no_need_buffer : x - - backward_op : divide_double_grad forward : divide_grad (Tensor x, Tensor y, Tensor out, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y) args : (Tensor y, Tensor out, Tensor grad_out, Tensor grad_x, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) @@ -176,16 +168,6 @@ func : hardswish_grad inplace : (out_grad -> x_grad) -- backward_op : logsumexp_grad - forward : logsumexp(Tensor x, int64_t[] axis, bool keepdim, bool reduce_all) -> Tensor(out) - args : (Tensor x, Tensor out, Tensor out_grad, int64_t[] axis, bool keepdim, bool reduce_all) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param: [x] - kernel : - func : logsumexp_grad - - backward_op : matmul_double_grad forward : matmul_grad (Tensor x, Tensor y, Tensor grad_out, bool transpose_x=false, bool transpose_y=false) -> Tensor(grad_x), Tensor(grad_y) args : (Tensor 
x, Tensor y, Tensor grad_out, Tensor grad_x_grad, Tensor grad_y_grad, bool transpose_x=false, bool transpose_y=false) @@ -210,18 +192,6 @@ func : matmul_grad backward : matmul_double_grad -- backward_op : max_grad - forward: max (Tensor x, IntArray axis={}, bool keepdim=false) -> Tensor(out) - args : (Tensor x, Tensor out, Tensor out_grad, IntArray axis={}, bool keepdim=false, bool reduce_all=false) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param: [x] - spmd_rule : ReductionGradInferSpmd - kernel : - func : max_grad - composite : max_grad(x, out, out_grad, axis, keepdim, reduce_all, x_grad) - - backward_op : maximum_grad forward : maximum(Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad) @@ -296,50 +266,6 @@ func : multiply_triple_grad optional : fwd_grad_grad_x, fwd_grad_grad_y, grad_x_grad, grad_y_grad, grad_grad_out_grad -- backward_op : pad_double_grad - forward : pad_grad(Tensor x, Tensor grad_out, int[] paddings, Scalar pad_value) -> Tensor(grad_x) - args : (Tensor grad_x_grad, int[] paddings, Scalar pad_value) - output : Tensor(grad_out_grad) - infer_meta : - func : PadInferMeta - kernel : - func : pad - -- backward_op : pad_grad - forward : pad(Tensor x, int[] paddings, Scalar pad_value) -> Tensor(out) - args : (Tensor x, Tensor out_grad, int[] paddings, Scalar pad_value) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param: [x] - kernel : - func : pad_grad - param: [out_grad, paddings, pad_value] - no_need_buffer : x - composite : pad_grad(x, out_grad, paddings, pad_value, x_grad) - backward : pad_double_grad - -- backward_op : prod_grad - forward : prod (Tensor x, IntArray dims, bool keep_dim, bool reduce_all) -> Tensor(out) - args : (Tensor x, Tensor out, Tensor out_grad, IntArray dims, bool keep_dim, bool reduce_all) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : prod_grad - composite: prod_grad(x, out, out_grad, dims, keep_dim, reduce_all, x_grad) - -- backward_op : repeat_interleave_grad - forward : repeat_interleave(Tensor x, int repeats, int axis) -> Tensor(out) - args : (Tensor x, Tensor out_grad, int repeats, int axis) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : repeat_interleave_grad - - backward_op : reshape_double_grad forward : reshape_grad (Tensor xshape, Tensor grad_out) -> Tensor(grad_x) args : (Tensor grad_out, Tensor grad_x_grad) @@ -380,26 +306,6 @@ func: set_value_with_scalar_grad param: [out_grad, starts, ends, steps, axes, decrease_axes, none_axes] -- backward_op : slice_double_grad - forward : slice_grad (Tensor input, Tensor grad_out, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) -> Tensor(grad_input) - args : (Tensor grad_input_grad, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) - output : Tensor(grad_out_grad) - invoke : slice(grad_input_grad, axes, starts, ends, infer_flags, decrease_axis) - -- backward_op : slice_grad - forward : slice (Tensor input, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) -> Tensor(out) - args : (Tensor input, Tensor out_grad, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) - output : Tensor(input_grad) - infer_meta : - func : UnchangedInferMeta - param : [input] - spmd_rule: SliceGradInferSpmdDynamic - kernel : - func : slice_grad - 
composite: slice_grad(input, out_grad, axes, starts, ends, infer_flags, decrease_axis, input_grad) - backward : slice_double_grad - no_need_buffer : input - - backward_op : softmax_grad forward : softmax (Tensor x, int axis) -> Tensor(out) args : (Tensor out, Tensor out_grad, int axis) @@ -412,20 +318,6 @@ func : softmax_grad composite : softmax_grad(out, out_grad, axis, x_grad) -- backward_op : split_grad - forward : split (Tensor x, IntArray num_or_sections, Scalar axis) -> Tensor[](out) - args : (Tensor[] out_grad, Scalar axis = -1) - output : Tensor(x_grad) - invoke : concat( out_grad, axis) - composite : split_grad(out_grad, axis, x_grad) - -- backward_op : split_with_num_grad - forward : split_with_num (Tensor x, int num, Scalar axis) -> Tensor[](out) - args : (Tensor[] out_grad, Scalar axis = -1) - output : Tensor(x_grad) - invoke : concat( out_grad, axis) - composite : split_grad(out_grad, axis, x_grad) - - backward_op : subtract_double_grad forward : subtract_grad (Tensor x, Tensor y, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y) args : (Tensor y, Tensor grad_out, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) @@ -455,26 +347,6 @@ backward : subtract_double_grad inplace : (out_grad -> x_grad) -- backward_op : sum_double_grad - forward : sum_grad (Tensor x, Tensor grad_out, IntArray axis, bool keepdim, bool reduce_all=false) -> Tensor(grad_x) - args : (Tensor grad_x_grad, IntArray axis={}, bool keepdim=false) - output : Tensor(grad_out_grad) - invoke : sum(grad_x_grad, axis, grad_x_grad.dtype(), keepdim) - -- backward_op : sum_grad - forward : sum (Tensor x, IntArray axis={}, DataType dtype=DataType::UNDEFINED, bool keepdim=false) -> Tensor(out) - args : (Tensor x, Tensor out_grad, IntArray axis, bool keepdim, bool reduce_all=false) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - spmd_rule : ReductionGradInferSpmd - kernel : - func : sum_grad - composite : sum_grad(x, out_grad, axis, keepdim, reduce_all, x_grad) - no_need_buffer : x - backward : sum_double_grad - - backward_op : tile_double_grad forward : tile_grad (Tensor x, Tensor grad_out, IntArray repeat_times) -> Tensor(grad_x) args : (Tensor grad_x_grad, IntArray repeat_times) @@ -495,46 +367,6 @@ composite : tile_grad(x, out_grad, repeat_times, x_grad) backward : tile_double_grad -- backward_op : transpose_double_grad - forward : transpose_grad (Tensor grad_out, int[] perm) -> Tensor(grad_x) - args : (Tensor grad_x_grad, int[] perm) - output : Tensor(grad_out_grad) - invoke : transpose(grad_x_grad, perm) - -- backward_op : transpose_grad - forward : transpose (Tensor x, int[] perm) -> Tensor(out) - args : (Tensor out_grad, int[] perm) - output : Tensor(x_grad) - infer_meta : - func : TransposeGradInferMeta - param : [out_grad, perm] - spmd_rule: TransposeGradInferSpmd - kernel : - func : transpose_grad - backward : transpose_double_grad - composite: transpose_grad(out_grad, perm, x_grad) - -- backward_op : tril_grad - forward : tril(Tensor x, int diagonal) -> Tensor(out) - args : (Tensor out_grad, int diagonal) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [out_grad] - kernel : - func : tril_grad - -- backward_op : triu_grad - forward : triu(Tensor x, int diagonal) -> Tensor(out) - args : (Tensor out_grad, int diagonal) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [out_grad] - spmd_rule : TriuGradInferSpmd - kernel : - func : triu_grad - - backward_op: fused_gemm_epilogue_grad forward : 
fused_gemm_epilogue(Tensor x, Tensor y, Tensor bias, bool trans_x, bool trans_y, str activation) -> Tensor(out), Tensor(reserve_space) args : (Tensor x, Tensor y, Tensor reserve_space, Tensor out_grad, bool trans_x, bool trans_y, str activation) diff --git a/paddle/phi/ops/yaml/inconsistent/dygraph_ops.yaml b/paddle/phi/ops/yaml/inconsistent/dygraph_ops.yaml index d8e8508f14cee..bd704f926a9a1 100755 --- a/paddle/phi/ops/yaml/inconsistent/dygraph_ops.yaml +++ b/paddle/phi/ops/yaml/inconsistent/dygraph_ops.yaml @@ -42,31 +42,6 @@ backward : assign_grad inplace : (x -> out) -- op : assign_out_ - args : (Tensor x, Tensor output) - output : Tensor(out) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : assign - param : [x] - inplace : (output -> out) - backward : assign_out__grad - -- op : assign_value_ - args : (Tensor output, int[] shape, DataType dtype, Scalar[] values, Place place = {}) - output : Tensor(out) - inplace: (output -> out) - infer_meta : - func : AssignValueInferMeta - param : [shape, dtype] - kernel : - func : assign_value - param : [shape, dtype, values] - data_type : dtype - backend : place > output - - op : batch_norm args : (Tensor x, Tensor mean, Tensor variance, Tensor scale, Tensor bias, bool is_test, float momentum, float epsilon, str data_format, bool use_global_stats, bool trainable_statistics) output : Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) @@ -91,19 +66,6 @@ data_type : weight backward : c_embedding_grad -- op : cast - args : (Tensor x, DataType dtype) - output : Tensor(out) - infer_meta : - func : CastInferMeta - spmd_rule : CastInferSpmd - kernel : - func : cast - param : [x, dtype] - data_type : x - inplace: (x -> out) - backward : cast_grad - - op : distribute_fpn_proposals args : (Tensor fpn_rois, Tensor rois_num, int min_level, int max_level, int refer_level, int refer_scale, bool pixel_offset) output : Tensor[](multi_fpn_rois){max_level - min_level + 1}, Tensor[](multi_level_rois_num){max_level - min_level + 1}, Tensor(restore_index) @@ -169,18 +131,6 @@ func : embedding_grad data_type : weight -- op : empty - args : (IntArray shape, DataType dtype=DataType::FLOAT32, Place place=CPUPlace()) - output: Tensor(out) - infer_meta : - func : CreateInferMeta - param : [shape, dtype] - kernel : - func : empty - param : [shape, dtype] - data_type : dtype - backend : place - - op : equal args : (Tensor x, Tensor y) output : Tensor(out) @@ -191,17 +141,6 @@ func : equal inplace: (x -> out) -- op : exponential_ - args : (Tensor x, float lam) - output : Tensor(out) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : exponential - inplace : (x -> out) - backward : exponential__grad - - op : floor_divide args : (Tensor x, Tensor y) output : Tensor(out) @@ -211,16 +150,6 @@ func : floor_divide inplace: (x -> out) -- op : full_with_tensor - args : (Tensor value, IntArray shape, DataType dtype=DataType::FLOAT32) - output: Tensor(out) - infer_meta : - func : FullWithTensorInferMeta - param : [shape, dtype] - kernel : - func : full_with_tensor - data_type : dtype - - op : fused_adam_ args : (Tensor[] params, Tensor[] grads, Tensor learning_rate, Tensor[] moments1, Tensor[] moments2, Tensor[] beta1_pows, Tensor[] beta2_pows, Tensor[] master_params, Tensor skip_update, Scalar beta1, Scalar beta2, Scalar epsilon, int chunk_size, float weight_decay, bool use_adamw, bool multi_precision, bool use_global_beta_pow) output : 
Tensor[](params_out){params.size()}, Tensor[](moments1_out){params.size()}, Tensor[](moments2_out){params.size()}, Tensor[](beta1_pows_out){params.size()}, Tensor[](beta2_pows_out){params.size()}, Tensor[](master_params_out){params.size()} @@ -239,18 +168,6 @@ backward: fused_gemm_epilogue_grad optional: reserve_space -- op : gaussian - args : (IntArray shape, float mean, float std, int seed, DataType dtype, Place place={}) - output: Tensor(out) - infer_meta : - func : GaussianInferMeta - param : [shape, mean, std, seed, dtype] - kernel : - func : gaussian - param : [shape, mean, std, seed, dtype] - data_type : dtype - backend : place - - op : greater_equal args : (Tensor x, Tensor y) output : Tensor(out) @@ -279,15 +196,6 @@ func : hardswish backward : hardswish_grad -- op : increment - args : (Tensor x, float value = 1.0) - output : Tensor(out) - infer_meta : - func : IncrementInferMeta - kernel : - func : increment - inplace : (x -> out) - - op : less_equal args : (Tensor x, Tensor y) output : Tensor(out) @@ -306,39 +214,6 @@ func : less_than inplace: (x -> out) -- op : linspace - args : (Tensor start, Tensor stop, Tensor number, DataType dtype, Place place) - output : Tensor(out) - infer_meta : - func : LinspaceInferMeta - param: [start, stop, number, dtype] - kernel : - func : linspace - param: [start, stop, number, dtype] - data_type : dtype - backend : place - -- op : logspace - args : (Tensor start, Tensor stop, Tensor num, Tensor base, DataType dtype, Place place={}) - output : Tensor(out) - infer_meta: - func : LogspaceInferMeta - param : [start, stop, num, base, dtype] - kernel : - func : logspace - param : [start, stop, num, base, dtype] - data_type : dtype - backend : place - -- op : logsumexp - args : (Tensor x, int64_t[] axis, bool keepdim, bool reduce_all) - output : Tensor(out) - infer_meta : - func : LogsumexpInferMeta - kernel : - func : logsumexp - backward : logsumexp_grad - - op : matmul args : (Tensor x, Tensor y, bool transpose_x = false, bool transpose_y = false) output : Tensor @@ -349,16 +224,6 @@ func : matmul backward : matmul_grad -- op : max - args : (Tensor x, IntArray axis={}, bool keepdim=false) - output : Tensor(out) - infer_meta : - func : ReduceIntArrayAxisInferMeta - spmd_rule: ReductionMaxInferSpmdDynamic - kernel : - func : max - backward : max_grad - - op : maximum args : (Tensor x, Tensor y) output : Tensor(out) @@ -409,48 +274,6 @@ func : not_equal inplace: (x -> out) -- op : pad - args : (Tensor x, int[] paddings, Scalar pad_value) - output : Tensor - infer_meta : - func : PadInferMeta - kernel : - func : pad - backward : pad_grad - -- op : prod - args : (Tensor x, IntArray dims, bool keep_dim, bool reduce_all) - output : Tensor - infer_meta : - func : ReduceIntArrayAxisInferMetaBase - kernel : - func : prod - backward : prod_grad - -- op : randint - args : (int low, int high, IntArray shape, DataType dtype=DataType::INT64, Place place={}) - output : Tensor(out) - infer_meta : - func : RandintInferMeta - param : [low, high, shape, dtype] - kernel : - func : randint - param : [low, high, shape, dtype] - data_type : dtype - backend : place - -- op : randperm - args : (int n, DataType dtype, Place place={}) - output : Tensor(out) - infer_meta : - func : RandpermInferMeta - param : [n, dtype] - kernel : - func : randperm - param : [n, dtype] - data_type : dtype - backend : place - - op : remainder args : (Tensor x, Tensor y) output : Tensor (out) @@ -460,16 +283,6 @@ func : remainder inplace : (x -> out) -- op : repeat_interleave - args : (Tensor x, 
int repeats, int axis) - output : Tensor(out) - infer_meta : - func : RepeatInterleaveInferMeta - kernel : - func : repeat_interleave - data_type : x - backward: repeat_interleave_grad - - op : reshape args : (Tensor x, IntArray shape) output : Tensor(out), Tensor(xshape) @@ -485,15 +298,6 @@ intermediate : xshape backward: reshape_grad -- op : sequence_mask - args: (Tensor x, Scalar(int) max_len, DataType out_dtype) - output: Tensor(y) - infer_meta: - func: SequenceMaskScalarInferMeta - kernel: - func: sequence_mask_scalar - data_type : x - - op : set_value args : (Tensor x, IntArray starts, IntArray ends, IntArray steps, int64_t[] axes, int64_t[] decrease_axes, int64_t[] none_axes, int64_t[] shape, Scalar[] values) output : Tensor(out) @@ -505,16 +309,6 @@ func : set_value backward: set_value_grad -- op : slice - args : (Tensor input, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) - output : Tensor - infer_meta : - func : SliceRawInferMeta - spmd_rule : SliceInferSpmdDynamic - kernel : - func : slice - backward : slice_grad - - op : softmax args : (Tensor x, int axis) output : Tensor(out) @@ -526,25 +320,6 @@ inplace : (x -> out) backward : softmax_grad -- op : split - args : (Tensor x, IntArray sections, Scalar(int) axis) - output : Tensor[]{sections.size()} - infer_meta : - func : SplitInferMeta - kernel : - func : split - backward : split_grad - -- op : split_with_num - args : (Tensor x, int num, Scalar(int) axis) - output : Tensor[]{num} - infer_meta : - func : SplitWithNumInferMeta - spmd_rule : SplitWithNumInferSpmdDynamic - kernel : - func : split_with_num - backward : split_with_num_grad - - op : subtract args : (Tensor x, Tensor y) output : Tensor(out) @@ -556,17 +331,6 @@ inplace : (x -> out) backward : subtract_grad -- op : sum - args : (Tensor x, IntArray axis={}, DataType dtype=DataType::UNDEFINED, bool keepdim=false) - output : Tensor(out) - infer_meta : - func : SumInferMeta - spmd_rule : ReductionSumInferSpmdDynamic - kernel : - func : sum - data_type : x - backward : sum_grad - - op : tile args : (Tensor x, IntArray repeat_times = {}) output : Tensor(out) @@ -577,87 +341,6 @@ func : tile backward : tile_grad -- op : transpose - args : (Tensor x, int[] perm) - output : Tensor(out) - infer_meta : - func : TransposeInferMeta - spmd_rule: TransposeInferSpmd - kernel : - func : transpose - inplace : (x -> out) - backward : transpose_grad - -- op : tril - args : (Tensor x, int diagonal) - output : Tensor(out) - infer_meta : - func : TrilInferMeta - kernel : - func : tril - inplace: (x -> out) - backward : tril_grad - -- op : tril_indices - args : (int rows, int cols, int offset, DataType dtype, Place place={}) - output : Tensor(out) - infer_meta : - func : TrilIndicesInferMeta - param : [rows, cols, offset, dtype] - kernel : - func : tril_indices - param : [rows, cols, offset, dtype] - data_type : dtype - backend : place - -- op : triu - args : (Tensor x, int diagonal) - output : Tensor(out) - infer_meta : - func : TriuInferMeta - spmd_rule : TriuInferSpmd - kernel : - func : triu - inplace: (x -> out) - backward : triu_grad - -- op : triu_indices - args : (int row, int col, int offset, DataType dtype, Place place={}) - output : Tensor(out) - infer_meta : - func : TriuIndicesInferMeta - param : [row, col, offset, dtype] - kernel : - func : triu_indices - param : [row, col, offset, dtype] - data_type : dtype - backend : place - -# python API: paddle.nn.initializer.TruncatedNormal -- op : truncated_gaussian_random - args : (int[] 
shape, float mean, float std, int seed, float a, float b, DataType dtype=DataType::FLOAT32, Place place={}) - output : Tensor(out) - infer_meta : - func : TruncatedGaussianRandomInferMeta - param : [shape, mean, std, seed, a, b, dtype] - kernel : - func : truncated_gaussian_random - param : [shape, mean, std, seed, a, b, dtype] - backend : place - data_type : dtype - -- op : uniform - args : (IntArray shape, DataType dtype, Scalar min, Scalar max, int seed, Place place={}) - output : Tensor(out) - infer_meta : - func : UniformRandomInferMeta - param: [shape, dtype] - kernel : - func : uniform - param: [shape, dtype, min, max, seed] - data_type : dtype - backend : place - # The `axis` argument of Python API paddle.unique is not vector - op : unique args : (Tensor x, bool return_index, bool return_inverse, bool return_counts, int[] axis, DataType dtype=DataType::INT64) diff --git a/paddle/phi/ops/yaml/inconsistent/static_backward.yaml b/paddle/phi/ops/yaml/inconsistent/static_backward.yaml index 419c3e7cbc454..f408cece8e006 100644 --- a/paddle/phi/ops/yaml/inconsistent/static_backward.yaml +++ b/paddle/phi/ops/yaml/inconsistent/static_backward.yaml @@ -121,14 +121,6 @@ func: c_softmax_with_cross_entropy_grad data_type: loss_grad -- backward_op : cast_grad - forward : cast (Tensor x, DataType dtype) -> Tensor(out) - args : (Tensor x, Tensor out_grad) - output : Tensor(x_grad) - invoke : cast (out_grad, x.dtype()) - composite: cast_grad(x, out_grad, x_grad) - no_need_buffer : x - - backward_op : divide_double_grad forward : divide_grad (Tensor x, Tensor y, Tensor out, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y) args : (Tensor y, Tensor out, Tensor grad_out, Tensor grad_x, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) @@ -234,16 +226,6 @@ inplace : (out_grad -> x_grad) composite : hardswish_grad(x, out_grad, x_grad) -- backward_op : logsumexp_grad - forward : logsumexp(Tensor x, int64_t[] axis, bool keepdim, bool reduce_all) -> Tensor(out) - args : (Tensor x, Tensor out, Tensor out_grad, int64_t[] axis, bool keepdim, bool reduce_all) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param: [x] - kernel : - func : logsumexp_grad - - backward_op : lrn_grad forward : lrn (Tensor x, int n=5, float k=2.0, float alpha=0.0001, float beta=0.75, str data_format="NCHW") -> Tensor(out), Tensor(mid_out) args : (Tensor x, Tensor out, Tensor mid_out, Tensor out_grad, int n=5, float k=2.0, float alpha=0.0001, float beta=0.75, str data_format="NCHW") @@ -289,18 +271,6 @@ kernel : func : matmul_with_flatten_grad -- backward_op : max_grad - forward: max (Tensor x, IntArray axis={}, bool keepdim=false) -> Tensor(out) - args : (Tensor x, Tensor out, Tensor out_grad, IntArray axis={}, bool keepdim=false, bool reduce_all=false) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param: [x] - spmd_rule : ReductionGradInferSpmd - kernel : - func : max_grad - composite : max_grad(x, out, out_grad, axis, keepdim, reduce_all, x_grad) - - backward_op : maximum_grad forward : maximum(Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad) @@ -384,40 +354,6 @@ data_type: input optional: bias, sample_weight, custom_dist_probs, custom_dist_alias, custom_dist_alias_probs -- backward_op : pad_double_grad - forward : pad_grad(Tensor x, Tensor grad_out, int[] paddings, Scalar pad_value) -> Tensor(grad_x) - args : (Tensor grad_x_grad, int[] paddings, Scalar pad_value) - output : Tensor(grad_out_grad) - infer_meta : - func : PadInferMeta 
- kernel : - func : pad - -- backward_op : pad_grad - forward : pad(Tensor x, int[] paddings, Scalar pad_value) -> Tensor(out) - args : (Tensor x, Tensor out_grad, int[] paddings, Scalar pad_value) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param: [x] - kernel : - func : pad_grad - param: [out_grad, paddings, pad_value] - no_need_buffer : x - composite : pad_grad(x, out_grad, paddings, pad_value, x_grad) - backward : pad_double_grad - -- backward_op : prod_grad - forward : prod (Tensor x, IntArray dims, bool keep_dim, bool reduce_all) -> Tensor(out) - args : (Tensor x, Tensor out, Tensor out_grad, IntArray dims, bool keep_dim, bool reduce_all) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : prod_grad - composite: prod_grad(x, out, out_grad, dims, keep_dim, reduce_all, x_grad) - - backward_op : push_box_sparse forward : pull_box_sparse (Tensor w, Tensor[] ids, bool is_sparse = false, bool is_distributed = false, int size = 1) -> Tensor[](out){ids.size()} args : (Tensor[] ids, Tensor[] out_grad_in, bool is_sparse = false, bool is_distributed = false, int size = 1) @@ -430,16 +366,6 @@ data_type : out_grad_in inplace : (out_grad_in -> out_grad_out) -- backward_op : repeat_interleave_grad - forward : repeat_interleave(Tensor x, int repeats, int axis) -> Tensor(out) - args : (Tensor x, Tensor out_grad, int repeats, int axis) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : repeat_interleave_grad - - backward_op : reshape_double_grad forward : reshape_grad (Tensor xshape, Tensor grad_out) -> Tensor(grad_x) args : (Tensor grad_out, Tensor grad_x_grad) @@ -490,26 +416,6 @@ func: assign param: [out_grad] -- backward_op : slice_double_grad - forward : slice_grad (Tensor input, Tensor grad_out, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) -> Tensor(grad_input) - args : (Tensor grad_input_grad, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) - output : Tensor(grad_out_grad) - invoke : slice(grad_input_grad, axes, starts, ends, infer_flags, decrease_axis) - -- backward_op : slice_grad - forward : slice (Tensor input, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) -> Tensor(out) - args : (Tensor input, Tensor out_grad, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) - output : Tensor(input_grad) - infer_meta : - func : UnchangedInferMeta - param : [input] - spmd_rule: SliceGradInferSpmdDynamic - kernel : - func : slice_grad - composite: slice_grad(input, out_grad, axes, starts, ends, infer_flags, decrease_axis, input_grad) - backward : slice_double_grad - no_need_buffer : input - - backward_op : soft_relu_grad forward : soft_relu (Tensor x, float threshold) -> Tensor(out) args : (Tensor out, Tensor out_grad, float threshold) @@ -531,20 +437,6 @@ func : softmax_grad composite : softmax_grad(out, out_grad, axis, x_grad) -- backward_op : split_grad - forward : split (Tensor x, IntArray num_or_sections, Scalar axis) -> Tensor[](out) - args : (Tensor[] out_grad, Scalar axis = -1) - output : Tensor(x_grad) - invoke : concat( out_grad, axis) - composite : split_grad(out_grad, axis, x_grad) - -- backward_op : split_with_num_grad - forward : split_with_num (Tensor x, int num, Scalar axis) -> Tensor[](out) - args : (Tensor[] out_grad, Scalar axis = -1) - output : Tensor(x_grad) - invoke 
: concat( out_grad, axis) - composite : split_grad(out_grad, axis, x_grad) - - backward_op : subtract_double_grad forward : subtract_grad (Tensor x, Tensor y, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y) args : (Tensor y, Tensor grad_out, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) @@ -574,26 +466,6 @@ backward : subtract_double_grad inplace : (out_grad -> x_grad) -- backward_op : sum_double_grad - forward : sum_grad (Tensor x, Tensor grad_out, IntArray axis, bool keepdim, bool reduce_all=false) -> Tensor(grad_x) - args : (Tensor grad_x_grad, IntArray axis={}, bool keepdim=false) - output : Tensor(grad_out_grad) - invoke : sum(grad_x_grad, axis, grad_x_grad.dtype(), keepdim) - -- backward_op : sum_grad - forward : sum (Tensor x, IntArray axis={}, DataType dtype=DataType::UNDEFINED, bool keepdim=false) -> Tensor(out) - args : (Tensor x, Tensor out_grad, IntArray axis, bool keepdim, bool reduce_all=false) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - spmd_rule : ReductionGradInferSpmd - kernel : - func : sum_grad - composite : sum_grad(x, out_grad, axis, keepdim, reduce_all, x_grad) - no_need_buffer : x - backward : sum_double_grad - - backward_op : tile_double_grad forward : tile_grad (Tensor x, Tensor grad_out, IntArray repeat_times) -> Tensor(grad_x) args : (Tensor grad_x_grad, IntArray repeat_times) @@ -613,46 +485,6 @@ composite : tile_grad(x, out_grad, repeat_times, x_grad) backward : tile_double_grad -- backward_op : transpose_double_grad - forward : transpose_grad (Tensor grad_out, int[] perm) -> Tensor(grad_x) - args : (Tensor grad_x_grad, int[] perm) - output : Tensor(grad_out_grad) - invoke : transpose(grad_x_grad, perm) - -- backward_op : transpose_grad - forward : transpose (Tensor x, int[] perm) -> Tensor(out) - args : (Tensor out_grad, int[] perm) - output : Tensor(x_grad) - infer_meta : - func : TransposeGradInferMeta - param : [out_grad, perm] - spmd_rule: TransposeGradInferSpmd - kernel : - func : transpose_grad - backward : transpose_double_grad - composite: transpose_grad(out_grad, perm, x_grad) - -- backward_op : tril_grad - forward : tril(Tensor x, int diagonal) -> Tensor(out) - args : (Tensor out_grad, int diagonal) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [out_grad] - kernel : - func : tril_grad - -- backward_op : triu_grad - forward : triu(Tensor x, int diagonal) -> Tensor(out) - args : (Tensor out_grad, int diagonal) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [out_grad] - spmd_rule : TriuGradInferSpmd - kernel : - func : triu_grad - - backward_op: fused_elemwise_add_activation_grad forward: fused_elemwise_add_activation(Tensor x, Tensor y, str[] functor_list, float scale=0.0, int axis=-1, bool save_intermediate_out=false) -> Tensor(out), Tensor(intermediate_out) args: (Tensor x, Tensor y, Tensor out, Tensor intermediate_out, Tensor out_grad, str[] functor_list, float scale=0.0, int axis=-1, bool save_intermediate_out=false) diff --git a/paddle/phi/ops/yaml/inconsistent/static_ops.yaml b/paddle/phi/ops/yaml/inconsistent/static_ops.yaml index 92505d3399af6..ddfe98cefcc80 100644 --- a/paddle/phi/ops/yaml/inconsistent/static_ops.yaml +++ b/paddle/phi/ops/yaml/inconsistent/static_ops.yaml @@ -49,19 +49,6 @@ inplace : (x -> out) interfaces : paddle::dialect::InferSymbolicShapeInterface, paddle::dialect::LayoutTransformationInterface -- op : assign_out_ - args : (Tensor x, Tensor output) - output : Tensor(out) - infer_meta : - 
func : UnchangedInferMeta - param : [x] - kernel : - func : assign - param : [x] - inplace : (output -> out) - backward : assign_out__grad - traits : pir::SideEffectTrait - - op : assign_pos args : (Tensor x, Tensor cum_count, Tensor eff_num_len) output : Tensor(out) @@ -83,20 +70,6 @@ data_type : dtype interfaces : paddle::dialect::InferSymbolicShapeInterface -- op : assign_value_ - args : (Tensor output, int[] shape, DataType dtype, Scalar[] values, Place place = {}) - output : Tensor(out) - inplace: (output -> out) - infer_meta : - func : AssignValueInferMeta - param : [shape, dtype] - kernel : - func : assign_value - param : [shape, dtype, values] - data_type : dtype - backend : place > output - interfaces : paddle::dialect::InferSymbolicShapeInterface - - op : barrier args : (Tensor x, int ring_id=0) output : Tensor(out) @@ -213,20 +186,6 @@ kernel : func : c_split -- op : cast - args : (Tensor x, DataType dtype) - output : Tensor(out) - infer_meta : - func : CastInferMeta - spmd_rule : CastInferSpmd - kernel : - func : cast - param : [x, dtype] - data_type : x - inplace: (x -> out) - backward : cast_grad - interfaces : paddle::dialect::InferSymbolicShapeInterface - - op : coalesce_tensor_ args : (Tensor[] input, DataType dtype, bool copy_data = false, bool set_constant = false, bool persist_output = false, float constant = 0.0, bool use_align = true, int align_size = -1, int size_of_dtype = -1, int64_t[] concated_shapes = {}, int64_t[] concated_ranks = {}) output : Tensor[](output){input.size()}, Tensor(fused_output) @@ -352,19 +311,6 @@ backward : embedding_grad interfaces : paddle::dialect::InferSymbolicShapeInterface -- op : empty - args : (IntArray shape, DataType dtype=DataType::FLOAT32, Place place=CPUPlace()) - output: Tensor(out) - infer_meta : - func : CreateInferMeta - param : [shape, dtype] - kernel : - func : empty - param : [shape, dtype] - data_type : dtype - backend : place - interfaces : paddle::dialect::InferSymbolicShapeInterface - - op : equal args : (Tensor x, Tensor y) output : Tensor(out) @@ -378,18 +324,6 @@ inplace: (x -> out) interfaces : paddle::dialect::InferSymbolicShapeInterface -- op : exponential_ - args : (Tensor x, float lam) - output : Tensor(out) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : exponential - inplace : (x -> out) - backward : exponential__grad - interfaces : paddle::dialect::InferSymbolicShapeInterface - - op : feed args : (str name, int col) output : Tensor(out) @@ -419,17 +353,6 @@ support_trans_dtype : x, y inplace: (x -> out) -- op : full_with_tensor - args : (Tensor value, IntArray shape, DataType dtype=DataType::FLOAT32) - output: Tensor(out) - infer_meta : - func : FullWithTensorInferMeta - param : [shape, dtype] - kernel : - func : full_with_tensor - data_type : dtype - interfaces : paddle::dialect::InferSymbolicShapeInterface - - op : fused_adam_ args : (Tensor[] params, Tensor[] grads, Tensor learning_rate, Tensor[] moments1, Tensor[] moments2, Tensor[] beta1_pows, Tensor[] beta2_pows, Tensor[] master_params, Tensor skip_update, Scalar beta1, Scalar beta2, Scalar epsilon, int chunk_size, float weight_decay, bool use_adamw, bool multi_precision, bool use_global_beta_pow) output : Tensor[](params_out){params.size()}, Tensor[](moments1_out){params.size()}, Tensor[](moments2_out){params.size()}, Tensor[](beta1_pows_out){params.size()}, Tensor[](beta2_pows_out){params.size()}, Tensor[](master_params_out){params.size()} @@ -441,20 +364,6 @@ optional : skip_update, master_params, master_params_out 
inplace : (params -> params_out), (moments1 -> moments1_out), (moments2 -> moments2_out), (beta1_pows -> beta1_pows_out), (beta2_pows -> beta2_pows_out), (master_params -> master_params_out) -- op : gaussian - args : (IntArray shape, float mean, float std, int seed, DataType dtype, Place place={}) - output: Tensor(out) - infer_meta : - func : GaussianInferMeta - param : [shape, mean, std, seed, dtype] - kernel : - func : gaussian - param : [shape, mean, std, seed, dtype] - data_type : dtype - backend : place - interfaces : paddle::dialect::InferSymbolicShapeInterface - traits : pir::SideEffectTrait - - op : get_tensor_from_selected_rows args : (Tensor x) output : Tensor(out) @@ -517,16 +426,6 @@ func : hardswish backward : hardswish_grad -- op : increment - args : (Tensor x, float value = 1.0) - output : Tensor(out) - infer_meta : - func : IncrementInferMeta - kernel : - func : increment - inplace : (x -> out) - interfaces : paddle::dialect::InferSymbolicShapeInterface - - op : less_equal args : (Tensor x, Tensor y) output : Tensor(out) @@ -551,19 +450,6 @@ inplace: (x -> out) interfaces : paddle::dialect::InferSymbolicShapeInterface -- op : linspace - args : (Tensor start, Tensor stop, Tensor number, DataType dtype, Place place) - output : Tensor(out) - infer_meta : - func : LinspaceInferMeta - param: [start, stop, number, dtype] - kernel : - func : linspace - param: [start, stop, number, dtype] - data_type : dtype - backend : place - interfaces : paddle::dialect::InferSymbolicShapeInterface - - op : load_combine args : (str file_path, bool load_as_fp16, bool model_from_memory) output : Tensor[](Out) @@ -576,29 +462,6 @@ args : (Tensor[] x) output : Tensor(out) -- op : logspace - args : (Tensor start, Tensor stop, Tensor num, Tensor base, DataType dtype, Place place={}) - output : Tensor(out) - infer_meta: - func : LogspaceInferMeta - param : [start, stop, num, base, dtype] - kernel : - func : logspace - param : [start, stop, num, base, dtype] - data_type : dtype - backend : place - interfaces : paddle::dialect::InferSymbolicShapeInterface - -- op : logsumexp - args : (Tensor x, int64_t[] axis, bool keepdim, bool reduce_all) - output : Tensor(out) - infer_meta : - func : LogsumexpInferMeta - kernel : - func : logsumexp - backward : logsumexp_grad - interfaces : paddle::dialect::InferSymbolicShapeInterface - - op : lrn args : (Tensor x, int n=5, float k=2.0, float alpha=0.0001, float beta=0.75, str data_format="NCHW") output : Tensor(out), Tensor(mid_out) @@ -632,17 +495,6 @@ data_type : x backward : matmul_with_flatten_grad -- op : max - args : (Tensor x, IntArray axis={}, bool keepdim=false) - output : Tensor(out) - infer_meta : - func : ReduceIntArrayAxisInferMeta - spmd_rule: ReductionMaxInferSpmdDynamic - kernel : - func : max - backward : max_grad - interfaces : paddle::dialect::InferSymbolicShapeInterface - - op : maximum args : (Tensor x, Tensor y) output : Tensor(out) @@ -726,16 +578,6 @@ inplace: (x -> out) interfaces : paddle::dialect::InferSymbolicShapeInterface -- op : pad - args : (Tensor x, int[] paddings, Scalar pad_value) - output : Tensor - infer_meta : - func : PadInferMeta - kernel : - func : pad - backward : pad_grad - interfaces : paddle::dialect::InferSymbolicShapeInterface - - op : partial_allgather args : (Tensor x, int nranks, int rank, int ring_id = 0, bool use_calc_stream = false) output : Tensor(out) @@ -766,16 +608,6 @@ interfaces : paddle::dialect::InferSymbolicShapeInterface traits : pir::SideEffectTrait -- op : prod - args : (Tensor x, IntArray dims, bool 
keep_dim, bool reduce_all) - output : Tensor - infer_meta : - func : ReduceIntArrayAxisInferMetaBase - kernel : - func : prod - backward : prod_grad - interfaces : paddle::dialect::InferSymbolicShapeInterface - - op : prune_gate_by_capacity args : (Tensor gate_idx, Tensor expert_count, int64_t n_expert, int64_t n_worker) output : Tensor(new_gate_idx) @@ -852,33 +684,6 @@ optional : scale, in_accum, in_state, out_state, out_accum, out_scale inplace : (scale -> out_scale, in_accum -> out_accum, in_state -> out_state) -- op : randint - args : (int low, int high, IntArray shape, DataType dtype=DataType::INT64, Place place={}) - output : Tensor(out) - infer_meta : - func : RandintInferMeta - param : [low, high, shape, dtype] - kernel : - func : randint - param : [low, high, shape, dtype] - data_type : dtype - backend : place - interfaces : paddle::dialect::InferSymbolicShapeInterface - traits : pir::SideEffectTrait - -- op : randperm - args : (int n, DataType dtype, Place place={}) - output : Tensor(out) - infer_meta : - func : RandpermInferMeta - param : [n, dtype] - kernel : - func : randperm - param : [n, dtype] - data_type : dtype - backend : place - traits : pir::SideEffectTrait - - op : recv_v2 args : (int[] out_shape = {}, DataType dtype = DataType::FLOAT32, int peer = 0, int ring_id = 0, bool use_calc_stream = false, bool dynamic_shape = false) output : Tensor(out) @@ -902,17 +707,6 @@ inplace : (x -> out) interfaces : paddle::dialect::InferSymbolicShapeInterface -- op : repeat_interleave - args : (Tensor x, int repeats, int axis) - output : Tensor(out) - infer_meta : - func : RepeatInterleaveInferMeta - kernel : - func : repeat_interleave - data_type : x - backward: repeat_interleave_grad - interfaces : paddle::dialect::InferSymbolicShapeInterface - - op : reshape args : (Tensor x, IntArray shape) output : Tensor(out), Tensor(xshape) @@ -1029,17 +823,6 @@ backward : shuffle_batch_grad traits : pir::SideEffectTrait -- op : slice - args : (Tensor input, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) - output : Tensor - infer_meta : - func : SliceRawInferMeta - spmd_rule : SliceInferSpmdDynamic - kernel : - func : slice - backward : slice_grad - interfaces : paddle::dialect::InferSymbolicShapeInterface - - op : soft_relu args : (Tensor x, float threshold = 20.0f) output : Tensor(out) @@ -1062,27 +845,6 @@ backward : softmax_grad interfaces : paddle::dialect::InferSymbolicShapeInterface -- op : split - args : (Tensor x, IntArray sections, Scalar(int) axis) - output : Tensor[]{sections.size()} - infer_meta : - func : SplitInferMeta - kernel : - func : split - backward : split_grad - interfaces : paddle::dialect::InferSymbolicShapeInterface - -- op : split_with_num - args : (Tensor x, int num, Scalar(int) axis) - output : Tensor[]{num} - infer_meta : - func : SplitWithNumInferMeta - spmd_rule : SplitWithNumInferSpmdDynamic - kernel : - func : split_with_num - backward : split_with_num_grad - interfaces : paddle::dialect::InferSymbolicShapeInterface - - op : subtract args : (Tensor x, Tensor y) output : Tensor(out) @@ -1097,18 +859,6 @@ backward : subtract_grad interfaces : paddle::dialect::InferSymbolicShapeInterface -- op : sum - args : (Tensor x, IntArray axis={}, DataType dtype=DataType::UNDEFINED, bool keepdim=false) - output : Tensor(out) - infer_meta : - func : SumInferMeta - spmd_rule : ReductionSumInferSpmdDynamic - kernel : - func : sum - data_type : x - backward : sum_grad - interfaces : paddle::dialect::InferSymbolicShapeInterface - 
- op : tdm_sampler args: (Tensor x, Tensor travel, Tensor layer, bool output_positive=true, int[] neg_samples_num_list={}, int[] layer_offset_lod={}, int seed = 0, int dtype=2) output: Tensor(out), Tensor(labels), Tensor(mask) @@ -1130,95 +880,6 @@ backward : tile_grad interfaces : paddle::dialect::InferSymbolicShapeInterface -- op : transpose - args : (Tensor x, int[] perm) - output : Tensor(out) - infer_meta : - func : TransposeInferMeta - spmd_rule: TransposeInferSpmd - kernel : - func : transpose - inplace : (x -> out) - backward : transpose_grad - interfaces : paddle::dialect::InferSymbolicShapeInterface - -- op : tril - args : (Tensor x, int diagonal) - output : Tensor(out) - infer_meta : - func : TrilInferMeta - kernel : - func : tril - inplace: (x -> out) - backward : tril_grad - interfaces : paddle::dialect::InferSymbolicShapeInterface - -- op : tril_indices - args : (int rows, int cols, int offset, DataType dtype, Place place={}) - output : Tensor(out) - infer_meta : - func : TrilIndicesInferMeta - param : [rows, cols, offset, dtype] - kernel : - func : tril_indices - param : [rows, cols, offset, dtype] - data_type : dtype - backend : place - interfaces : paddle::dialect::InferSymbolicShapeInterface - -- op : triu - args : (Tensor x, int diagonal) - output : Tensor(out) - infer_meta : - func : TriuInferMeta - spmd_rule : TriuInferSpmd - kernel : - func : triu - inplace: (x -> out) - backward : triu_grad - interfaces : paddle::dialect::InferSymbolicShapeInterface - -- op : triu_indices - args : (int row, int col, int offset, DataType dtype, Place place={}) - output : Tensor(out) - infer_meta : - func : TriuIndicesInferMeta - param : [row, col, offset, dtype] - kernel : - func : triu_indices - param : [row, col, offset, dtype] - data_type : dtype - backend : place - interfaces : paddle::dialect::InferSymbolicShapeInterface - -# python API: paddle.nn.initializer.TruncatedNormal -- op : truncated_gaussian_random - args : (int[] shape, float mean, float std, int seed, float a, float b, DataType dtype=DataType::FLOAT32, Place place={}) - output : Tensor(out) - infer_meta : - func : TruncatedGaussianRandomInferMeta - param : [shape, mean, std, seed, a, b, dtype] - kernel : - func : truncated_gaussian_random - param : [shape, mean, std, seed, a, b, dtype] - backend : place - data_type : dtype - traits : pir::SideEffectTrait - -- op : uniform - args : (IntArray shape, DataType dtype, Scalar min, Scalar max, int seed, Place place={}) - output : Tensor(out) - infer_meta : - func : UniformRandomInferMeta - param: [shape, dtype] - kernel : - func : uniform - param: [shape, dtype, min, max, seed] - data_type : dtype - backend : place - interfaces : paddle::dialect::InferSymbolicShapeInterface - traits : pir::SideEffectTrait - - op : unique args : (Tensor x, bool return_index=false, bool return_inverse=false, bool return_counts=false, int[] axis={}, DataType dtype=DataType::INT64, bool is_sorted=false) output : Tensor(out), Tensor(indices), Tensor(inverse), Tensor(counts) diff --git a/paddle/phi/ops/yaml/inconsistent/update_ops.yaml b/paddle/phi/ops/yaml/inconsistent/update_ops.yaml index 6110e5a8e1677..f8972c5cab25a 100644 --- a/paddle/phi/ops/yaml/inconsistent/update_ops.yaml +++ b/paddle/phi/ops/yaml/inconsistent/update_ops.yaml @@ -16,12 +16,3 @@ backend : place support_tensor : [start, end, step] interfaces : paddle::dialect::InferSymbolicShapeInterface - -- op : sequence_mask - args: (Tensor x, Scalar(int) max_len, DataType out_dtype) - output: Tensor(y) - infer_meta: - func: 
SequenceMaskScalarInferMeta - kernel: - func: sequence_mask_scalar - data_type : x diff --git a/paddle/phi/ops/yaml/legacy/backward_exclude.yaml b/paddle/phi/ops/yaml/legacy/backward_exclude.yaml index 3cded3e4aa1e9..f372b7b968f80 100644 --- a/paddle/phi/ops/yaml/legacy/backward_exclude.yaml +++ b/paddle/phi/ops/yaml/legacy/backward_exclude.yaml @@ -4,6 +4,7 @@ - amax_grad - amin_grad +- cast_grad - conv2d_transpose_double_grad - conv2d_transpose_grad - deformable_conv_grad @@ -17,19 +18,35 @@ - fused_softmax_mask_grad - fused_softmax_mask_upper_triangle_grad - hsigmoid_loss_grad +- logsumexp_grad +- max_grad - mean_double_grad - mean_grad - mish_grad - norm_grad +- pad_double_grad +- pad_grad - pool2d_double_grad - pool2d_grad - pool3d_grad +- prod_grad +- repeat_interleave_grad - repeat_interleave_with_tensor_index_grad - rnn_grad - rrelu_grad - set_value_with_tensor_grad +- slice_double_grad +- slice_grad +- split_grad +- split_with_num_grad - strided_slice_grad +- sum_double_grad +- sum_grad - swish_grad - sync_batch_norm_grad - trans_layout_grad +- transpose_double_grad +- transpose_grad +- tril_grad +- triu_grad - unpool_grad diff --git a/paddle/phi/ops/yaml/legacy/ops_exclude.yaml b/paddle/phi/ops/yaml/legacy/ops_exclude.yaml index 23dc1b22b8f6f..de86827d366bf 100644 --- a/paddle/phi/ops/yaml/legacy/ops_exclude.yaml +++ b/paddle/phi/ops/yaml/legacy/ops_exclude.yaml @@ -8,6 +8,8 @@ - amax - amin - any +- assign_out_ +- assign_value_ - c_allgather - c_allreduce_max - c_allreduce_min @@ -19,6 +21,7 @@ - c_reduce_sum - c_sync_calc_stream - c_sync_comm_stream +- cast - conv2d_transpose - conv2d_transpose_bias - copy_to @@ -28,21 +31,31 @@ - dequantize_log - disable_check_model_nan_inf - dropout +- empty - empty_like - enable_check_model_nan_inf +- exponential_ - eye - frobenius_norm +- full - full_ - full_batch_size_like - full_like +- full_with_tensor - fused_batch_norm_act - fused_bn_add_activation - fused_multi_transformer - fused_softmax_mask - fused_softmax_mask_upper_triangle +- gaussian - hsigmoid_loss +- increment +- linspace +- logspace +- logsumexp - matrix_rank - matrix_rank_tol +- max - mean - memcpy_d2h - memcpy_h2d @@ -51,17 +64,34 @@ - one_hot - ones - ones_like +- pad - pool2d - pool3d +- prod +- randint +- randperm - read_file +- repeat_interleave - repeat_interleave_with_tensor_index - rnn - rrelu +- sequence_mask - set_value_with_tensor +- slice +- split +- split_with_num - strided_slice +- sum - swish - sync_batch_norm_ - trans_layout +- transpose +- tril +- tril_indices +- triu +- triu_indices +- truncated_gaussian_random +- uniform - unpool - zeros - zeros_like diff --git a/paddle/phi/ops/yaml/ops.yaml b/paddle/phi/ops/yaml/ops.yaml index 76a3357653c41..e26d187d90cd3 100755 --- a/paddle/phi/ops/yaml/ops.yaml +++ b/paddle/phi/ops/yaml/ops.yaml @@ -308,6 +308,33 @@ backward : asinh_grad interfaces : paddle::dialect::InferSymbolicShapeInterface +- op : assign_out_ + args : (Tensor x, Tensor output) + output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : assign + param : [x] + inplace : (output -> out) + backward : assign_out__grad + traits : pir::SideEffectTrait + +- op : assign_value_ + args : (Tensor output, int[] shape, DataType dtype, Scalar[] values, Place place = {}) + output : Tensor(out) + inplace: (output -> out) + infer_meta : + func : AssignValueInferMeta + param : [shape, dtype] + kernel : + func : assign_value + param : [shape, dtype, values] + data_type : dtype + backend : place > output + interfaces : 
paddle::dialect::InferSymbolicShapeInterface + - op : atan args : (Tensor x) output : Tensor(out) @@ -642,6 +669,20 @@ func : c_sync_comm_stream inplace : (x -> out) +- op : cast + args : (Tensor x, DataType dtype) + output : Tensor(out) + infer_meta : + func : CastInferMeta + spmd_rule : CastInferSpmd + kernel : + func : cast + param : [x, dtype] + data_type : x + inplace: (x -> out) + backward : cast_grad + interfaces : paddle::dialect::InferSymbolicShapeInterface + - op : ceil args : (Tensor x) output : Tensor(out) @@ -1279,6 +1320,19 @@ inplace : (x -> out) backward : elu_grad +- op : empty + args : (IntArray shape, DataType dtype=DataType::FLOAT32, Place place=CPUPlace()) + output: Tensor(out) + infer_meta : + func : CreateInferMeta + param : [shape, dtype] + kernel : + func : empty + param : [shape, dtype] + data_type : dtype + backend : place + interfaces : paddle::dialect::InferSymbolicShapeInterface + - op : empty_like args : (Tensor x, DataType dtype = DataType::UNDEFINED, Place place = {}) output: Tensor(out) @@ -1379,6 +1433,18 @@ backward : expm1_grad interfaces : paddle::dialect::InferSymbolicShapeInterface +- op : exponential_ + args : (Tensor x, float lam) + output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : exponential + inplace : (x -> out) + backward : exponential__grad + interfaces : paddle::dialect::InferSymbolicShapeInterface + - op : eye args : (Scalar num_rows, Scalar num_columns, DataType dtype=DataType::FLOAT32, Place place={}) output : Tensor(out) @@ -1768,6 +1834,17 @@ data_transform : skip_transform : x +- op : full_with_tensor + args : (Tensor value, IntArray shape, DataType dtype=DataType::FLOAT32) + output: Tensor(out) + infer_meta : + func : FullWithTensorInferMeta + param : [shape, dtype] + kernel : + func : full_with_tensor + data_type : dtype + interfaces : paddle::dialect::InferSymbolicShapeInterface + - op : fused_batch_norm_act args : (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str act_type) output : Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) @@ -1873,6 +1950,20 @@ func : gather_tree data_type : ids +- op : gaussian + args : (IntArray shape, float mean, float std, int seed, DataType dtype, Place place={}) + output: Tensor(out) + infer_meta : + func : GaussianInferMeta + param : [shape, mean, std, seed, dtype] + kernel : + func : gaussian + param : [shape, mean, std, seed, dtype] + data_type : dtype + backend : place + interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : pir::SideEffectTrait + - op : gaussian_inplace args: (Tensor x, float mean=0, float std=1.0, int seed=0) output: Tensor(out) @@ -2110,6 +2201,16 @@ backward : imag_grad interfaces : paddle::dialect::InferSymbolicShapeInterface +- op : increment + args : (Tensor x, float value = 1.0) + output : Tensor(out) + infer_meta : + func : IncrementInferMeta + kernel : + func : increment + inplace : (x -> out) + interfaces : paddle::dialect::InferSymbolicShapeInterface + - op : index_add args : (Tensor x, Tensor index, Tensor add_value, int axis = 0) output : Tensor(out) @@ -2371,6 +2472,19 @@ skip_transform : out_size, size_tensor, scale_tensor interfaces : paddle::dialect::InferSymbolicShapeInterface +- op : linspace + args : (Tensor start, Tensor stop, Tensor number, DataType dtype, Place place) + output : Tensor(out) + infer_meta : + func : LinspaceInferMeta + param: [start, stop, number, dtype] + 
kernel : + func : linspace + param: [start, stop, number, dtype] + data_type : dtype + backend : place + interfaces : paddle::dialect::InferSymbolicShapeInterface + - op : llm_int8_linear args : (Tensor x, Tensor weight, Tensor bias, Tensor weight_scale, float threshold=6.0) output : Tensor(out) @@ -2521,6 +2635,29 @@ func : logsigmoid backward : logsigmoid_grad +- op : logspace + args : (Tensor start, Tensor stop, Tensor num, Tensor base, DataType dtype, Place place={}) + output : Tensor(out) + infer_meta: + func : LogspaceInferMeta + param : [start, stop, num, base, dtype] + kernel : + func : logspace + param : [start, stop, num, base, dtype] + data_type : dtype + backend : place + interfaces : paddle::dialect::InferSymbolicShapeInterface + +- op : logsumexp + args : (Tensor x, int64_t[] axis, bool keepdim, bool reduce_all) + output : Tensor(out) + infer_meta : + func : LogsumexpInferMeta + kernel : + func : logsumexp + backward : logsumexp_grad + interfaces : paddle::dialect::InferSymbolicShapeInterface + - op : lstsq args : (Tensor x, Tensor y, Scalar rcond=0.0f, str driver="gels") output : Tensor(solution), Tensor(residuals), Tensor(rank), Tensor(singular_values) @@ -2619,6 +2756,17 @@ kernel : func : matrix_rank_tol +- op : max + args : (Tensor x, IntArray axis={}, bool keepdim=false) + output : Tensor(out) + infer_meta : + func : ReduceIntArrayAxisInferMeta + spmd_rule: ReductionMaxInferSpmdDynamic + kernel : + func : max + backward : max_grad + interfaces : paddle::dialect::InferSymbolicShapeInterface + - op : max_pool2d_with_index args : (Tensor x, int[] kernel_size, int[] strides= {1, 1}, int[] paddings = {0, 0}, bool global_pooling = false, bool adaptive = false) output : Tensor(out), Tensor(mask) @@ -2962,6 +3110,16 @@ func : p_norm backward : p_norm_grad +- op : pad + args : (Tensor x, int[] paddings, Scalar pad_value) + output : Tensor + infer_meta : + func : PadInferMeta + kernel : + func : pad + backward : pad_grad + interfaces : paddle::dialect::InferSymbolicShapeInterface + - op : pad3d args : (Tensor x, IntArray paddings, str mode = "constant", float pad_value = 0.0, str data_format = "NCDHW") output : Tensor(out) @@ -3087,6 +3245,16 @@ func : prior_box data_type : input +- op : prod + args : (Tensor x, IntArray dims, bool keep_dim, bool reduce_all) + output : Tensor + infer_meta : + func : ReduceIntArrayAxisInferMetaBase + kernel : + func : prod + backward : prod_grad + interfaces : paddle::dialect::InferSymbolicShapeInterface + - op : psroi_pool args : (Tensor x, Tensor boxes, Tensor boxes_num, int pooled_height=1, int pooled_width=1, int output_channels=1, float spatial_scale=1.0) output : Tensor @@ -3132,6 +3300,20 @@ inplace : (param -> param_out), (beta1_pow -> beta1_pow_out), (beta2_pow -> beta2_pow_out), (rho -> rho_out), (moment1 -> moment1_out), (moment2 -> moment2_out), (master_param->master_param_out) traits : pir::SideEffectTrait +- op : randint + args : (int low, int high, IntArray shape, DataType dtype=DataType::INT64, Place place={}) + output : Tensor(out) + infer_meta : + func : RandintInferMeta + param : [low, high, shape, dtype] + kernel : + func : randint + param : [low, high, shape, dtype] + data_type : dtype + backend : place + interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : pir::SideEffectTrait + - op : random_routing args : (Tensor prob, Tensor topk_value, Tensor topk_idx) output : Tensor(out) @@ -3143,6 +3325,19 @@ inplace : (topk_idx -> out) traits : pir::SideEffectTrait +- op : randperm + args : (int n, DataType dtype, 
Place place={}) + output : Tensor(out) + infer_meta : + func : RandpermInferMeta + param : [n, dtype] + kernel : + func : randperm + param : [n, dtype] + data_type : dtype + backend : place + traits : pir::SideEffectTrait + - op : rank_attention args : (Tensor x, Tensor rank_offset, Tensor rank_param, int max_rank = 3, int max_size = 0) output : Tensor(input_help), Tensor(out), Tensor(ins_rank) @@ -3241,6 +3436,17 @@ inplace: (x -> out) backward : renorm_grad +- op : repeat_interleave + args : (Tensor x, int repeats, int axis) + output : Tensor(out) + infer_meta : + func : RepeatInterleaveInferMeta + kernel : + func : repeat_interleave + data_type : x + backward: repeat_interleave_grad + interfaces : paddle::dialect::InferSymbolicShapeInterface + - op : repeat_interleave_with_tensor_index args : (Tensor x, Tensor repeats, int axis) output : Tensor(out) @@ -3498,6 +3704,15 @@ optional: padding_data backward: sequence_conv_grad +- op : sequence_mask + args: (Tensor x, Scalar(int) max_len, DataType out_dtype) + output: Tensor(y) + infer_meta: + func: SequenceMaskScalarInferMeta + kernel: + func: sequence_mask_scalar + data_type : x + - op : set_value_with_tensor args : (Tensor x, Tensor values, IntArray starts, IntArray ends, IntArray steps, int64_t[] axes, int64_t[] decrease_axes, int64_t[] none_axes) output : Tensor(out) @@ -3621,6 +3836,17 @@ backward : sinh_grad interfaces : paddle::dialect::InferSymbolicShapeInterface +- op : slice + args : (Tensor input, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) + output : Tensor + infer_meta : + func : SliceRawInferMeta + spmd_rule : SliceInferSpmdDynamic + kernel : + func : slice + backward : slice_grad + interfaces : paddle::dialect::InferSymbolicShapeInterface + - op : slogdet args : (Tensor x) output : Tensor @@ -3693,6 +3919,27 @@ data_type : weight backward : spectral_norm_grad +- op : split + args : (Tensor x, IntArray sections, Scalar(int) axis) + output : Tensor[]{sections.size()} + infer_meta : + func : SplitInferMeta + kernel : + func : split + backward : split_grad + interfaces : paddle::dialect::InferSymbolicShapeInterface + +- op : split_with_num + args : (Tensor x, int num, Scalar(int) axis) + output : Tensor[]{num} + infer_meta : + func : SplitWithNumInferMeta + spmd_rule : SplitWithNumInferSpmdDynamic + kernel : + func : split_with_num + backward : split_with_num_grad + interfaces : paddle::dialect::InferSymbolicShapeInterface + - op : sqrt args : (Tensor x) output : Tensor(out) @@ -3788,6 +4035,18 @@ func : strided_slice backward : strided_slice_grad +- op : sum + args : (Tensor x, IntArray axis={}, DataType dtype=DataType::UNDEFINED, bool keepdim=false) + output : Tensor(out) + infer_meta : + func : SumInferMeta + spmd_rule : ReductionSumInferSpmdDynamic + kernel : + func : sum + data_type : x + backward : sum_grad + interfaces : paddle::dialect::InferSymbolicShapeInterface + - op : svd args : (Tensor x, bool full_matrices = false) output : Tensor(u), Tensor(s), Tensor(vh) @@ -3946,6 +4205,18 @@ func : transpose backward : trans_layout_grad +- op : transpose + args : (Tensor x, int[] perm) + output : Tensor(out) + infer_meta : + func : TransposeInferMeta + spmd_rule: TransposeInferSpmd + kernel : + func : transpose + inplace : (x -> out) + backward : transpose_grad + interfaces : paddle::dialect::InferSymbolicShapeInterface + - op : triangular_solve args : (Tensor x, Tensor y, bool upper=true, bool transpose=false, bool unitriangular=false) output : Tensor @@ -3956,6 +4227,30 @@ 
data_type : x backward : triangular_solve_grad +- op : tril + args : (Tensor x, int diagonal) + output : Tensor(out) + infer_meta : + func : TrilInferMeta + kernel : + func : tril + inplace: (x -> out) + backward : tril_grad + interfaces : paddle::dialect::InferSymbolicShapeInterface + +- op : tril_indices + args : (int rows, int cols, int offset, DataType dtype, Place place={}) + output : Tensor(out) + infer_meta : + func : TrilIndicesInferMeta + param : [rows, cols, offset, dtype] + kernel : + func : tril_indices + param : [rows, cols, offset, dtype] + data_type : dtype + backend : place + interfaces : paddle::dialect::InferSymbolicShapeInterface + - op : trilinear_interp args : (Tensor x, Tensor out_size, Tensor[] size_tensor, Tensor scale_tensor, str data_format="NCHW", int out_d=0, int out_h=0, int out_w=0, float[] scale={}, str interp_method="bilinear", bool align_corners=true, int align_mode=1) output : Tensor(output) @@ -3970,6 +4265,31 @@ skip_transform : out_size, size_tensor, scale_tensor interfaces : paddle::dialect::InferSymbolicShapeInterface +- op : triu + args : (Tensor x, int diagonal) + output : Tensor(out) + infer_meta : + func : TriuInferMeta + spmd_rule : TriuInferSpmd + kernel : + func : triu + inplace: (x -> out) + backward : triu_grad + interfaces : paddle::dialect::InferSymbolicShapeInterface + +- op : triu_indices + args : (int row, int col, int offset, DataType dtype, Place place={}) + output : Tensor(out) + infer_meta : + func : TriuIndicesInferMeta + param : [row, col, offset, dtype] + kernel : + func : triu_indices + param : [row, col, offset, dtype] + data_type : dtype + backend : place + interfaces : paddle::dialect::InferSymbolicShapeInterface + - op : trunc args : (Tensor input) output : Tensor(out) @@ -3981,6 +4301,20 @@ backward : trunc_grad interfaces : paddle::dialect::InferSymbolicShapeInterface +# python API: paddle.nn.initializer.TruncatedNormal +- op : truncated_gaussian_random + args : (int[] shape, float mean, float std, int seed, float a, float b, DataType dtype=DataType::FLOAT32, Place place={}) + output : Tensor(out) + infer_meta : + func : TruncatedGaussianRandomInferMeta + param : [shape, mean, std, seed, a, b, dtype] + kernel : + func : truncated_gaussian_random + param : [shape, mean, std, seed, a, b, dtype] + backend : place + data_type : dtype + traits : pir::SideEffectTrait + - op : unbind args : (Tensor input, int axis = 0) output : Tensor[] {axis<0 ? 
input.dims()[input.dims().size()+axis]:input.dims()[axis]} @@ -4001,6 +4335,20 @@ func : unfold backward : unfold_grad +- op : uniform + args : (IntArray shape, DataType dtype, Scalar min, Scalar max, int seed, Place place={}) + output : Tensor(out) + infer_meta : + func : UniformRandomInferMeta + param: [shape, dtype] + kernel : + func : uniform + param: [shape, dtype, min, max, seed] + data_type : dtype + backend : place + interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : pir::SideEffectTrait + - op : uniform_inplace args: (Tensor x, float min = -1.0, float max = 1.0, int seed = 0, int diag_num = 0, int diag_step = 0, float diag_val = 1.0) output: Tensor(out) diff --git a/paddle/pir/include/pass/pass.h b/paddle/pir/include/pass/pass.h index 4f2e317522418..b7d03f63f1ffa 100644 --- a/paddle/pir/include/pass/pass.h +++ b/paddle/pir/include/pass/pass.h @@ -167,9 +167,9 @@ class IR_API Pass { Set("__match_count__", new int64_t{match_count}); } - void AddStatistics(int64_t match_count, int64_t all_count) { - Set("__match_count__", new int64_t{match_count}); - Set("__all_count__", new int64_t{all_count}); + void AddStatistics(int64_t match_count_1, int64_t match_count_2) { + Set("__match_count_1__", new int64_t{match_count_1}); + Set("__match_count_2__", new int64_t{match_count_2}); } void AddStatistics(const std::string& custom_log) { diff --git a/paddle/pir/src/core/op_info_impl.cc b/paddle/pir/src/core/op_info_impl.cc index 40bb94e4c02ec..e32f6037ac04b 100644 --- a/paddle/pir/src/core/op_info_impl.cc +++ b/paddle/pir/src/core/op_info_impl.cc @@ -72,28 +72,34 @@ OpInfo OpInfoImpl::Create(Dialect *dialect, << " interfaces, " << traits_num << " traits, " << attributes_num << " attributes."; size_t base_size = sizeof(TypeId) * traits_num + sizeof(OpInfoImpl); - char *base_ptr = static_cast<char *>(::operator new(base_size)); + std::unique_ptr<char[]> base_ptr(new char[base_size]); VLOG(10) << "Malloc " << base_size << " Bytes at " - << static_cast<void *>(base_ptr); + << static_cast<void *>(base_ptr.get()); + + char *raw_base_ptr = base_ptr.get(); if (traits_num > 0) { - auto p_first_trait = reinterpret_cast<TypeId *>(base_ptr); - memcpy(base_ptr, trait_set.data(), sizeof(TypeId) * traits_num); + auto p_first_trait = reinterpret_cast<TypeId *>(raw_base_ptr); + memcpy(raw_base_ptr, trait_set.data(), sizeof(TypeId) * traits_num); std::sort(p_first_trait, p_first_trait + traits_num); - base_ptr += traits_num * sizeof(TypeId); + raw_base_ptr += traits_num * sizeof(TypeId); } + // Construct OpInfoImpl. 
- VLOG(10) << "Construct OpInfoImpl at " << reinterpret_cast<void *>(base_ptr) - << " ......"; - OpInfo op_info = OpInfo(new (base_ptr) OpInfoImpl(std::move(interface_set), - dialect, - op_id, - op_name, - traits_num, - attributes_num, - attributes_name, - verify_sig, - verify_region)); - return op_info; + VLOG(10) << "Construct OpInfoImpl at " + << reinterpret_cast<void *>(raw_base_ptr) << " ......"; + OpInfoImpl *impl = new (raw_base_ptr) OpInfoImpl(std::move(interface_set), + dialect, + op_id, + op_name, + traits_num, + attributes_num, + attributes_name, + verify_sig, + verify_region); + + // Release the unique_ptr ownership after successful construction + base_ptr.release(); + return OpInfo(impl); } void OpInfoImpl::Destroy(OpInfo info) { if (info.impl_) { diff --git a/paddle/pir/src/dialect/shape/utils/dim_expr_util.cc b/paddle/pir/src/dialect/shape/utils/dim_expr_util.cc index b4cc0585438ee..c7b5e21a2e01b 100644 --- a/paddle/pir/src/dialect/shape/utils/dim_expr_util.cc +++ b/paddle/pir/src/dialect/shape/utils/dim_expr_util.cc @@ -185,6 +185,10 @@ struct IsListLhsBeforeListRhsStruct { static bool Call(const Op& lhs, const Op& rhs) { const auto& [lhs_operands] = lhs; const auto& [rhs_operands] = rhs; + if (lhs_operands->empty() || rhs_operands->empty()) { + // Handle the error case by throwing an exception. + throw std::runtime_error("Operands are uninitialized."); + } if (lhs_operands->size() < rhs_operands->size()) { return true; } diff --git a/paddle/pir/src/pass/print_statistics.cc b/paddle/pir/src/pass/print_statistics.cc index 9d26374592284..448a00233efd9 100644 --- a/paddle/pir/src/pass/print_statistics.cc +++ b/paddle/pir/src/pass/print_statistics.cc @@ -37,20 +37,14 @@ class PrintStatistics : public PassInstrumentation { } void RunAfterPass(Pass *pass, Operation *op) override { - if (pass->Has("__match_count__") && pass->Has("__all_count__")) { - auto match_count = pass->Get<int64_t>("__match_count__"); - auto all_count = pass->Get<int64_t>("__all_count__"); - PADDLE_ENFORCE_LE(match_count, - all_count, - phi::errors::InvalidArgument( - "match_count: %d should smaller than all_count: %d", - match_count, - all_count)); - if (match_count > 0) { - LOG(INFO) << "--- detected [" << match_count << "/" << all_count + if (pass->Has("__match_count_1__") && pass->Has("__match_count_2__")) { + auto match_count_1 = pass->Get<int64_t>("__match_count_1__"); + auto match_count_2 = pass->Get<int64_t>("__match_count_2__"); + if (match_count_1 > 0 || match_count_2 > 0) { + LOG(INFO) << "--- detected [" << match_count_1 << ", " << match_count_2 << "] subgraphs!"; } - } else if (pass->Has("__match_count__") && !pass->Has("__all_count__")) { + } else if (pass->Has("__match_count__")) { auto match_count = pass->Get<int64_t>("__match_count__"); if (match_count > 0) { LOG(INFO) << "--- detected [" << match_count << "] subgraphs!"; diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index e126e41efcf65..3b730236b7a1d 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -577,6 +577,7 @@ if os.path.exists(cuh_file): os.environ.setdefault('runtime_include_dir', runtime_include_dir) + if is_compiled_with_cuda(): import os import platform @@ -608,6 +609,118 @@ cupti_dir_lib_path = package_dir + "/.." + "/nvidia/cuda_cupti/lib" set_flags({"FLAGS_cupti_dir": cupti_dir_lib_path}) + elif ( + platform.system() == 'Windows' + and platform.machine() in ('x86_64', 'AMD64') + and paddle.version.with_pip_cuda_libraries == 'ON' + ): + package_dir = os.path.dirname(os.path.abspath(__file__)) + win_cuda_bin_path = package_dir + "\\.."
+ "\\nvidia" + set_flags({"FLAGS_win_cuda_bin_dir": win_cuda_bin_path}) + + import sys + + if sys.platform == 'win32': + pfiles_path = os.getenv('ProgramFiles', 'C:\\Program Files') + py_dll_path = os.path.join(sys.exec_prefix, 'Library', 'bin') + th_dll_path = os.path.join(os.path.dirname(__file__), 'libs') + site_cuda_base_path = os.path.join( + os.path.dirname(__file__), '..', 'nvidia' + ) + site_cuda_list = [ + "cublas", + "cuda_nvrtc", + "cuda_runtime", + "cudnn", + "cufft", + "curand", + "cusolver", + "cusparse", + "nvjitlink", + ] + + if sys.exec_prefix != sys.base_exec_prefix: + base_py_dll_path = os.path.join( + sys.base_exec_prefix, 'Library', 'bin' + ) + else: + base_py_dll_path = '' + + dll_paths = list( + filter( + os.path.exists, [th_dll_path, py_dll_path, base_py_dll_path] + ) + ) + for site_cuda_package in site_cuda_list: + site_cuda_path = os.path.join( + site_cuda_base_path, site_cuda_package, 'bin' + ) + if os.path.exists(site_cuda_path): + dll_paths.append(site_cuda_path) + + import ctypes + + kernel32 = ctypes.WinDLL('kernel32.dll', use_last_error=True) + with_load_library_flags = hasattr(kernel32, 'AddDllDirectory') + prev_error_mode = kernel32.SetErrorMode(0x0001) + + kernel32.LoadLibraryW.restype = ctypes.c_void_p + if with_load_library_flags: + kernel32.LoadLibraryExW.restype = ctypes.c_void_p + + for dll_path in dll_paths: + os.add_dll_directory(dll_path) + + try: + ctypes.CDLL('vcruntime140.dll') + ctypes.CDLL('msvcp140.dll') + ctypes.CDLL('vcruntime140_1.dll') + except OSError: + print( + '''Microsoft Visual C++ Redistributable is not installed, this may lead to the DLL load failure. + It can be downloaded at https://aka.ms/vs/16/release/vc_redist.x64.exe''' + ) + import glob + + dlls = glob.glob(os.path.join(th_dll_path, '*.dll')) + for site_cuda_package in site_cuda_list: + site_cuda_path = os.path.join( + site_cuda_base_path, site_cuda_package, 'bin' + ) + if os.path.exists(site_cuda_path): + dlls.extend( + glob.glob(os.path.join(site_cuda_path, '*.dll')) + ) + # Not load 32 bit dlls in 64 bit python. + dlls = [dll for dll in dlls if '32_' not in dll] + path_patched = False + for dll in dlls: + is_loaded = False + print("dll:", dll) + if with_load_library_flags: + res = kernel32.LoadLibraryExW(dll, None, 0x00001100) + last_error = ctypes.get_last_error() + if res is None and last_error != 126: + err = ctypes.WinError(last_error) + err.strerror += f' Error loading "{dll}" or one of its dependencies.' + raise err + elif res is not None: + is_loaded = True + if not is_loaded: + if not path_patched: + prev_path = os.environ['PATH'] + os.environ['PATH'] = ';'.join( + dll_paths + [os.environ['PATH']] + ) + path_patched = True + res = kernel32.LoadLibraryW(dll) + if path_patched: + os.environ['PATH'] = prev_path + if res is None: + err = ctypes.WinError(ctypes.get_last_error()) + err.strerror += f' Error loading "{dll}" or one of its dependencies.' 
+ raise err + kernel32.SetErrorMode(prev_error_mode) disable_static() diff --git a/python/paddle/autograd/backward_utils.py b/python/paddle/autograd/backward_utils.py index 6b086176fcb9f..0649c3e19bf05 100644 --- a/python/paddle/autograd/backward_utils.py +++ b/python/paddle/autograd/backward_utils.py @@ -14,8 +14,10 @@ from __future__ import annotations import collections +import logging import warnings from collections.abc import Sequence +from functools import lru_cache from typing import Any from paddle import pir @@ -660,3 +662,8 @@ def get_split_op(value): if op.name() == "builtin.split": return op return None + + +@lru_cache +def warning_once(message: str): + logging.warning(message) diff --git a/python/paddle/autograd/ir_backward.py b/python/paddle/autograd/ir_backward.py index b208052ae9b11..83af23a53b0c6 100644 --- a/python/paddle/autograd/ir_backward.py +++ b/python/paddle/autograd/ir_backward.py @@ -45,6 +45,7 @@ return_map_value_list, some_in_set, update_no_grad_set_by_stopgradient, + warning_once, while_prune_check, ) from paddle.base.libpaddle.pir import ( @@ -906,7 +907,7 @@ def prepare_backward_prune_set(inputs, outputs): for item in get_real_op_inputs(used_op): outputs_fwd_set.add(item) else: - logging.warning("input provided by inputs has no use") + warning_once("input provided by inputs has no use") inputs_fwd_set = ValueSet() for output in outputs: diff --git a/python/paddle/decomposition/decomp.py b/python/paddle/decomposition/decomp.py index 71a1daa19f27f..059c7e6a516d6 100644 --- a/python/paddle/decomposition/decomp.py +++ b/python/paddle/decomposition/decomp.py @@ -855,6 +855,9 @@ def decompose_dist_program(pir_program): ops = pir_program.global_block().ops for op in ops: bwd_op_name = op.name() + # TODO(CZ): to be removed + if bwd_op_name in ["pd_op.mean_grad", "pd_op.concat_grad"]: + continue if has_decomp_vjp(op): pir.set_insertion_point(op) orig_outs = op.results() diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py index 08276ef050ce8..7c2439a059a34 100644 --- a/python/paddle/distributed/auto_parallel/static/engine.py +++ b/python/paddle/distributed/auto_parallel/static/engine.py @@ -283,6 +283,15 @@ def _prepare_data_spec_from_dataloader(self, dataloader): inputs_spec = [] labels_spec = [] data = next(iter(dataloader)) + if hasattr(dataloader, "batch_sampler"): + batch_sampler = dataloader.batch_sampler + else: + batch_sampler = dataloader._dataloader.batch_sampler + if isinstance(batch_sampler, paddle.io.DistributedBatchSampler): + # Getting data directly from the DataLoader iterator may affect the data generation + # randomness of BatchSampler when `shuffle=True`, which may cause the data fed in + # dynamic mode to differ from that in to_static mode. + batch_sampler.epoch -= 1 if isinstance(data, dict): data = tuple(data.values()) if len(data) != 2: diff --git a/python/paddle/distributed/communication/stream/broadcast.py b/python/paddle/distributed/communication/stream/broadcast.py index 751a670a119ea..e1a6513635555 100644 --- a/python/paddle/distributed/communication/stream/broadcast.py +++ b/python/paddle/distributed/communication/stream/broadcast.py @@ -12,13 +12,15 @@ # See the License for the specific language governing permissions and
-from paddle import framework +from paddle import _C_ops, framework from paddle.base import data_feeder from paddle.distributed.communication.group import ( _get_global_group, _get_or_throw_group_rank, _warn_cur_rank_not_in_group, ) +from paddle.distributed.communication.reduce import _to_inplace_op +from paddle.framework import in_pir_mode def _broadcast_in_dygraph( @@ -59,6 +61,11 @@ def _broadcast_in_static_mode( helper = framework.LayerHelper(op_type, **locals()) ring_id = 0 if group is None else group.id + if in_pir_mode(): + op_type = _to_inplace_op(op_type) + getattr(_C_ops, op_type)(tensor, ring_id, src_rank_in_group, sync_op) + return + helper.append_op( type=op_type, inputs={'X': [tensor]}, diff --git a/python/paddle/distributed/passes/allreduce_matmul_grad_overlapping.py b/python/paddle/distributed/passes/allreduce_matmul_grad_overlapping.py index fb68f2ff0246b..77affd4cd9c1e 100644 --- a/python/paddle/distributed/passes/allreduce_matmul_grad_overlapping.py +++ b/python/paddle/distributed/passes/allreduce_matmul_grad_overlapping.py @@ -117,10 +117,6 @@ def _split_matmul_grad_and_multi_streaming_allreduce( block, matmul_grad_id, self.dist_context, self.op_namescope ) - allreduce_op.dist_attr.execution_stream = ( - AutoParallelStreamType.MP_STREAM.value - ) - # NOTE(Ruibiao): Required OP scheduling order: matmul(dOut, Y^T) -> c_allreduce_sum(dX) -> matmul(X^T, dOut). # c_allreduce_sum(dX) and matmul(X^T, dOut) cannot be swapped. Otherwise, after buffer_shared_inplace_pass # adding share_buffer OP before c_allreduce_sum, c_allreduce_sum will synchronous with comp-stream, and then @@ -128,6 +124,9 @@ def _split_matmul_grad_and_multi_streaming_allreduce( allreduce_op_dist_attr = ( self.dist_context.get_op_dist_attr_for_program(allreduce_op) ) + allreduce_op_dist_attr.execution_stream = ( + AutoParallelStreamType.MP_STREAM.value + ) allreduce_op_inputs = allreduce_op.desc.input_names() allreduce_op_outputs = allreduce_op.desc.output_names() diff --git a/python/paddle/jit/sot/opcode_translator/executor/function_graph.py b/python/paddle/jit/sot/opcode_translator/executor/function_graph.py index 23ac3ce07a477..99ea75ebbcd48 100644 --- a/python/paddle/jit/sot/opcode_translator/executor/function_graph.py +++ b/python/paddle/jit/sot/opcode_translator/executor/function_graph.py @@ -64,7 +64,7 @@ SideEffectRestorer, SideEffects, ) -from .tracker import BuiltinTracker, DummyTracker +from .tracker import BuiltinTracker, DummyTracker, SymbolicOperationTracker from .variables import ( ConstantVariable, DictVariable, @@ -580,13 +580,20 @@ def message_handler(*args, **kwargs): try: return inner_error_default_handler( self.symbolic_call, message_handler - )(ast_infer_meta, compute_fn, static_function, *args, **kwargs) + )( + ast_infer_meta, + compute_fn, + static_function, + False, + *args, + **kwargs, + ) except Exception as e: log(3, f"[call AST] {e}") return None def symbolic_call( - self, infer_meta_fn, compute_fn, func, is_symbolic_int, *args, **kwargs + self, infer_meta_fn, compute_fn, func, is_symbolic_var, *args, **kwargs ): """ Using infer_meta_fn and compute_fn convert func to symbolic function. 
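For orientation: `symbolic_call` now threads an `is_symbolic_var` flag through the tracing path. The AST fallback above always passes `False` (its outputs are plain tensors), while the symbolic-int dispatches registered later in this patch reach it with `True` and with `func` naming the producing magic method (the inline comment below notes that `compute_fn` should be `call_method` in that case). A hedged sketch of the two call shapes, inferred from this patch rather than taken verbatim from the Paddle API:

    # Assumed call shapes (illustrative; argument names follow this patch).
    graph.symbolic_call(ast_infer_meta, compute_fn, static_function,
                        False, *args, **kwargs)      # tensor-valued outputs
    graph.symbolic_call(infer_meta_fn, call_method, "__add__",
                        True, sym_var, other_var)    # symbolic-int outputs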
@@ -612,14 +619,21 @@ def symbolic_call( log(3, f" inputs : {inputs_symbols}", "\n") - var_cls = SymbolicVariable if is_symbolic_int else TensorVariable + if is_symbolic_var: + var_cls = SymbolicVariable + tracker = SymbolicOperationTracker( + list(args) + list(kwargs.values()), func + ) + else: + var_cls = TensorVariable + tracker = DummyTracker(list(args) + list(kwargs.values())) outputs = map_if( out_metas, pred=lambda x: isinstance(x, MetaInfo), true_fn=lambda x: var_cls( x, self, - tracker=DummyTracker(list(args) + list(kwargs.values())), + tracker=tracker, ), false_fn=lambda x: x, ) @@ -649,9 +663,15 @@ def symbolic_call( stmt_stacks, ) # symbolic only contain symbols. self._put_inner(outputs) - return VariableFactory.from_value( - outputs, self, DummyTracker(list(args) + list(kwargs.values())) - ) + if is_symbolic_var: + # compute_fn should be call_method + tracker = SymbolicOperationTracker( + list(args) + list(kwargs.values()), func + ) + else: + tracker = DummyTracker(list(args) + list(kwargs.values())) + + return VariableFactory.from_value(outputs, self, tracker) else: return ConstantVariable.wrap_literal(None, self) diff --git a/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py b/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py index 72b26c2e8b015..70870913a6a02 100644 --- a/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py +++ b/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py @@ -88,6 +88,7 @@ NullVariable, SequenceIterVariable, SliceVariable, + SymbolicVariable, TensorVariable, TupleVariable, UserDefinedFunctionVariable, @@ -204,7 +205,7 @@ def inner(self: OpcodeExecutorBase, instr: Instruction): fn, graph=self._graph, tracker=DanglingTracker() )(res) - assert isinstance(res, ConstantVariable) + assert isinstance(res, (ConstantVariable, SymbolicVariable)) is_jump = res.get_py_value() assert isinstance(is_jump, bool) if is_jump: diff --git a/python/paddle/jit/sot/opcode_translator/executor/tracker.py b/python/paddle/jit/sot/opcode_translator/executor/tracker.py index 1f9a378a4f5f2..41ce17dba7cbc 100644 --- a/python/paddle/jit/sot/opcode_translator/executor/tracker.py +++ b/python/paddle/jit/sot/opcode_translator/executor/tracker.py @@ -127,6 +127,31 @@ def need_guard(self) -> bool: return False +class SymbolicOperationTracker(DummyTracker): + """ + SymbolicOperationTracker is a subclass of Tracker that specifically tracks variables that cannot be reproduced from the frame. + Such variables are mostly generated by complex operations on symbolic variables. + + Args: + inputs (list[VariableBase]): The input variables associated with the generated variables. + method_name (str): The name of the method that produced the tracked variables. + """ + + def __init__(self, inputs: Sequence[VariableBase], method_name: str): + super().__init__(inputs) + self.method_name = method_name + + def gen_instructions(self, codegen: PyCodeGen): + raise InnerError("SymbolicOperationTracker has no instructions") + + def trace_value_from_frame(self): + raise InnerError( + "SymbolicOperationTracker can't trace value from frame" + ) + + def __repr__(self) -> str: + return f"SymbolicOperationTracker(num_inputs={len(self.inputs)})" + + class DanglingTracker(Tracker): """ DanglingTracker is a subclass of Tracker that specifically tracks variables that are not in the frame.
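A `SymbolicOperationTracker` deliberately refuses to generate instructions or trace a value from the frame: the only way to recover the concrete Python value it stands for is to replay the recorded method on its inputs' values, which is exactly what the `SymbolicVariable.get_py_value` change further below does. A minimal runnable sketch of that replay, using toy stand-ins rather than the actual SOT classes:

    # Sketch only: replay "method_name" on the inputs' concrete values.
    class Const:
        def __init__(self, value):
            self.value = value

        def get_py_value(self):
            return self.value

    def replay(inputs, method_name):
        first, *rest = [v.get_py_value() for v in inputs]
        return getattr(first, method_name)(*rest)

    assert replay([Const(3), Const(4)], "__add__") == 7  # i.e. 3 + 4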
diff --git a/python/paddle/jit/sot/opcode_translator/executor/variable_dispatch.py b/python/paddle/jit/sot/opcode_translator/executor/variable_dispatch.py index 719c3be066ead..6dc1b0bbec0d5 100644 --- a/python/paddle/jit/sot/opcode_translator/executor/variable_dispatch.py +++ b/python/paddle/jit/sot/opcode_translator/executor/variable_dispatch.py @@ -576,12 +576,12 @@ def dispatch_reversed(var: ContainerVariable): # bool Dispatcher.register( bool, - ("ContainerVariable",), + ("ContainerVariable | SymbolicVariable",), lambda var: var.bool(), ) Dispatcher.register( operator.truth, - ("ConstantVariable",), + ("ConstantVariable | SymbolicVariable",), lambda var: var.bool(), ) @@ -941,19 +941,6 @@ def is_not_func(var: VariableBase, other: VariableBase): magic_method.name, ), ) - Dispatcher.register( - binary_fn, - ( - "SymbolicVariable", - "ConstantVariable | SymbolicVariable", - ), - partial( - lambda magic_name, var, other: var.graph.call_symbolic_method( - magic_name, var, other - ), - magic_method.name, - ), - ) else: # skip __mod__ for str and TensorVariable if magic_method.name == "__rmod__": @@ -983,17 +970,39 @@ def tensor_mod_dispatcher( magic_method.name, ), ) +# Symbolic +for binary_fn in BINARY_OPS: + for magic_method in magic_method_builtin_dispatch(binary_fn): + # skip all inplace magic method names; we will dispatch them to the non-inplace + # magic methods + if magic_method.is_inplace: + continue - Dispatcher.register( - binary_fn, - ("ConstantVariable", "SymbolicVariable"), - partial( - lambda magic_name, var, other: var.graph.call_symbolic_method( - magic_name, var, other - ), - magic_method.name, + if not magic_method.is_reverse: + Dispatcher.register( + binary_fn, + ( + "SymbolicVariable", + "ConstantVariable | SymbolicVariable", + ), + partial( + lambda magic_name, var, other: var.graph.call_symbolic_method( + magic_name, var, other ), - ) + magic_method.name, + ), + ) + else: + Dispatcher.register( + binary_fn, + ("ConstantVariable", "SymbolicVariable"), + partial( + lambda magic_name, var, other: var.graph.call_symbolic_method( + magic_name, var, other + ), + magic_method.name, + ), + ) # Register dispatch for NumpyVariable: fallback ! for unary_fn in UNARY_OPS: diff --git a/python/paddle/jit/sot/opcode_translator/executor/variables/basic.py b/python/paddle/jit/sot/opcode_translator/executor/variables/basic.py index 259d8c1a090d7..965b7edba28ed 100644 --- a/python/paddle/jit/sot/opcode_translator/executor/variables/basic.py +++ b/python/paddle/jit/sot/opcode_translator/executor/variables/basic.py @@ -53,6 +53,7 @@ GetAttrTracker, GetIterTracker, GlobalTracker, + SymbolicOperationTracker, Tracker, ) from .base import VariableBase, VariableFactory @@ -618,16 +619,35 @@ def __init__( self.meta = MetaInfo( [], paddle.int64, True, self.var_name, False, None, None ) + self.need_guard_value = False def get_py_value(self, allow_tensor=False): + self.need_guard_value = True + if self.value is None: + assert isinstance( + self.tracker, SymbolicOperationTracker + ), f"self.value is None, but tracker is not SymbolicOperationTracker.
tracker: {self.tracker}" + inputs = self.tracker.inputs + assert len(inputs) >= 1 + other_inputs_value = [x.get_py_value() for x in inputs[1:]] + self.value = getattr( + inputs[0].get_py_value(), self.tracker.method_name + )(*other_inputs_value) return self.value def get_py_type(self): - return int + # TODO(zrr1999): should not need to use the value to get the type + return super().get_py_type() def get_symbol(self) -> Symbol: return Symbol(self.var_name) + def __bool__(self) -> bool: + return bool(self.get_py_value()) + + def bool(self): + return ConstantVariable(bool(self), self.graph, DummyTracker([self])) + @property def out_var_name(self): return f"{self.graph.OUT_VAR_PREFIX}{self.var_name}" @@ -651,10 +671,11 @@ def make_stringify_guard(self) -> list[StringifyExpression]: symbolic_input = symbolic_inputs[frame_value_tracer.inlined_expr] symbolic_input.setdefault(self.value, 0) symbolic_input[self.value] += 1 - + if self.need_guard_value: + return super().make_stringify_guard() return [ StringifyExpression( - "isinstance({}, int)", + f"id(type({{}})) == {id(self.get_py_type())}", [frame_value_tracer], union_free_vars(frame_value_tracer.free_vars), ) diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 7987a02af1cea..097f62f594864 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -463,6 +463,22 @@ def uniform_random_batch_size_like( >>> print(out_2.shape) [2, 3] """ + if in_dynamic_or_pir_mode(): + dtype = convert_np_dtype_to_dtype_(dtype) + return _C_ops.uniform_random_batch_size_like( + input, + shape, + input_dim_idx, + output_dim_idx, + min, + max, + seed, + 0, + 0, + 1.0, + dtype, + ) + check_variable_and_dtype( input, 'Input', diff --git a/python/setup.py.in b/python/setup.py.in index ad4cdf566609e..67d23a089aa37 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -425,7 +425,8 @@ write_distributed_training_mode_py(filename='@PADDLE_BINARY_DIR@/python/paddle/i def get_paddle_extra_install_requirements(): # (Note risemeup1): Paddle will install the pypi cuda package provided by Nvidia, which includes the cuda runtime, cudnn, and cublas, thereby making the operation of 'pip install paddle' no longer dependent on the installation of cuda and cudnn.
if '@WITH_PIP_CUDA_LIBRARIES@' == 'ON': - PADDLE_CUDA_INSTALL_REQUIREMENTS = { + if platform.system() == 'Linux': + PADDLE_CUDA_INSTALL_REQUIREMENTS = { "V11": ( "nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | " @@ -453,6 +454,27 @@ def get_paddle_extra_install_requirements(): "nvidia-cuda-nvrtc-cu12==12.3.107; platform_system == 'Linux' and platform_machine == 'x86_64'" ), } + elif platform.system() == 'Windows': + PADDLE_CUDA_INSTALL_REQUIREMENTS = { + "V11": ( + "nvidia-cuda-runtime-cu11==11.8.89 | " + "nvidia-cudnn-cu11==8.9.4.19 | " + "nvidia-cublas-cu11==11.11.3.6 | " + "nvidia-cufft-cu11==10.9.0.58 | " + "nvidia-curand-cu11==10.3.0.86 | " + "nvidia-cusolver-cu11==11.4.1.48 | " + "nvidia-cusparse-cu11==11.7.5.86 " + ), + "V12": ( + "nvidia-cuda-runtime-cu12==12.3.101 | " + "nvidia-cudnn-cu12==9.0.0.312 | " + "nvidia-cublas-cu12==12.3.4.1 | " + "nvidia-cufft-cu12==11.2.1.3 | " + "nvidia-curand-cu12==10.3.5.147 | " + "nvidia-cusolver-cu12==11.6.1.9 | " + "nvidia-cusparse-cu12==12.3.1.170 " + ), + } try: output = subprocess.check_output(['nvcc', '--version']).decode('utf-8') version_line = [line for line in output.split('\n') if 'release' in line][0] @@ -654,7 +676,7 @@ if sys.version_info >= (3,8): continue setup_requires_tmp+=[setup_requires_i] setup_requires = setup_requires_tmp - if platform.system() == 'Linux' and platform.machine() == 'x86_64': + if '@WITH_GPU@' == 'ON' and platform.system() in ('Linux', 'Windows') and platform.machine() in ('x86_64', 'AMD64'): paddle_cuda_requires = get_paddle_extra_install_requirements() setup_requires += paddle_cuda_requires diff --git a/setup.py b/setup.py index a8e6850aa4ad5..aab6fe0bcfd82 100644 --- a/setup.py +++ b/setup.py @@ -955,34 +955,56 @@ def get_setup_requires(): def get_paddle_extra_install_requirements(): # (Note risemeup1): Paddle will install the pypi cuda package provided by Nvidia, which includes the cuda runtime, cudnn, and cublas, thereby making the operation of 'pip install paddle' no longer dependent on the installation of cuda and cudnn. 
if env_dict.get("WITH_PIP_CUDA_LIBRARIES") == "ON": - PADDLE_CUDA_INSTALL_REQUIREMENTS = { - "V11": ( - "nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nccl-cu11==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64'" - ), - "V12": ( - "nvidia-cuda-runtime-cu12==12.3.101; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-cupti-cu12==12.3.101; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cudnn-cu12==9.0.0.312; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cublas-cu12==12.3.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-nvrtc-cu12==12.3.107; platform_system == 'Linux' and platform_machine == 'x86_64'" - ), - } + if platform.system() == 'Linux': + PADDLE_CUDA_INSTALL_REQUIREMENTS = { + "V11": ( + "nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nccl-cu11==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64'" + ), + "V12": ( + "nvidia-cuda-runtime-cu12==12.3.101; platform_system == 'Linux' and platform_machine == 
'x86_64' | " + "nvidia-cuda-cupti-cu12==12.3.101; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cudnn-cu12==9.0.0.312; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cublas-cu12==12.3.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cuda-nvrtc-cu12==12.3.107; platform_system == 'Linux' and platform_machine == 'x86_64'" + ), + } + elif platform.system() == 'Windows': + PADDLE_CUDA_INSTALL_REQUIREMENTS = { + "V11": ( + "nvidia-cuda-runtime-cu11==11.8.89 | " + "nvidia-cudnn-cu11==8.9.4.19 | " + "nvidia-cublas-cu11==11.11.3.6 | " + "nvidia-cufft-cu11==10.9.0.58 | " + "nvidia-curand-cu11==10.3.0.86 | " + "nvidia-cusolver-cu11==11.4.1.48 | " + "nvidia-cusparse-cu11==11.7.5.86 " + ), + "V12": ( + "nvidia-cuda-runtime-cu12==12.3.101 | " + "nvidia-cudnn-cu12==9.0.0.312 | " + "nvidia-cublas-cu12==12.3.4.1 | " + "nvidia-cufft-cu12==11.2.1.3 | " + "nvidia-curand-cu12==10.3.5.147 | " + "nvidia-cusolver-cu12==11.6.1.9 | " + "nvidia-cusparse-cu12==12.3.1.170 " + ), + } try: output = subprocess.check_output(['nvcc', '--version']).decode( 'utf-8' @@ -1468,7 +1490,15 @@ def get_headers(): def get_setup_parameters(): # get setup_requires setup_requires = get_setup_requires() - if platform.system() == 'Linux' and platform.machine() == 'x86_64': + if ( + env_dict.get("WITH_GPU") == 'ON' + and platform.system() in ('Linux', 'Windows') + and platform.machine() + in ( + 'x86_64', + 'AMD64', + ) + ): paddle_cuda_requires = get_paddle_extra_install_requirements() setup_requires += paddle_cuda_requires diff --git a/test/auto_parallel/hybrid_strategy/semi_auto_llama_dataloader.py b/test/auto_parallel/hybrid_strategy/semi_auto_llama_dataloader.py new file mode 100644 index 0000000000000..53d5cc96face7 --- /dev/null +++ b/test/auto_parallel/hybrid_strategy/semi_auto_llama_dataloader.py @@ -0,0 +1,260 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from functools import reduce + +import numpy as np +from semi_auto_parallel_llama_model import ( + LlamaForCausalLMAuto, + LlamaPretrainingCriterionAuto, + get_mesh, +) + +import paddle +import paddle.distributed as dist +from paddle import LazyGuard +from paddle.io import BatchSampler, DataLoader, Dataset + + +class Config: + vocab_size = 32000 + hidden_size = 4096 + intermediate_size = 11008 + max_position_embeddings = 2048 + seq_length = 2048 + num_hidden_layers = 2 + num_attention_heads = 32 + num_key_value_heads = 32 + initializer_range = 0.02 + rms_norm_eps = 1e-6 + use_cache = True + use_flash_attention = False + sequence_parallel = False + rope = True + recompute = False + recompute_granularity = None + use_lazy_init = False + + +inputs = [] +labels = [] + +for i in range(100): + inputs.append( + np.random.uniform(low=0, high=32000, size=[Config().seq_length]).astype( + "int64" + ) + ) + labels.append( + (np.random.uniform(size=[Config().seq_length]) * 10).astype("int64") + ) + + +class RandomDataset(Dataset): + def __init__(self, seq_len, num_samples=100): + super().__init__() + self.seq_len = seq_len + self.num_samples = num_samples + + def __getitem__(self, index): + global inputs, labels + return inputs[index], labels[index] + + def __len__(self): + return self.num_samples + + +def create_optimizer(model, lr_scheduler): + decay_parameters = [ + p.name + for n, p in model.named_parameters() + if not any(nd in n for nd in ["bias", "norm"]) + ] + + def apply_decay_param_fun(x): + return x in decay_parameters + + # test global_clip in auto_parallel + if os.getenv("use_param_group") == "true": + param_group = {} + param_group["params"] = list(model.parameters()) + param_group["weight_decay"] = 0.01 + param_group["grad_clip"] = paddle.nn.ClipGradByGlobalNorm(1.0) + optimizer = paddle.optimizer.adamw.AdamW( + learning_rate=lr_scheduler, + apply_decay_param_fun=apply_decay_param_fun, + parameters=[param_group], + ) + else: + optimizer = paddle.optimizer.adamw.AdamW( + learning_rate=lr_scheduler, + apply_decay_param_fun=apply_decay_param_fun, + parameters=model.parameters(), + weight_decay=0.01, + grad_clip=paddle.nn.ClipGradByGlobalNorm(1.0), + ) + return optimizer + + +class TestLlamaAuto: + def __init__(self): + self.config = Config() + self.dp = int(os.getenv("dp")) + self.mp = int(os.getenv("mp")) + self.pp = int(os.getenv("pp")) + if os.getenv("use_sp") == "true": + self.config.sequence_parallel = True + if os.getenv("recompute") == "true": + self.config.recompute = True + self.config.recompute_granularity = os.getenv("recompute_granularity") + if os.getenv("use_lazy_init") == "true": + self.config.use_lazy_init = True + self.gradient_accumulation_steps = int(os.getenv("acc_step")) + self.amp = False + self.amp_dtype = "float16" + self.amp_level = "O1" + self.amp_master_grad = False + if os.getenv("amp") == "true": + self.amp = True + if os.getenv("amp_dtype") in ["float16", "bfloat16"]: + self.amp_dtype = os.getenv("amp_dtype") + if os.getenv("amp_level") in ["O0", "O1", "O2"]: + self.amp_level = os.getenv("amp_level") + if os.getenv("amp_master_grad") == "true": + self.amp_master_grad = True + + self.init_dist_env() + + def init_dist_env(self): + order = ["dp", "pp", "mp"] + dp_degree = self.dp + mp_degree = self.mp + pp_degree = self.pp + degree = [dp_degree, pp_degree, mp_degree] + mesh_dims = list(filter(lambda x: x[1] > 1, list(zip(order, degree)))) + if not mesh_dims: + mesh_dims = [("dp", 1)] + dim_names = [mesh_dim[0] for mesh_dim in mesh_dims] + mesh_shape 
= [mesh_dim[1] for mesh_dim in mesh_dims] + mesh_arr = np.arange( + 0, reduce(lambda x, y: x * y, mesh_shape, 1) + ).reshape(mesh_shape) + global_mesh = dist.ProcessMesh(mesh_arr, dim_names) + dist.auto_parallel.set_mesh(global_mesh) + + def run_llama(self, to_static=0): + if self.config.use_lazy_init: + with LazyGuard(): + model = LlamaForCausalLMAuto(self.config) + for param in model.parameters(): + assert not param._is_initialized() + param.initialize() + else: + model = LlamaForCausalLMAuto(self.config) + criterion = LlamaPretrainingCriterionAuto(self.config) + + lr_scheduler = paddle.optimizer.lr.LinearWarmup( + learning_rate=0.0001, warmup_steps=2, start_lr=0, end_lr=0.0001 + ) + optimizer = create_optimizer(model, lr_scheduler) + if self.amp and not to_static: + model, optimizer = paddle.amp.decorate( + models=model, + optimizers=optimizer, + level=self.amp_level, + dtype=self.amp_dtype, + master_grad=self.amp_master_grad, + ) + optimizer = dist.shard_optimizer(optimizer) + + train_dataset = RandomDataset(self.config.seq_length) + train_sampler = BatchSampler( + train_dataset, + batch_size=2, + shuffle=True, + drop_last=False, + ) + train_dataloader = DataLoader( + train_dataset, + batch_sampler=train_sampler, + num_workers=0, + ) + + if self.pp == 1: + meshes = [get_mesh(0)] + elif self.pp > 1: + meshes = [get_mesh(0), get_mesh(-1)] + else: + raise ValueError("pp should be greater than or equal to 1") + + dist_loader = dist.shard_dataloader( + dataloader=train_dataloader, + meshes=meshes, + shard_dims="dp", + ) + + global_step = 1 + tr_loss = float(0) + + if not to_static: + model.train() + scaler = None + if self.amp and self.amp_dtype == "float16": + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + scaler = dist.shard_scaler(scaler) + + for epoch_idx in range(1): + for step, inputs in enumerate(dist_loader()): + input_ids, labels = inputs + return input_ids._local_value()._md5sum() + else: + strategy = dist.Strategy() + if self.gradient_accumulation_steps > 1: + strategy.pipeline.accumulate_steps = ( + self.gradient_accumulation_steps + ) + + if self.amp: + amp = strategy.amp + amp.enable = self.amp + amp.dtype = self.amp_dtype + amp.level = self.amp_level.lower() + if self.amp_master_grad: + amp.use_master_grad = True + + dist_model = dist.to_static( + model, + dist_loader, + criterion, + optimizer, + strategy=strategy, + ) + + dist_model.train() + for step, inputs in enumerate(dist_loader()): + input_ids, labels = inputs + return input_ids._local_value()._md5sum() + + def run_test_cases(self): + dynamic_input_md5sum = self.run_llama(to_static=0) + static_input_md5sum = self.run_llama(to_static=1) + if dist.get_rank() == 0: + assert dynamic_input_md5sum == static_input_md5sum + + +if __name__ == '__main__': + TestLlamaAuto().run_test_cases() diff --git a/test/auto_parallel/hybrid_strategy/test_semi_auto_parallel_llama_model.py b/test/auto_parallel/hybrid_strategy/test_semi_auto_parallel_llama_model.py index 17516fbd57139..c9909bd0e13cb 100644 --- a/test/auto_parallel/hybrid_strategy/test_semi_auto_parallel_llama_model.py +++ b/test/auto_parallel/hybrid_strategy/test_semi_auto_parallel_llama_model.py @@ -204,5 +204,28 @@ def test_simple_net_hybrid_strategy(self): ) +class TestSemiAutoParallelLlamaDataLoader(test_base.CommunicationTestDistBase): + def setUp(self): + super().setUp(num_of_devices=8, timeout=200, nnode=1) + self._default_envs = {"dp": "2", "mp": "2", "pp": "2", "acc_step": "1"} + self._changeable_envs = { + "backend": ["gpu"], + "use_sp": ["false"],
+ "use_param_group": ["false"], + "recompute": ["true"], + "recompute_granularity": ["full"], + } + + def test_simple_net_hybrid_strategy(self): + envs_list = test_base.gen_product_envs_list( + self._default_envs, self._changeable_envs + ) + for envs in envs_list: + self.run_test_case( + "semi_auto_llama_dataloader.py", + user_defined_envs=envs, + ) + + if __name__ == "__main__": unittest.main() diff --git a/test/collective/process_group_nccl_pir.py b/test/collective/process_group_nccl_pir.py index 014ce56c787d1..00442ef265d47 100644 --- a/test/collective/process_group_nccl_pir.py +++ b/test/collective/process_group_nccl_pir.py @@ -321,6 +321,58 @@ def test_allreduce_prod_with_0d_input(self): np.multiply(x_np, y_np), y_out ) + def test_broadcast(self): + # to_tensor dose not support float16 input + if self.dtype == "float16": + return + pg = self.pg + # rank 0 + x_np = np.random.random(self.shape).astype(self.dtype) + # rank 1 + y_np = np.random.random(self.shape).astype(self.dtype) + with paddle.pir_utils.IrGuard(): + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(main_program, startup_program): + if pg.rank() == 0: + data = paddle.to_tensor(x_np) + else: + data = paddle.to_tensor(y_np) + dist.broadcast(data, 1) + exe = paddle.static.Executor() + (data,) = exe.run( + main_program, + feed={}, + fetch_list=[data], + ) + np.testing.assert_array_equal(y_np, data) + + def test_broadcast_with_0d_input(self): + # to_tensor dose not support float16 input + if self.dtype == "float16": + return + pg = self.pg + # rank 0 + x_np = np.random.random([]).astype(self.dtype) + # rank 1 + y_np = np.random.random([]).astype(self.dtype) + with paddle.pir_utils.IrGuard(): + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(main_program, startup_program): + if pg.rank() == 0: + data = paddle.to_tensor(x_np) + else: + data = paddle.to_tensor(y_np) + dist.broadcast(data, 1) + exe = paddle.static.Executor() + (data,) = exe.run( + main_program, + feed={}, + fetch_list=[data], + ) + np.testing.assert_array_equal(y_np, data) + class TestProcessGroupFp16(TestProcessGroupFp32): def setUp(self): diff --git a/test/deprecated/legacy_test/CMakeLists.txt b/test/deprecated/legacy_test/CMakeLists.txt index 9129199f30850..18891bc1cb65e 100644 --- a/test/deprecated/legacy_test/CMakeLists.txt +++ b/test/deprecated/legacy_test/CMakeLists.txt @@ -735,6 +735,7 @@ set_tests_properties(test_cumprod_op PROPERTIES TIMEOUT 300) set_tests_properties(test_split_program PROPERTIES TIMEOUT 120) set_tests_properties(test_graph_send_ue_recv_op PROPERTIES TIMEOUT 60) set_tests_properties(test_graph_send_uv_op PROPERTIES TIMEOUT 60) +set_tests_properties(test_uniform_random_op_deprecated PROPERTIES TIMEOUT 60) set_tests_properties(test_pretrained_model PROPERTIES TIMEOUT 120) set_tests_properties(test_model PROPERTIES TIMEOUT 300) diff --git a/test/deprecated/legacy_test/test_save_inference_model_conditional_op.py b/test/deprecated/legacy_test/test_save_inference_model_conditional_op.py index 783806a62b853..fc5129bd5ffb2 100644 --- a/test/deprecated/legacy_test/test_save_inference_model_conditional_op.py +++ b/test/deprecated/legacy_test/test_save_inference_model_conditional_op.py @@ -16,8 +16,6 @@ import tempfile import unittest -import numpy as np - import paddle import paddle.nn.functional as F from paddle.pir_utils import test_with_dygraph_pir @@ -102,7 +100,6 @@ def test_while_op(self): ) root_path = 
tempfile.TemporaryDirectory() model_file = os.path.join(root_path.name, "while_net") - x = paddle.to_tensor(np.random.random((1, 3, 8, 8)).astype('float32')) paddle.jit.save(net, model_file) paddle.enable_static() diff --git a/test/deprecated/legacy_test/test_uniform_random_op_deprecated.py b/test/deprecated/legacy_test/test_uniform_random_op_deprecated.py new file mode 100644 index 0000000000000..0e21865131bc0 --- /dev/null +++ b/test/deprecated/legacy_test/test_uniform_random_op_deprecated.py @@ -0,0 +1,67 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest + +import numpy as np +from test_attribute_var import UnittestBase + +import paddle +from paddle.base import Program, program_guard + + +class TestUniformMinMaxTensor(UnittestBase): + def init_info(self): + self.shapes = [[2, 3, 4]] + self.save_path = os.path.join(self.temp_dir.name, self.path_prefix()) + + def test_static(self): + main_prog = Program() + startup_prog = Program() + with program_guard(main_prog, startup_prog): + fc = paddle.nn.Linear(4, 10) + x = paddle.randn([2, 3, 4]) + x.stop_gradient = False + feat = fc(x) # [2,3,10] + min_v = paddle.to_tensor([0.1]) + max_v = paddle.to_tensor([0.9]) + y = paddle.uniform([2, 3, 10], min=min_v, max=max_v) + z = paddle.uniform([2, 3, 10], min=min_v, max=max_v) + + out = feat + y + z + + sgd = paddle.optimizer.SGD() + sgd.minimize(paddle.mean(out)) + self.assertTrue(self.var_prefix() in str(main_prog)) + + exe = paddle.static.Executor() + exe.run(startup_prog) + res = exe.run(fetch_list=[out]) + np.testing.assert_array_equal(res[0].shape, [2, 3, 10]) + + paddle.static.save_inference_model(self.save_path, [x], [out], exe) + # Test for Inference Predictor + infer_out = self.infer_prog() + np.testing.assert_array_equal(res[0].shape, [2, 3, 10]) + + def path_prefix(self): + return 'uniform_random' + + def var_prefix(self): + return "Var[" + + +if __name__ == "__main__": + unittest.main() diff --git a/test/dygraph_to_static/test_save_inference_model.py b/test/dygraph_to_static/test_save_inference_model.py index f3a0b1eb9260e..880990c64ad7a 100644 --- a/test/dygraph_to_static/test_save_inference_model.py +++ b/test/dygraph_to_static/test_save_inference_model.py @@ -91,7 +91,7 @@ def tearDown(self): @test_ast_only @test_legacy_and_pir - def _test_save_inference_model(self): + def test_save_inference_model(self): fc_size = 20 x_data = np.random.random((fc_size, fc_size)).astype('float32') paddle.seed(SEED) diff --git a/test/ir/inference/CMakeLists.txt b/test/ir/inference/CMakeLists.txt index 0efa774fa7fae..63c2cb59ce90f 100755 --- a/test/ir/inference/CMakeLists.txt +++ b/test/ir/inference/CMakeLists.txt @@ -158,7 +158,6 @@ if(WITH_GPU AND TENSORRT_FOUND) set_tests_properties(test_trt_ops_fp32_mix_precision PROPERTIES TIMEOUT 300) set_tests_properties(test_trt_inference_fp16_io PROPERTIES TIMEOUT 500) set_tests_properties(test_trt_convert_unary PROPERTIES TIMEOUT 600) - 
set_tests_properties(test_seq_concat_fc_fuse_pass PROPERTIES TIMEOUT 200) if(NOT WIN32) set_tests_properties(test_trt_explicit_quantization_resnet PROPERTIES TIMEOUT 300) @@ -230,6 +229,7 @@ if(WITH_GPU AND TENSORRT_FOUND) set_tests_properties(test_conv_eltwiseadd_bn_fuse_pass PROPERTIES TIMEOUT 300) set_tests_properties(test_save_optimized_model_pass PROPERTIES TIMEOUT 300) + set_tests_properties(test_seq_concat_fc_fuse_pass PROPERTIES TIMEOUT 200) if(WIN32) set_tests_properties(test_matmul_scale_fuse_pass PROPERTIES TIMEOUT 300) set_tests_properties(test_matmul_v2_scale_fuse_pass PROPERTIES TIMEOUT diff --git a/test/ir/inference/auto_scan_test.py b/test/ir/inference/auto_scan_test.py index 35986d6888cd7..4d18b5087c1b3 100755 --- a/test/ir/inference/auto_scan_test.py +++ b/test/ir/inference/auto_scan_test.py @@ -656,6 +656,7 @@ def __init__(self, *args, **kwargs): # Use a separate random generator for skipping tests self.skip_rng = np.random.default_rng(int(time.strftime("%W"))) + self.optimization_level = None def create_inference_config(self, use_trt=True) -> paddle_infer.Config: config = paddle_infer.Config() @@ -683,6 +684,8 @@ def create_inference_config(self, use_trt=True) -> paddle_infer.Config: self.dynamic_shape.opt_input_shape, self.dynamic_shape.disable_trt_plugin_fp16, ) + if self.optimization_level is not None: + config.set_tensorrt_optimization_level(self.optimization_level) return config def assert_tensors_near( diff --git a/test/ir/inference/test_trt_convert_flash_multihead_matmul.py b/test/ir/inference/test_trt_convert_flash_multihead_matmul.py index 62b9674fcfb4b..141deb99dbd2f 100644 --- a/test/ir/inference/test_trt_convert_flash_multihead_matmul.py +++ b/test/ir/inference/test_trt_convert_flash_multihead_matmul.py @@ -24,6 +24,10 @@ class TrtConvertFlashMultiHeadMatmulTest(TrtLayerAutoScanTest): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.optimization_level = 5 + def is_program_valid(self, program_config: ProgramConfig) -> bool: ver = paddle_infer.get_trt_compile_version() if ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 < 8520: @@ -267,7 +271,7 @@ def clear_dynamic_shape(): # for static_shape clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 - self.trt_param.workspace_size = 2013265920 + self.trt_param.workspace_size = 1 << 33 yield self.create_inference_config(), (1, 2), (1e-5, 1e-5) self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), (1, 2), (2e-2, 5e-3) @@ -275,7 +279,7 @@ def clear_dynamic_shape(): # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 - self.trt_param.workspace_size = 2013265920 + self.trt_param.workspace_size = 1 << 33 yield self.create_inference_config(), (1, 2), (1e-5, 1e-4) self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), (1, 2), (2e-2, 5e-3) @@ -320,6 +324,10 @@ def test(self): class TrtConvertFlashMultiHeadMatmulWeightInputTest(TrtLayerAutoScanTest): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.optimization_level = 5 + def is_program_valid(self, program_config: ProgramConfig) -> bool: ver = paddle_infer.get_trt_compile_version() if ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 < 8520: @@ -570,7 +578,7 @@ def clear_dynamic_shape(): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 program_config.set_input_type(np.float32) - self.trt_param.workspace_size = 
2013265920 + self.trt_param.workspace_size = 1 << 33 yield self.create_inference_config(), (1, 5), (1e-5, 1e-5) self.trt_param.precision = paddle_infer.PrecisionType.Half program_config.set_input_type(np.float16) @@ -579,7 +587,7 @@ def clear_dynamic_shape(): generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 program_config.set_input_type(np.float32) - self.trt_param.workspace_size = 2013265920 + self.trt_param.workspace_size = 1 << 33 yield self.create_inference_config(), (1, 5), (1e-5, 1e-4) self.trt_param.precision = paddle_infer.PrecisionType.Half program_config.set_input_type(np.float16) diff --git a/test/ir/inference/test_trt_convert_share_data.py b/test/ir/inference/test_trt_convert_share_data.py index 168ef72b6e590..5c5358a4ab45b 100644 --- a/test/ir/inference/test_trt_convert_share_data.py +++ b/test/ir/inference/test_trt_convert_share_data.py @@ -64,12 +64,14 @@ def generate_input(type): "op_inputs": {"X": ["input_data"]}, "op_outputs": {"Out": ["output_data0"]}, "op_attrs": {}, + "outputs_dtype": {"output_data0": dtype}, }, { "op_type": "share_data", "op_inputs": {"X": ["output_data0"]}, "op_outputs": {"Out": ["output_data1"]}, "op_attrs": {}, + "outputs_dtype": {"output_data1": dtype}, }, ] diff --git a/test/ir/inference/test_trt_convert_trans_layernorm.py b/test/ir/inference/test_trt_convert_trans_layernorm.py index 67ab3277f33b1..7aad264b5080d 100644 --- a/test/ir/inference/test_trt_convert_trans_layernorm.py +++ b/test/ir/inference/test_trt_convert_trans_layernorm.py @@ -27,6 +27,10 @@ class TrtConvertTransLayernormTest(TrtLayerAutoScanTest): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.optimization_level = 5 + def is_program_valid(self, program_config: ProgramConfig) -> bool: return True diff --git a/test/ir/inference/test_trt_explicit_quantization_model.py b/test/ir/inference/test_trt_explicit_quantization_model.py index 04a050f3e7659..4a36d9e3f96b8 100644 --- a/test/ir/inference/test_trt_explicit_quantization_model.py +++ b/test/ir/inference/test_trt_explicit_quantization_model.py @@ -81,7 +81,7 @@ def transform(x): batch_size=64, ) - def train(program, stop_iter=100): + def train(program, stop_iter=128): for it, data in enumerate(train_loader): if it == 0: self.input_data = data[0]['image'] diff --git a/test/ir/pir/cinn/CMakeLists.txt b/test/ir/pir/cinn/CMakeLists.txt index e037d564547c1..a8d99e7170654 100644 --- a/test/ir/pir/cinn/CMakeLists.txt +++ b/test/ir/pir/cinn/CMakeLists.txt @@ -18,7 +18,7 @@ if(WITH_GPU) COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_BINARY_DIR}:${CMAKE_BINARY_DIR}/python/:$ENV{PYTHONPATH} - FLAGS_enable_pir_api=1 FLAGS_prim_all=True + FLAGS_enable_pir_api=1 FLAGS_prim_all=True FLAGS_check_infer_symbolic=1 FLAGS_cinn_new_group_scheduler=1 FLAGS_cinn_bucket_compile=1 FLAGS_group_schedule_tiling_first=1 ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/${cinn_pir_test_name}.py diff --git a/test/ir/pir/cinn/inference/CMakeLists.txt b/test/ir/pir/cinn/inference/CMakeLists.txt index 497f0e3b474b6..18b4fbcb32145 100644 --- a/test/ir/pir/cinn/inference/CMakeLists.txt +++ b/test/ir/pir/cinn/inference/CMakeLists.txt @@ -12,8 +12,9 @@ if(WITH_GPU) ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_BINARY_DIR}:${CMAKE_BINARY_DIR}/python/:$ENV{PYTHONPATH} FLAGS_prim_enable_dynamic=True FLAGS_prim_all=True - FLAGS_enable_pir_api=1 FLAGS_cinn_bucket_compile=True - FLAGS_group_schedule_tiling_first=1 ${PYTHON_EXECUTABLE} + FLAGS_check_infer_symbolic=1 FLAGS_enable_pir_api=1 + 
FLAGS_cinn_bucket_compile=True FLAGS_group_schedule_tiling_first=1 + ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/${cinn_pir_test_name}.py WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) set_tests_properties(${cinn_pir_test_name} PROPERTIES LABELS diff --git a/test/ir/pir/cinn/sub_graphs/CMakeLists.txt b/test/ir/pir/cinn/sub_graphs/CMakeLists.txt index 9bc68958916d2..8c9ee7be18da1 100644 --- a/test/ir/pir/cinn/sub_graphs/CMakeLists.txt +++ b/test/ir/pir/cinn/sub_graphs/CMakeLists.txt @@ -95,9 +95,9 @@ if(WITH_GPU) ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_BINARY_DIR}:${CMAKE_BINARY_DIR}/python/:$ENV{PYTHONPATH} FLAGS_prim_enable_dynamic=1 FLAGS_cinn_new_group_scheduler=1 - FLAGS_enable_pir_api=1 FLAGS_cinn_bucket_compile=1 - FLAGS_group_schedule_tiling_first=1 FLAGS_cudnn_deterministic=true - ${PYTHON_EXECUTABLE} + FLAGS_check_infer_symbolic=1 FLAGS_enable_pir_api=1 + FLAGS_cinn_bucket_compile=1 FLAGS_group_schedule_tiling_first=1 + FLAGS_cudnn_deterministic=true ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/${cinn_sub_graph_test_name}.py WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) set_tests_properties(${cinn_sub_graph_test_name} PROPERTIES LABELS diff --git a/test/ir/pir/cinn/sub_graphs/base.py b/test/ir/pir/cinn/sub_graphs/base.py index c618f16f39d59..515d2e92d6a4f 100644 --- a/test/ir/pir/cinn/sub_graphs/base.py +++ b/test/ir/pir/cinn/sub_graphs/base.py @@ -17,6 +17,7 @@ import numpy as np import paddle +from paddle import nn class TestBase(unittest.TestCase): @@ -28,9 +29,18 @@ def setUp(self): self.with_prim = True self.with_cinn = True self.atol = 1e-6 + self.train_atol = 1e-6 self.with_precision_compare = True + self.with_train = False # default False in this PR; will default to True in the next incremental PR # override customized setting self.init() + if self.inputs: + self.set_input_grad() + + def set_input_grad(self): + if self.with_train: + for i in range(len(self.inputs)): + self.inputs[i].stop_gradient = False def init(self): pass @@ -55,7 +65,10 @@ def train(self, net, to_static, with_prim=False, with_cinn=False): net, full_graph=True, input_spec=self.input_specs ) paddle.seed(123) - net.eval() + if self.with_train: + net.train() + else: + net.eval() outs = net(*self.inputs) return outs @@ -77,3 +90,22 @@ def test_ast_prim_cinn(self): np.testing.assert_allclose( st.numpy(), cinn.numpy(), atol=self.atol ) + if self.with_train: + criterion = nn.MSELoss() + target = paddle.rand(shape=st_out.shape, dtype=st_out.dtype) + st_loss = criterion(st_out, target) + st_loss.backward() + st_grad = [] + for i in range(len(self.inputs)): + if self.inputs[i].dtype != paddle.int64: + st_grad.append(self.inputs[i].grad.numpy().copy()) + cinn_loss = criterion(cinn_out, target) + cinn_loss.backward() + cinn_grad = [] + for i in range(len(self.inputs)): + if self.inputs[i].dtype != paddle.int64: + cinn_grad.append(self.inputs[i].grad.numpy().copy()) + for i in range(len(cinn_grad)): + np.testing.assert_allclose( + st_grad[i], cinn_grad[i], atol=self.train_atol + ) diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_0.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_0.py index 37f4ec1fe33a4..d9ae6f9bca654 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_0.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_0.py @@ -135,6 +135,7 @@ def init(self): paddle.rand(shape=[22, 512, 7, 7], dtype=paddle.float32), ) self.net = LayerCase() + self.with_train = True def set_flags(self): # NOTE(Aurelius84): cinn_op.pool2d only support pool_type='avg' under adaptive=True diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_1.py
b/test/ir/pir/cinn/sub_graphs/test_sub_graph_1.py index 7941c323998a7..39204aa627f26 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_1.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_1.py @@ -55,7 +55,7 @@ def init(self): shape=(-1, -1, -1, -1), dtype=paddle.float32, name=None, - stop_gradient=True, + stop_gradient=False, ) ] self.inputs = ( diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_2.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_2.py index 6c3b016cb4895..0f69be3aa9427 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_2.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_2.py @@ -67,13 +67,14 @@ def init(self): shape=(-1, 256, -1, -1), dtype=paddle.float32, name=None, - stop_gradient=True, + stop_gradient=False, ) ] self.inputs = ( paddle.rand(shape=[43, 256, 56, 56], dtype=paddle.float32), ) self.net = LayerCase() + self.with_train = True def set_flags(self): # NOTE(Aurelius84): cinn_op.pool2d only support pool_type='avg' under adaptive=True diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_3.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_3.py index 05cd0e0db54d4..f944d4f9ee527 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_3.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_3.py @@ -17,8 +17,6 @@ # api:paddle.tensor.manipulation.reshape||api:paddle.tensor.linalg.transpose||api:paddle.tensor.linalg.transpose||method:__getitem__||api:paddle.tensor.manipulation.gather||method:__getitem__||api:paddle.tensor.manipulation.gather||method:__getitem__||api:paddle.tensor.manipulation.gather||method:__getitem__||api:paddle.tensor.manipulation.gather||method:__getitem__||api:paddle.tensor.manipulation.gather||method:__getitem__||api:paddle.tensor.manipulation.gather||method:__getitem__||api:paddle.tensor.manipulation.gather||method:__getitem__||api:paddle.tensor.manipulation.gather||method:__getitem__||api:paddle.tensor.manipulation.gather||method:__getitem__||api:paddle.tensor.manipulation.gather||method:__getitem__||api:paddle.tensor.manipulation.gather||method:__getitem__||api:paddle.tensor.manipulation.gather||method:__getitem__||api:paddle.tensor.manipulation.gather||method:__getitem__||api:paddle.tensor.manipulation.gather||method:__getitem__||api:paddle.tensor.manipulation.gather||method:__getitem__||api:paddle.tensor.manipulation.gather||api:paddle.tensor.manipulation.concat||api:paddle.tensor.linalg.transpose||method:reshape||api:paddle.tensor.linalg.transpose||api:paddle.tensor.linalg.matmul||method:__mul__||method:__add__||api:paddle.nn.functional.activation.softmax||api:paddle.tensor.linalg.matmul||api:paddle.tensor.linalg.transpose||api:paddle.tensor.manipulation.reshape from base import * # noqa: F403 -from paddle.static import InputSpec - class LayerCase(paddle.nn.Layer): def __init__(self): diff --git a/test/deprecated/legacy_test/test_uniform_random_bf16_op.py b/test/legacy_test/test_uniform_random_bf16_op.py similarity index 100% rename from test/deprecated/legacy_test/test_uniform_random_bf16_op.py rename to test/legacy_test/test_uniform_random_bf16_op.py diff --git a/test/deprecated/legacy_test/test_uniform_random_op.py b/test/legacy_test/test_uniform_random_op.py similarity index 94% rename from test/deprecated/legacy_test/test_uniform_random_op.py rename to test/legacy_test/test_uniform_random_op.py index b7424d4cf2e30..ac981f38b13c2 100644 --- a/test/deprecated/legacy_test/test_uniform_random_op.py +++ b/test/legacy_test/test_uniform_random_op.py @@ -12,13 +12,11 @@ # See the License for the specific language governing 
diff --git a/test/deprecated/legacy_test/test_uniform_random_bf16_op.py b/test/legacy_test/test_uniform_random_bf16_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_uniform_random_bf16_op.py
rename to test/legacy_test/test_uniform_random_bf16_op.py
diff --git a/test/deprecated/legacy_test/test_uniform_random_op.py b/test/legacy_test/test_uniform_random_op.py
similarity index 94%
rename from test/deprecated/legacy_test/test_uniform_random_op.py
rename to test/legacy_test/test_uniform_random_op.py
index b7424d4cf2e30..ac981f38b13c2 100644
--- a/test/deprecated/legacy_test/test_uniform_random_op.py
+++ b/test/legacy_test/test_uniform_random_op.py
@@ -12,13 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import os
 import unittest

 import numpy as np
 from op import Operator
 from op_test import OpTest, convert_uint16_to_float
-from test_attribute_var import UnittestBase

 import paddle
 from paddle import base
@@ -733,46 +731,5 @@ def test_fixed_random_number(self):
         paddle.enable_static()


-class TestUniformMinMaxTensor(UnittestBase):
-    def init_info(self):
-        self.shapes = [[2, 3, 4]]
-        self.save_path = os.path.join(self.temp_dir.name, self.path_prefix())
-
-    def test_static(self):
-        main_prog = Program()
-        startup_prog = Program()
-        with program_guard(main_prog, startup_prog):
-            fc = paddle.nn.Linear(4, 10)
-            x = paddle.randn([2, 3, 4])
-            x.stop_gradient = False
-            feat = fc(x)  # [2,3,10]
-            min_v = paddle.to_tensor([0.1])
-            max_v = paddle.to_tensor([0.9])
-            y = paddle.uniform([2, 3, 10], min=min_v, max=max_v)
-            z = paddle.uniform([2, 3, 10], min=min_v, max=max_v)
-
-            out = feat + y + z
-
-            sgd = paddle.optimizer.SGD()
-            sgd.minimize(paddle.mean(out))
-            self.assertTrue(self.var_prefix() in str(main_prog))
-
-            exe = paddle.static.Executor()
-            exe.run(startup_prog)
-            res = exe.run(fetch_list=[out])
-            np.testing.assert_array_equal(res[0].shape, [2, 3, 10])
-
-            paddle.static.save_inference_model(self.save_path, [x], [out], exe)
-            # Test for Inference Predictor
-            infer_out = self.infer_prog()
-            np.testing.assert_array_equal(res[0].shape, [2, 3, 10])
-
-    def path_prefix(self):
-        return 'uniform_random'
-
-    def var_prefix(self):
-        return "Var["
-
-
 if __name__ == "__main__":
     unittest.main()
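For the record, the deleted `TestUniformMinMaxTensor` covered the old static-graph path of `paddle.uniform` with tensor-valued `min`/`max`; the operator itself is untouched by this PR, and the plain dynamic-mode call still works as before (shape and bounds are illustrative):

    import paddle

    # sample a [2, 3, 10] tensor uniformly from [0.1, 0.9)
    out = paddle.uniform([2, 3, 10], min=0.1, max=0.9)
    print(out.shape)  # [2, 3, 10]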
diff --git a/test/quantization/test_weight_only_linear.py b/test/quantization/test_weight_only_linear.py
index e047cc92a93b1..367ebd1502989 100644
--- a/test/quantization/test_weight_only_linear.py
+++ b/test/quantization/test_weight_only_linear.py
@@ -675,8 +675,10 @@ def test_weight_quantize_and_dequantize_pir(self, algo='weight_only_int8'):
         res = exe.run(feed={}, fetch_list=[weight, dequant_weight])
         np.testing.assert_allclose(res[0], res[1], rtol=1e-2, atol=1e-2)

-    def test_weight_quantize_and_dequantize_int4_pir(self):
-        self.test_weight_quantize_and_dequantize_pir(algo='weight_only_int4')
+    # [NOTE, wangbojun] currently, weight_only_int4 does not support GPU
+    # weight_quantize, which may cause errors in the PIR test.
+    # def test_weight_quantize_and_dequantize_int4_pir(self):
+    #     self.test_weight_quantize_and_dequantize_pir(algo='weight_only_int4')

     def test_weight_only_linear(self):
         out_expect = self.get_linear_out()
@@ -722,13 +724,22 @@ def test_weightonly_linear_backward(
             * 1
             / math.sqrt(4096)
         )
-
-        quant_weight, quant_scale = Q.weight_quantize(
-            x=weight.cuda(), algo=algo
-        )
-        dequant_weight = Q.weight_dequantize(
-            quant_weight.cuda(), quant_scale, algo=algo
-        )
+        if algo == "weight_only_int8":
+            quant_weight, quant_scale = Q.weight_quantize(
+                x=weight.cuda(), algo=algo
+            )
+            dequant_weight = Q.weight_dequantize(
+                quant_weight.cuda(), quant_scale, algo=algo
+            )
+        elif algo == "weight_only_int4":
+            quant_weight, quant_scale = Q.weight_quantize(
+                x=weight.cpu(), algo=algo
+            )
+            quant_weight = quant_weight.cuda()
+            quant_scale = quant_scale.cuda()
+            dequant_weight = Q.weight_dequantize(
+                quant_weight, quant_scale, algo=algo
+            )

         np.testing.assert_allclose(weight, dequant_weight, rtol=1e-2, atol=1e-2)

         quant_out = Q.weight_only_linear(
@@ -738,11 +749,11 @@ def test_weightonly_linear_backward(
             weight_dtype=weight_dtype,
         )
         out = paddle.matmul(x=x, y=weight)
-        np.testing.assert_allclose(quant_out, out, rtol=1e-3, atol=1e-3)
+        np.testing.assert_allclose(quant_out, out, rtol=1e-2, atol=1e-2)

         quant_out.backward()
         out.backward()
-        np.testing.assert_allclose(quant_x.grad, x.grad, rtol=1e-3, atol=1e-3)
+        np.testing.assert_allclose(quant_x.grad, x.grad, rtol=1e-2, atol=1e-2)

     def test_weightonly_linear_backward_int4(self):
         self.test_weightonly_linear_backward(
diff --git a/test/sot/test_sot_dynamic_shape.py b/test/sot/test_sot_dynamic_shape.py
index d0bb3623226c5..ceed37d64438a 100644
--- a/test/sot/test_sot_dynamic_shape.py
+++ b/test/sot/test_sot_dynamic_shape.py
@@ -39,6 +39,13 @@ def dynamic_int_input_func2(x, n):
     return x + n[1]


+def dynamic_int_input_func3(x, n):
+    if n < 4:
+        return 1
+    x = paddle.reshape(x, [n, -1])
+    return (x + n) * 2 - 1, (-n + 1) * 2 - 1
+
+
 class TestOpcodeExecutorDynamicShapeCache(TestCaseBase):
     def test_dynamic_int_input_cache_hit_case1(self):
         with with_allow_dynamic_shape_guard(
@@ -68,6 +75,16 @@ def test_dynamic_int_input_cache_hit_case2(self):
             )
             self.assertEqual(ctx.translate_count, 2)

+    def test_dynamic_int_input_cache_hit_case3(self):
+        with with_allow_dynamic_shape_guard(
+            True
+        ), test_instruction_translator_cache_context() as ctx:
+            for i in range(0, 6):
+                self.assert_results(
+                    dynamic_int_input_func3, paddle.randn([3, 4, 5]), i
+                )
+                self.assertEqual(ctx.translate_count, i + 1)
+

 if __name__ == '__main__':
     unittest.main()
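The int4 branch added to `test_weightonly_linear_backward` above encodes the workaround from the NOTE: quantize on CPU, then move the packed weight and scales to the GPU before dequantizing or running the matmul. As a standalone sketch (shapes are illustrative and a CUDA build of Paddle is assumed):

    import paddle
    from paddle.nn import quant as Q

    weight = paddle.randn([4096, 4096]).astype('float16')

    # weight_only_int4 weight_quantize currently runs on CPU only
    quant_weight, quant_scale = Q.weight_quantize(
        x=weight.cpu(), algo='weight_only_int4'
    )

    # move the packed weight and scales to the GPU for the actual compute
    quant_weight, quant_scale = quant_weight.cuda(), quant_scale.cuda()
    dequant_weight = Q.weight_dequantize(
        quant_weight, quant_scale, algo='weight_only_int4'
    )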