Commit b56d976

Fix

co63oc committed May 30, 2024
2 parents 4c53647 + d3964c5 commit b56d976
Showing 109 changed files with 2,330 additions and 1,842 deletions.
2 changes: 1 addition & 1 deletion .clang-tidy
@@ -70,7 +70,7 @@ clang-analyzer-core.uninitialized.Assign,
clang-analyzer-cplusplus.InnerPointer,
-clang-analyzer-cplusplus.Move,
-clang-analyzer-cplusplus.NewDelete,
-clang-analyzer-cplusplus.NewDeleteLeaks,
clang-analyzer-cplusplus.NewDeleteLeaks,
-clang-analyzer-cplusplus.PureVirtualCall,
-clang-analyzer-cplusplus.SelfAssignment,
-clang-analyzer-cplusplus.SmartPtr,
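In a .clang-tidy Checks list a leading '-' disables a check, so this hunk re-enables clang-analyzer-cplusplus.NewDeleteLeaks. A minimal example of the kind of code that check reports (illustrative only, not taken from the repository):

#include <string>

void MakeLabel() {
  // Allocated with new but never deleted before going out of scope:
  // clang-analyzer-cplusplus.NewDeleteLeaks reports a potential memory leak.
  std::string* label = new std::string("tile_config");
  (void)label;
}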
8 changes: 8 additions & 0 deletions paddle/cinn/hlir/framework/pir/op_lowering_impl.cc
@@ -101,6 +101,14 @@ std::shared_ptr<GroupInfo> OpLowererImpl::GetGroupInfo(
for (auto& val : group->output_values()) {
group_info->direct_output_var_names.insert(ValueName(val));
}

group->WalkOps([&group_info](::pir::Operation* op) {
if (CompatibleInfo::OpKind(*op) == OpPatternKind::kReduction) {
group_info->raw_reduce_axis = cinn::fusion::GetReduceAxisIdx(op);
group_info->raw_data_rank =
cinn::fusion::GetCompitableRank(op->operand_source(0));
}
});
return group_info;
}

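The new WalkOps callback records, for any reduction op in the group, its reduce axes exactly as written and the rank of its first operand. A schematic standalone analogue with plain structs (Op, Group, and CollectReduceInfo below are illustrative stand-ins, not the pir API):

#include <cstdint>
#include <functional>
#include <vector>

enum class OpKind { kElementWise, kReduction };

struct Op {
  OpKind kind;
  std::vector<int64_t> reduce_axis;  // axes exactly as written on the op
  int64_t input_rank;                // rank of operand 0
};

struct Group {
  std::vector<Op> ops;
  void WalkOps(const std::function<void(const Op&)>& fn) const {
    for (const Op& op : ops) fn(op);
  }
};

struct GroupInfo {
  std::vector<int64_t> raw_reduce_axis;
  int64_t raw_data_rank = 0;
};

GroupInfo CollectReduceInfo(const Group& group) {
  GroupInfo info;
  group.WalkOps([&info](const Op& op) {
    if (op.kind == OpKind::kReduction) {
      info.raw_reduce_axis = op.reduce_axis;
      info.raw_data_rank = op.input_rank;
    }
  });
  return info;
}

int main() {
  Group g;
  g.ops = {{OpKind::kElementWise, {}, 3}, {OpKind::kReduction, {1, 2}, 3}};
  GroupInfo info = CollectReduceInfo(g);
  return (info.raw_data_rank == 3 &&
          info.raw_reduce_axis == std::vector<int64_t>({1, 2})) ? 0 : 1;
}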
2 changes: 2 additions & 0 deletions paddle/cinn/hlir/framework/pir/op_lowering_impl.h
@@ -51,6 +51,8 @@ typedef bool (OpLowererImpl::*ScheduleDetermineFunction)(::pir::Operation*);
struct GroupInfo {
std::vector<int64_t> data_space;
std::vector<int64_t> reduce_axis;
int64_t raw_data_rank;
std::vector<int64_t> raw_reduce_axis;
std::set<std::string> reduce_var_names;
std::set<std::string> shared_var_names;
std::set<std::string> direct_output_var_names;
12 changes: 12 additions & 0 deletions paddle/cinn/ir/group_schedule/config/filedatabase.cc
@@ -14,13 +14,25 @@

#include "paddle/cinn/ir/group_schedule/config/filedatabase.h"

#include <sys/stat.h>

#include <google/protobuf/text_format.h>
#include <google/protobuf/util/json_util.h>
#include <fstream>

#include "paddle/cinn/utils/multi_threading.h"

#define MKDIR(path) mkdir(path, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH)
PD_DECLARE_string(cinn_tile_config_filename_label);
static bool PathExists(const std::string& path) {
struct stat statbuf;
if (stat(path.c_str(), &statbuf) != -1) {
if (S_ISDIR(statbuf.st_mode)) {
return true;
}
}
return false;
}

namespace cinn {
namespace ir {
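The new PathExists helper and MKDIR macro follow the usual POSIX pattern for checking a directory before creating it. A minimal standalone sketch of how the two would typically combine (EnsureDir is an illustrative name, not part of the commit):

#include <sys/stat.h>
#include <sys/types.h>

#include <string>

#define MKDIR(path) mkdir(path, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH)

// Returns true only when 'path' exists and is a directory.
static bool PathExists(const std::string& path) {
  struct stat statbuf;
  if (stat(path.c_str(), &statbuf) != -1) {
    return S_ISDIR(statbuf.st_mode);
  }
  return false;
}

// Illustrative helper: create the directory only when it is missing.
static bool EnsureDir(const std::string& path) {
  if (PathExists(path)) return true;
  return MKDIR(path.c_str()) == 0;
}

int main() { return EnsureDir("/tmp/cinn_tile_config_demo") ? 0 : 1; }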
1 change: 0 additions & 1 deletion paddle/cinn/ir/group_schedule/config/filedatabase.h
@@ -16,7 +16,6 @@

#include "paddle/cinn/ir/group_schedule/config/database.h"
#include "paddle/cinn/ir/group_schedule/config/tileconfig_desc.pb.h"
#include "paddle/fluid/inference/analysis/helper.h"
namespace cinn {
namespace ir {

10 changes: 9 additions & 1 deletion paddle/cinn/ir/group_schedule/config/group_tile_config.cc
@@ -37,16 +37,24 @@ std::shared_ptr<ScheduleConfig::BaseInfo> InitBasicInfo(
base_info->broadcast_info = group_info->broadcast_info;
base_info->broadcast_to_elementwise = group_info->broadcast_to_elementwise;
base_info->data_rank = group_info->data_space.size();
base_info->raw_data_rank = group_info->raw_data_rank;

std::set<int64_t> reduce_dim_loc;
for (auto dim : group_info->reduce_axis) {
for (int64_t dim : group_info->reduce_axis) {
if (dim < 0) {
dim += base_info->data_rank;
}
base_info->reduce_axis.push_back(dim);
reduce_dim_loc.insert(dim);
}

for (int64_t dim : group_info->raw_reduce_axis) {
if (dim < 0) {
dim += base_info->data_rank;
}
base_info->raw_reduce_axis.push_back(dim);
}

base_info->spatial_numel = 1;
base_info->reduce_numel = 1;
for (int64_t i = 0; i < base_info->data_rank; ++i) {
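Both loops normalize negative axes against the rank before storing them, e.g. axis -1 on a rank-3 tensor becomes 2. A tiny standalone version of that normalization:

#include <cstdint>
#include <vector>

std::vector<int64_t> NormalizeAxes(std::vector<int64_t> axes, int64_t rank) {
  for (int64_t& dim : axes) {
    if (dim < 0) dim += rank;  // -1 -> rank - 1, -2 -> rank - 2, ...
  }
  return axes;
}

int main() {
  std::vector<int64_t> axes = NormalizeAxes({-1, 0}, 3);
  return (axes[0] == 2 && axes[1] == 0) ? 0 : 1;
}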
2 changes: 2 additions & 0 deletions paddle/cinn/ir/group_schedule/config/group_tile_config.h
@@ -29,7 +29,9 @@ namespace ir {
struct ScheduleConfig {
struct BaseInfo {
std::vector<int64_t> reduce_axis;
std::vector<int64_t> raw_reduce_axis;
int64_t data_rank;
int64_t raw_data_rank;
int64_t reduce_numel;
int64_t spatial_numel;
bool has_dynamic_spatial{false};
4 changes: 4 additions & 0 deletions paddle/common/flags.cc
@@ -1724,3 +1724,7 @@ PHI_DEFINE_EXPORTED_string(cusolver_dir, // NOLINT
PHI_DEFINE_EXPORTED_string(cusparse_dir, // NOLINT
"",
"Specify path for loading libcusparse.so.*.");
PHI_DEFINE_EXPORTED_string(
win_cuda_bin_dir, // NOLINT
"",
"Specify path for loading *.dll about cuda on windows");
2 changes: 1 addition & 1 deletion paddle/fluid/distributed/ps/table/common_graph_table.cc
@@ -14,7 +14,7 @@

#include "paddle/fluid/distributed/ps/table/common_graph_table.h"

#include <time.h>
#include <ctime>

#include <algorithm>
#include <chrono>
7 changes: 6 additions & 1 deletion paddle/fluid/distributed/ps/table/graph/graph_node.cc
@@ -74,7 +74,12 @@ void GraphNode::build_sampler(std::string sample_type) {
} else if (sample_type == "weighted") {
sampler = new WeightedSampler();
}
sampler->build(edges);
if (sampler != nullptr) {
sampler->build(edges);
} else {
throw std::runtime_error("Failed to create a sampler of type: " +
sample_type);
}
}
void FeatureNode::to_buffer(char* buffer, bool need_feature) {
memcpy(buffer, &id, id_size);
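build_sampler only constructs a sampler for the sample types it recognizes, so an unknown type previously left 'sampler' null and crashed on sampler->build(edges); the new guard turns that into an exception with the offending type in the message. A condensed standalone sketch of the pattern (the Sampler classes and edge type here are stand-ins, not the real ones):

#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

struct Sampler {
  virtual ~Sampler() = default;
  virtual void build(const std::vector<int>& edges) = 0;
};

struct RandomSampler : Sampler {
  void build(const std::vector<int>&) override {}
};

struct WeightedSampler : Sampler {
  void build(const std::vector<int>&) override {}
};

std::unique_ptr<Sampler> MakeSampler(const std::string& sample_type) {
  if (sample_type == "random") return std::make_unique<RandomSampler>();
  if (sample_type == "weighted") return std::make_unique<WeightedSampler>();
  return nullptr;  // unknown type: caller must not dereference blindly
}

void BuildSampler(const std::string& sample_type,
                  const std::vector<int>& edges) {
  auto sampler = MakeSampler(sample_type);
  if (sampler != nullptr) {
    sampler->build(edges);
  } else {
    throw std::runtime_error("Failed to create a sampler of type: " +
                             sample_type);
  }
}

int main() {
  try {
    BuildSampler("weighted", {1, 2, 3});
    BuildSampler("unknown", {});
  } catch (const std::runtime_error&) {
    return 0;  // the unknown type is reported instead of crashing
  }
  return 1;
}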
@@ -144,6 +144,10 @@ void DependencyBuilder::ShareDependencyFrom(const DependencyBuilder& src) {
is_build_ = true;
}

const std::string& DependencyBuilder::GetInstructionName(size_t op_idx) const {
return (*instructions_)[op_idx].OpBase()->Type();
}

const std::map<size_t, std::set<size_t>>& DependencyBuilder::OpDownstreamMap()
const {
PADDLE_ENFORCE_EQ(
@@ -340,6 +344,13 @@ void DependencyBuilder::AddDependencyForReadOp() {
void DependencyBuilder::AddDependencyForSequentialRun() {
size_t dependence_op_idx = ULLONG_MAX;
for (size_t op_idx = 0; op_idx < op_num_; ++op_idx) {
if (this->GetInstructionName(op_idx) == "pd_op.full_int_array") {
VLOG(8) << "Skip adding dependency for sequential run: "
<< dependence_op_idx << "->" << op_idx << " "
<< this->GetInstructionName(dependence_op_idx) << "->"
<< this->GetInstructionName(op_idx);
continue;
}
if (dependence_op_idx != ULLONG_MAX) {
AddDownstreamOp(dependence_op_idx, op_idx);
}
@@ -571,6 +582,11 @@ PirDependencyBuilder::PirDependencyBuilder() : instructions_() {
op_happens_before_ = std::make_shared<std::vector<std::vector<bool>>>();
}

const std::string& PirDependencyBuilder::GetInstructionName(
size_t op_idx) const {
return (instructions_)[op_idx]->Name();
}

void PirDependencyBuilder::AddDependencyForCommunicationOp() {
size_t dependence_op_idx = ULLONG_MAX;
for (size_t op_idx = 0; op_idx < op_num_; ++op_idx) {
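GetInstructionName is added as a virtual hook: the shared dependency-building logic in the base DependencyBuilder asks for an op's name, and the legacy and PIR builders each answer from their own instruction container; the sequential-run pass then uses the name to leave pd_op.full_int_array ops out of the chain. A compact standalone sketch of that shape (class names and the chaining logic are simplified, not the real interfaces):

#include <cstddef>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

class DependencyBuilderBase {
 public:
  virtual ~DependencyBuilderBase() = default;
  virtual const std::string& GetInstructionName(size_t op_idx) const = 0;

  // Shared logic: chain ops one after another, but leave ops such as
  // pd_op.full_int_array out of the sequential chain.
  void AddDependencyForSequentialRun(size_t op_num) {
    for (size_t op_idx = 0; op_idx < op_num; ++op_idx) {
      if (GetInstructionName(op_idx) == "pd_op.full_int_array") continue;
      std::cout << "chain -> " << GetInstructionName(op_idx) << "\n";
    }
  }
};

class PirLikeBuilder : public DependencyBuilderBase {
 public:
  explicit PirLikeBuilder(std::vector<std::string> names)
      : names_(std::move(names)) {}
  const std::string& GetInstructionName(size_t op_idx) const override {
    return names_[op_idx];
  }

 private:
  std::vector<std::string> names_;
};

int main() {
  PirLikeBuilder builder({"pd_op.matmul", "pd_op.full_int_array", "pd_op.add"});
  builder.AddDependencyForSequentialRun(3);  // prints matmul and add only
  return 0;
}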
@@ -63,6 +63,8 @@ class DependencyBuilder {
&((*instructions_)[op2].DeviceContext());
}

virtual const std::string& GetInstructionName(size_t op_idx) const;

protected:
void AddDependencyForCoalesceTensorOp();
virtual void AddDependencyForCommunicationOp();
@@ -127,6 +129,8 @@ class PirDependencyBuilder : public DependencyBuilder {
&((instructions_)[op2]->DeviceContext());
}

const std::string& GetInstructionName(size_t op_idx) const override;

private:
void AddDependencyForCommunicationOp() override;

5 changes: 3 additions & 2 deletions paddle/fluid/inference/api/analysis_predictor.cc
@@ -114,6 +114,7 @@

#include "paddle/common/flags.h"
#include "paddle/fluid/ir_adaptor/translator/translate.h"
#include "paddle/fluid/pir/transforms/general/common_subexpression_elimination_pass.h"
#include "paddle/fluid/pir/transforms/general/constant_folding_pass.h"
#include "paddle/fluid/pir/transforms/general/dead_code_elimination_pass.h"
#include "paddle/fluid/pir/transforms/general/inplace_pass.h"
@@ -906,7 +907,7 @@ bool AnalysisPredictor::PrepareExecutor() {
ctx->GetOrRegisterDialect<cinn::dialect::OperatorDialect>();
ctx->GetOrRegisterDialect<pir::shape::ShapeDialect>();
auto pass_manager = std::make_shared<::pir::PassManager>(
::pir::IrContext::Instance(), 2);
::pir::IrContext::Instance(), config_.pm_opt_level_);
if (!config_.glog_info_disabled()) {
pass_manager->EnablePrintStatistics();
}
@@ -999,7 +1000,7 @@ bool AnalysisPredictor::PrepareExecutor() {
// Apply some basic passes required by the framework
::pir::PassManager basic_pass_pm(::pir::IrContext::Instance(),
config_.pm_opt_level_);

basic_pass_pm.AddPass(::pir::CreateCommonSubexpressionEliminationPass());
auto params_sync_among_devices_pass =
::pir::CreateParamsSyncAmongDevicesPass();
params_sync_among_devices_pass->SetNotOwned(pir::Pass::kPlaceAttr,
1 change: 1 addition & 0 deletions paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -610,6 +610,7 @@ const std::vector<std::string> kPirGpuPasses{
"embedding_eltwise_layernorm_fuse_pass",
"fused_flash_attn_pass",
"multihead_matmul_fuse_pass",
"fused_weight_only_linear_pass",
"matmul_add_act_fuse_pass",
"fc_elementwise_layernorm_fuse_pass",
"matmul_scale_fuse_pass",
@@ -49,7 +49,11 @@ class BilinearInterpolateV2OpConverter : public OpConverter {

auto layer = TRT_ENGINE_ADD_LAYER(engine_, Resize, *input);
if (align_mode == 0) {
#if IS_TRT_VERSION_GE(8600)
layer->setResizeMode(nvinfer1::InterpolationMode::kLINEAR);
#else
layer->setResizeMode(nvinfer1::ResizeMode::kLINEAR);
#endif
}
#if IS_TRT_VERSION_GE(8000)
if (align_corners == true) {
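Several converters in this commit branch on IS_TRT_VERSION_GE(8600): on TensorRT 8.6 and newer they use the nvinfer1::InterpolationMode spelling (and, in the multihead converters below, MatrixMultiply instead of FullyConnected), while older releases keep nvinfer1::ResizeMode. The macro is conventionally assembled from the TensorRT version macros roughly as below; the exact definition lives in Paddle's TensorRT helper headers, so treat this sketch as an assumption for orientation:

#include <NvInfer.h>  // brings in NV_TENSORRT_MAJOR / MINOR / PATCH / BUILD

// Assumed shape of the helper: 8600 corresponds to TensorRT 8.6.0.0.
#define TRT_VERSION_NUM                                   \
  (NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 +  \
   NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD)
#define IS_TRT_VERSION_GE(version) (TRT_VERSION_NUM >= (version))

#if IS_TRT_VERSION_GE(8600)
using ResizeModeT = nvinfer1::InterpolationMode;  // spelling on 8.6+
#else
using ResizeModeT = nvinfer1::ResizeMode;  // spelling on older releases
#endif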
4 changes: 2 additions & 2 deletions paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
@@ -221,7 +221,7 @@ class Conv2dOpConverter : public OpConverter {
return layer;
},
[](nvinfer1::IConvolutionLayer* layer, nvinfer1::DimsHW& dilations) {
layer->setDilation(dilations);
layer->setDilationNd(dilations);
},
"conv2d");
}
@@ -245,7 +245,7 @@ class Deconv2dOpConverter : public OpConverter {
TensorRTEngine::Weight& weight,
TensorRTEngine::Weight& bias) -> nvinfer1::IDeconvolutionLayer* {
auto* layer = TRT_ENGINE_ADD_LAYER(engine_,
Deconvolution,
DeconvolutionNd,
*inputs,
n_output,
ksize,
@@ -83,10 +83,21 @@ class CrossMultiheadMatMulOpConverter : public OpConverter {
nvinfer1::Weights weight_q{nvinfer1::DataType::kFLOAT,
static_cast<void*>(weight_q_data),
static_cast<int32_t>(weight_q_t->numel())};
nvinfer1::ITensor* input_q_shape_tensor = Shape(input_q);
#if IS_TRT_VERSION_GE(8600)
auto* fc_q_weight_layer = TRT_ENGINE_ADD_LAYER(
engine_, Constant, nvinfer1::Dims3(1, n_q, hidden_in_q), weight_q);
auto* fc_q_layer =
TRT_ENGINE_ADD_LAYER(engine_,
MatrixMultiply,
*input_q,
nvinfer1::MatrixOperation::kNONE,
*fc_q_weight_layer->getOutput(0),
nvinfer1::MatrixOperation::kTRANSPOSE);
#else
nvinfer1::Weights bias_q{};
// add shuffle for FullyConnected layer
std::vector<nvinfer1::ITensor*> reshape_before_fc_q_shape_tensor;
nvinfer1::ITensor* input_q_shape_tensor = Shape(input_q);
for (int i = 0; i < 5; i++) {
reshape_before_fc_q_shape_tensor.push_back(Add1DConstantLayer(1));
}
@@ -109,6 +120,7 @@ class CrossMultiheadMatMulOpConverter : public OpConverter {
n_q,
weight_q,
bias_q);
#endif
fc_q_layer->setName(
("multihead_matmul_fc_q(Output: " + output_name + ")").c_str());

@@ -184,11 +196,22 @@ class CrossMultiheadMatMulOpConverter : public OpConverter {
nvinfer1::Weights weight_kv{nvinfer1::DataType::kFLOAT,
static_cast<void*>(weight_kv_data),
static_cast<int32_t>(weight_kv_t->numel())};
nvinfer1::Weights bias_kv{};

nvinfer1::ITensor* input_shape_tensor = Shape(input_kv);
#if IS_TRT_VERSION_GE(8600)
auto* fc_weight_layer = TRT_ENGINE_ADD_LAYER(
engine_, Constant, nvinfer1::Dims3(1, n, hidden_in), weight_kv);
auto* fc_layer =
TRT_ENGINE_ADD_LAYER(engine_,
MatrixMultiply,
*input_q,
nvinfer1::MatrixOperation::kNONE,
*fc_weight_layer->getOutput(0),
nvinfer1::MatrixOperation::kTRANSPOSE);
#else
nvinfer1::Weights bias_kv{};
// add shuffle for FullyConnected layer
std::vector<nvinfer1::ITensor*> reshape_before_fc_shape_tensor;
nvinfer1::ITensor* input_shape_tensor = Shape(input_kv);
for (int i = 0; i < 5; i++) {
reshape_before_fc_shape_tensor.push_back(Add1DConstantLayer(1));
}
@@ -211,6 +234,7 @@ class CrossMultiheadMatMulOpConverter : public OpConverter {
n,
weight_kv,
bias_kv);
#endif
fc_layer->setName(
("multihead_matmul_fc(Output: " + output_name + ")").c_str());

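On TensorRT 8.6 and newer the converter builds the Q and K/V projections from a Constant weight layer plus a MatrixMultiply with the weight side transposed (kTRANSPOSE), rather than the older FullyConnected layer; for a weight stored as [n, hidden_in] both compute y = x * W^T. A small plain-C++ check of that equivalence (no TensorRT needed, purely illustrative):

#include <cassert>
#include <cstddef>
#include <vector>

// Weight W is stored row-major as [n][hidden_in], the layout a FullyConnected
// layer expects; the MatrixMultiply path multiplies by W transposed instead.
std::vector<float> FullyConnected(const std::vector<float>& x,  // [hidden_in]
                                  const std::vector<float>& W,  // [n*hidden_in]
                                  size_t n, size_t hidden_in) {
  std::vector<float> y(n, 0.0f);
  for (size_t i = 0; i < n; ++i)
    for (size_t k = 0; k < hidden_in; ++k) y[i] += W[i * hidden_in + k] * x[k];
  return y;
}

std::vector<float> MatMulTransposedW(const std::vector<float>& x,
                                     const std::vector<float>& W,
                                     size_t n, size_t hidden_in) {
  // Materialize W^T as [hidden_in][n], then compute x (1 x hidden_in) * W^T.
  std::vector<float> Wt(hidden_in * n);
  for (size_t i = 0; i < n; ++i)
    for (size_t k = 0; k < hidden_in; ++k) Wt[k * n + i] = W[i * hidden_in + k];
  std::vector<float> y(n, 0.0f);
  for (size_t i = 0; i < n; ++i)
    for (size_t k = 0; k < hidden_in; ++k) y[i] += x[k] * Wt[k * n + i];
  return y;
}

int main() {
  std::vector<float> x = {1.0f, 2.0f, 3.0f};  // hidden_in = 3
  std::vector<float> W = {1, 0, 1, 0, 2, 0};  // n = 2 rows of length 3
  assert(FullyConnected(x, W, 2, 3) == MatMulTransposedW(x, W, 2, 3));
  return 0;
}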
@@ -110,6 +110,17 @@ class FlashMultiheadMatMulOpConverter : public OpConverter {
nvinfer1::Weights weight{nvinfer1::DataType::kFLOAT,
static_cast<void*>(weight_data),
static_cast<int32_t>(weight_t->numel())};
#if IS_TRT_VERSION_GE(8600)
auto* fc_weight_layer = TRT_ENGINE_ADD_LAYER(
engine_, Constant, nvinfer1::Dims3(1, n, hidden_in), weight);
auto* fc_layer =
TRT_ENGINE_ADD_LAYER(engine_,
MatrixMultiply,
*input,
nvinfer1::MatrixOperation::kNONE,
*fc_weight_layer->getOutput(0),
nvinfer1::MatrixOperation::kTRANSPOSE);
#else
nvinfer1::Weights bias{};
// add shuffle for FullyConnected layer
std::vector<nvinfer1::ITensor*> reshape_before_fc_shape_tensor;
@@ -138,6 +149,7 @@ class FlashMultiheadMatMulOpConverter : public OpConverter {
n,
weight,
bias);
#endif
fc_layer->setName(
("multihead_matmul_fc(Output: " + output_name + ")").c_str());
// add shuffle for fc layer
@@ -299,6 +311,20 @@ class FlashMultiheadMatMulOpConverter : public OpConverter {
nvinfer1::Weights weight{nvinfer1::DataType::kFLOAT,
static_cast<void*>(weight_data),
static_cast<int32_t>(weight_tensor->numel())};
#if IS_TRT_VERSION_GE(8600)
auto* qkv_fc_weight_layer =
TRT_ENGINE_ADD_LAYER(engine_,
Constant,
nvinfer1::Dims3(1, hidden_out, hidden_out),
weight);
qkv_fc_layers[i] =
TRT_ENGINE_ADD_LAYER(engine_,
MatrixMultiply,
*input,
nvinfer1::MatrixOperation::kNONE,
*qkv_fc_weight_layer->getOutput(0),
nvinfer1::MatrixOperation::kTRANSPOSE);
#else
nvinfer1::Weights bias{};
qkv_fc_layers[i] =
TRT_ENGINE_ADD_LAYER(engine_,
@@ -307,6 +333,7 @@ class FlashMultiheadMatMulOpConverter : public OpConverter {
hidden_out,
weight,
bias);
#endif
qkv_fc_layers[i]->setName(("multihead_matmul_fc_" + std::to_string(i) +
"_(Output: " + output_name + ")")
.c_str());
8 changes: 8 additions & 0 deletions paddle/fluid/inference/tensorrt/convert/grid_sampler_op.cc
@@ -48,9 +48,17 @@ class GridSamplerOpConverter : public OpConverter {
nvinfer1::InterpolationMode interpolationMode{
nvinfer1::InterpolationMode::kNEAREST};
if (mode == "nearest") {
#if IS_TRT_VERSION_GE(8600)
interpolationMode = nvinfer1::InterpolationMode::kNEAREST;
#else
interpolationMode = nvinfer1::ResizeMode::kNEAREST;
#endif
} else if (mode == "bilinear") {
#if IS_TRT_VERSION_GE(8600)
interpolationMode = nvinfer1::InterpolationMode::kLINEAR;
#else
interpolationMode = nvinfer1::ResizeMode::kLINEAR;
#endif
}

nvinfer1::SampleMode sampleMode{nvinfer1::SampleMode::kFILL};