Merge pull request PaddlePaddle#45 from jiweibo/lite_engine
fluid-lite subgraph support content-dnn
Shixiaowei02 authored Dec 26, 2019
2 parents 3dfb181 + 20fa772 commit 1a7715d
Showing 6 changed files with 44 additions and 2 deletions.
5 changes: 3 additions & 2 deletions paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
@@ -221,9 +221,9 @@ void LiteSubgraphPass::SetUpEngine(
 
   bool use_gpu = Get<bool>("use_gpu");
   bool enable_int8 = Get<bool>("enable_int8");
-  lite_api::TargetType target_type = use_gpu ? TARGET(kCUDA) : TARGET(kHost);
+  lite_api::TargetType target_type = use_gpu ? TARGET(kCUDA) : TARGET(kX86);
   paddle::lite_api::PrecisionType precision_type =
-      enable_int8 ? PRECISION(kInt8) : PRECISION(kFloat);
+      enable_int8 ? PRECISION(kInt8) : PRECISION(kInt64);
   std::set<std::string> param_names_set(repetitive_params.begin(),
                                         repetitive_params.end());
   const_cast<std::vector<std::string>&>(repetitive_params)
@@ -232,6 +232,7 @@ void LiteSubgraphPass::SetUpEngine(
   config.model = program->Proto()->SerializeAsString();
   config.valid_places = {
       paddle::lite::Place({target_type, precision_type}),
+      paddle::lite::Place({target_type, PRECISION(kFloat)}),
       paddle::lite::Place({TARGET(kHost), PRECISION(kFloat)}),
   };
   if (dump_model) {
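Note: valid_places is a preference-ordered list that Lite's kernel picker walks front to back, so the added kFloat entry gives every op without an int64 kernel a float fallback before landing on generic host kernels. A minimal sketch of the same fallback chain through Paddle-Lite's public API; this is an illustration only (the model directory, include path, and build setup are assumptions, and the pass itself uses the internal paddle::lite::Place rather than this public type):

#include <memory>
#include <vector>
#include "paddle_api.h"  // Paddle-Lite public API header; exact path depends on the install layout

int main() {
  using namespace paddle::lite_api;
  CxxConfig config;
  config.set_model_dir("./content_dnn_model");  // hypothetical model directory
  // Places are tried in order: x86 int64 kernels first, then x86 float
  // kernels, then generic host kernels -- the same chain the pass builds.
  config.set_valid_places({
      Place{TARGET(kX86), PRECISION(kInt64)},
      Place{TARGET(kX86), PRECISION(kFloat)},
      Place{TARGET(kHost), PRECISION(kFloat)},
  });
  auto predictor = CreatePaddlePredictor<CxxConfig>(config);
  return 0;
}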
1 change: 1 addition & 0 deletions paddle/fluid/inference/api/paddle_analysis_config.h
@@ -46,6 +46,7 @@ struct AnalysisConfig {
   enum class Precision {
     kFloat32 = 0,
     kInt8,
+    kInt64,
     kHalf,
   };

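The new kInt64 member extends the user-facing precision choices to match the lite-subgraph change above. A hedged sketch of how a config precision might translate to a Lite precision; the ToLitePrecision helper and the include paths are hypothetical, and the real translation happens inside LiteSubgraphPass::SetUpEngine:

#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "lite/api/paddle_place.h"  // assumed header for PRECISION() / PrecisionType

// Hypothetical helper, shown only to illustrate how the enum extension maps
// onto Paddle-Lite's PrecisionType; not part of this commit.
paddle::lite_api::PrecisionType ToLitePrecision(paddle::AnalysisConfig::Precision p) {
  switch (p) {
    case paddle::AnalysisConfig::Precision::kInt8:  return PRECISION(kInt8);
    case paddle::AnalysisConfig::Precision::kInt64: return PRECISION(kInt64);
    case paddle::AnalysisConfig::Precision::kHalf:  return PRECISION(kFP16);
    case paddle::AnalysisConfig::Precision::kFloat32:
    default:                                        return PRECISION(kFloat);
  }
}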
8 changes: 8 additions & 0 deletions paddle/fluid/inference/lite/engine.cc
@@ -12,7 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#ifdef PADDLE_WITH_CUDA
+#define LITE_WITH_CUDA 1
+#endif
+
 #include "paddle/fluid/inference/lite/engine.h"
+#include "lite/core/context.h"
+#include "lite/core/device_info.h"
 
 namespace paddle {
 namespace inference {
@@ -34,7 +40,9 @@ paddle::lite::Predictor* EngineManager::Get(const std::string& name) const {
 paddle::lite::Predictor* EngineManager::Create(const std::string& name,
                                                const EngineConfig& cfg) {
   auto* p = new paddle::lite::Predictor();
+#ifdef PADDLE_WITH_CUDA
   paddle::lite::Env<TARGET(kCUDA)>::Init();
+#endif
   p->Build("", cfg.model, cfg.param, cfg.valid_places, cfg.neglected_passes,
            cfg.model_type, cfg.model_from_memory);
   engines_[name].reset(p);
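Guarding Env<TARGET(kCUDA)>::Init() behind PADDLE_WITH_CUDA keeps CPU-only builds of the engine manager compilable. A hedged usage sketch of the manager; the engine key, config values, and singleton access pattern are assumptions based on how the TensorRT engine manager is used elsewhere in fluid:

#include <string>
#include "paddle/fluid/inference/lite/engine.h"
#include "paddle/fluid/inference/utils/singleton.h"

void BuildEngine(const std::string& serialized_program) {
  paddle::inference::lite::EngineConfig cfg;
  cfg.model = serialized_program;  // a serialized ProgramDesc, as the pass produces
  cfg.valid_places = {
      paddle::lite::Place({TARGET(kX86), PRECISION(kInt64)}),
      paddle::lite::Place({TARGET(kX86), PRECISION(kFloat)}),
      paddle::lite::Place({TARGET(kHost), PRECISION(kFloat)}),
  };
  // Create() now guards Env<kCUDA>::Init() with PADDLE_WITH_CUDA internally.
  paddle::inference::Singleton<paddle::inference::lite::EngineManager>::Global()
      .Create("content_dnn_engine", cfg);  // hypothetical engine key
}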
16 changes: 16 additions & 0 deletions paddle/fluid/inference/lite/op_teller.cc
@@ -123,6 +123,22 @@ struct SimpleOpTeller : public Teller {
     ops_.insert("fusion_elementwise_max_activation");
     ops_.insert("fusion_elementwise_div_activation");
     ops_.insert("pad2d");
+    ops_.insert("sequence_reverse");
+    ops_.insert("lookup_table");
+    ops_.insert("search_seq_arithmetic");
+    ops_.insert("search_grnn");
+    ops_.insert("sequence_pool");
+    ops_.insert("search_group_padding");
+    ops_.insert("search_seq_fc");
+    ops_.insert("search_aligned_mat_mul");
+    ops_.insert("search_attention_padding_mask");
+    ops_.insert("search_seq_softmax");
+    ops_.insert("search_seq_depadding");
+    ops_.insert("match_matrix_tensor");
+    ops_.insert("var_conv_2d");
+    ops_.insert("sequence_concat");
+    ops_.insert("sequence_topk_avg_pooling");
+    ops_.insert("search_fc");
   }
 
   bool operator()(const std::string& op_type,
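The sixteen new entries are the sequence/search ops that content-dnn models use; listing them here is what lets the subgraph pass offload those ops to Lite. The teller itself is plain set membership over op type names. A stand-alone, runnable sketch of the pattern, with a hypothetical MiniOpTeller standing in for SimpleOpTeller:

#include <iostream>
#include <set>
#include <string>

// An op is handed to the Lite subgraph only if its type is in the supported set.
struct MiniOpTeller {
  MiniOpTeller() {
    ops_.insert("lookup_table");
    ops_.insert("sequence_pool");
    ops_.insert("search_fc");
  }
  bool operator()(const std::string& op_type) const {
    return ops_.count(op_type) > 0;
  }
  std::set<std::string> ops_;
};

int main() {
  MiniOpTeller teller;
  std::cout << teller("lookup_table") << " " << teller("conv2d") << "\n";  // prints: 1 0
  return 0;
}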
6 changes: 6 additions & 0 deletions paddle/fluid/inference/lite/tensor_utils.cc
@@ -29,6 +29,7 @@ using paddle::lite_api::DataLayoutType;
 template <typename DstLoD, typename SrcLoD>
 void SetLoD(DstLoD* dst, const SrcLoD& src) {
   dst->reserve(src.size());
+  dst->clear();
   for (auto&& v : src) {
     dst->emplace_back(v);
   }
@@ -41,6 +42,7 @@ template void SetLoD<framework::LoD, paddle::lite::LoD>(
 platform::Place GetNativePlace(const TargetType& type, int id = 0) {
   switch (type) {
     case TargetType::kHost:
+    case TargetType::kX86:
       return platform::CPUPlace();
     case TargetType::kCUDA:
       return platform::CUDAPlace(id);
@@ -65,6 +67,8 @@ PrecisionType GetLitePrecisionType(framework::proto::VarType::Type type) {
       return PrecisionType::kInt8;
     case framework::proto::VarType_Type_INT32:
       return PrecisionType::kInt32;
+    case framework::proto::VarType_Type_INT64:
+      return PrecisionType::kInt64;
     default:
       LOG(FATAL) << "Error precision type.";
       return PrecisionType::kUnk;
@@ -80,6 +84,8 @@ framework::proto::VarType::Type GetNativePrecisionType(
       return framework::proto::VarType_Type_INT8;
     case PrecisionType::kInt32:
       return framework::proto::VarType_Type_INT32;
+    case PrecisionType::kInt64:
+      return framework::proto::VarType_Type_INT64;
     default:
       LOG(FATAL) << "Error precision type.";
       return static_cast<framework::proto::VarType::Type>(-1);
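Note that the precision map must be extended in both directions (fluid -> Lite and Lite -> fluid), or an int64 tensor copy falls through to the LOG(FATAL) default branch on one side. A stand-alone, runnable sketch of that round-trip property; the enum and function names are hypothetical stand-ins for the real types above:

#include <cassert>

// Every enum pair must appear in both direction maps, otherwise a
// round-trip falls through to the fatal default branch.
enum class NativeType { kFloat32, kInt8, kInt32, kInt64 };
enum class LitePrec { kFloat, kInt8, kInt32, kInt64 };

LitePrec ToLite(NativeType t) {
  switch (t) {
    case NativeType::kInt8:  return LitePrec::kInt8;
    case NativeType::kInt32: return LitePrec::kInt32;
    case NativeType::kInt64: return LitePrec::kInt64;  // the pairing this commit adds
    default:                 return LitePrec::kFloat;
  }
}

NativeType ToNative(LitePrec t) {
  switch (t) {
    case LitePrec::kInt8:  return NativeType::kInt8;
    case LitePrec::kInt32: return NativeType::kInt32;
    case LitePrec::kInt64: return NativeType::kInt64;
    default:               return NativeType::kFloat32;
  }
}

int main() {
  assert(ToNative(ToLite(NativeType::kInt64)) == NativeType::kInt64);
  return 0;
}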
10 changes: 10 additions & 0 deletions paddle/fluid/operators/lite/lite_engine_op.h
@@ -77,6 +77,7 @@ class LiteEngineOp : public framework::OperatorBase {
           inference::analysis::GetFromScope<framework::LoDTensor>(scope,
                                                                   in_names_[i]);
       paddle::lite::Tensor *dst_t = engine_->GetInput(i);
+      VLOG(3) << "fluid -> lite: " << in_names_[i];
       inference::lite::utils::TensorCopyAsync(dst_t, src_t, *ctx);
     }
 #ifdef PADDLE_WITH_CUDA
@@ -85,14 +86,23 @@ class LiteEngineOp : public framework::OperatorBase {
           static_cast<const platform::CUDADeviceContext *>(ctx)->stream());
     }
 #endif
+    VLOG(3) << "lite engine run";
     engine_->Run();
+    VLOG(3) << "lite engine run done";
     for (size_t i = 0; i < out_names_.size(); i++) {
       const paddle::lite::Tensor &src_t = *(engine_->GetOutput(i));
       framework::LoDTensor *dst_t =
           &inference::analysis::GetFromScope<framework::LoDTensor>(
               scope, out_names_[i]);
+      VLOG(3) << "lite -> fluid: " << out_names_[i];
       inference::lite::utils::TensorCopyAsync(dst_t, src_t, *ctx);
     }
+#ifdef PADDLE_WITH_CUDA
+    if (platform::is_gpu_place(dev_place)) {
+      platform::GpuStreamSync(
+          static_cast<const platform::CUDADeviceContext *>(ctx)->stream());
+    }
+#endif
   }
 };

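The added GpuStreamSync after the output loop is the key correctness fix: TensorCopyAsync only enqueues the copies on the stream, so without a sync the downstream fluid op could read output buffers before the device-to-host transfers finish. A generic CUDA-runtime sketch of the same rule, using plain cudaMemcpyAsync rather than Paddle's wrappers:

#include <cuda_runtime.h>

// An async device-to-host copy is not guaranteed to have completed until the
// stream is synchronized, so the host must not read host_buf before the sync.
void CopyOutput(const float* dev_buf, float* host_buf, size_t n,
                cudaStream_t stream) {
  cudaMemcpyAsync(host_buf, dev_buf, n * sizeof(float),
                  cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);  // mirrors platform::GpuStreamSync(stream)
  // host_buf is now safe to read.
}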
