
C++ Serving update #1289

Merged · 19 commits · Jul 5, 2021
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -30,7 +30,7 @@ find_package(Threads REQUIRED)
find_package(CUDA QUIET)

include(simd)

# SET(CMAKE_BUILD_TYPE "Debug")
# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
26 changes: 23 additions & 3 deletions README.md
@@ -175,9 +175,12 @@ python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --p

| Argument | Type | Default | Description |
| ---------------------------------------------- | ---- | ------- | ----------------------------------------------------- |
| `thread` | int | `4` | Concurrency of current service |
| `thread`                                        | int  | `2`     | Number of brpc service threads                         |
| `op_num`                                        | int[]| `0`     | Number of threads for each model in asynchronous mode  |
| `op_max_batch`                                  | int[]| `0`     | Batch size for each model in asynchronous mode         |
| `gpu_ids`                                       | str[]| `"-1"`  | GPU card ids for each model                            |
| `port` | int | `9292` | Exposed port of current service to users |
| `model` | str | `""` | Path of paddle model directory to be served |
| `model` | str[]| `""` | Path of paddle model directory to be served |
| `mem_optim_off` | - | - | Disable memory / graphic memory optimization |
| `ir_optim` | bool | False | Enable analysis and optimization of calculation graph |
| `use_mkl` (Only for cpu version) | - | - | Run inference with MKL |
@@ -186,7 +189,24 @@ python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --p
| `use_xpu` | - | - | Run PaddleLite inference with Baidu Kunlun XPU |
| `precision` | str | FP32 | Precision Mode, support FP32, FP16, INT8 |
| `use_calib` | bool | False | Only for deployment with TensorRT |

| `gpu_multi_stream`                              | bool | False   | Enable GPU multi-stream mode to get higher QPS         |

#### Description of asynchronous mode
Asynchronous mode is suitable when (1) the number of requests is very large, or (2) multiple models are chained and you want to specify the concurrency of each model individually.
Asynchronous mode helps to improve the throughput (QPS) of the service, but the latency of a single request increases slightly.
In asynchronous mode, each model starts the number of threads you specify, and each thread holds one model instance. In other words, each model is equivalent to a thread pool with N threads, and tasks are taken from the pool's task queue for execution.
In asynchronous mode, each RPC server thread is only responsible for putting requests into the task queue of the model's thread pool; once a task has been executed, it is removed from the queue.
In the table above, the number of RPC server threads is specified by `--thread`; the default value is 2.
`--op_num` specifies the number of threads in each model's thread pool. The default value is 0, which means asynchronous mode is not used.
`--op_max_batch` specifies the batch size for each model. The default value is 32, and it only takes effect when `--op_num` is not 0.
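The thread-pool behaviour described above can be illustrated with a small, self-contained Python sketch. This is only an analogy for the C++ brpc server; `fake_infer`, the class name, and all sizes are made up for illustration.

```python
import queue
import threading

def fake_infer(batch):
    # Stand-in for one model instance running inference on a batch.
    return [x * 2 for x in batch]

class ModelThreadPool:
    """One pool per model: op_num threads, each owning one model instance."""

    def __init__(self, op_num):
        self.tasks = queue.Queue()
        for _ in range(op_num):
            threading.Thread(target=self._worker, daemon=True).start()

    def _worker(self):
        while True:
            batch, done = self.tasks.get()      # take a task from the queue
            done["result"] = fake_infer(batch)  # run it on this thread's instance
            done["event"].set()                 # mark the task as finished
            self.tasks.task_done()

    def submit(self, batch):
        # An RPC server thread only enqueues the request and returns.
        done = {"event": threading.Event()}
        self.tasks.put((batch, done))
        return done

pool = ModelThreadPool(op_num=4)                 # roughly what --op_num 4 means
pending = [pool.submit([i, i + 1]) for i in range(8)]
for task in pending:
    task["event"].wait()                         # wait for the worker to finish
print([task["result"] for task in pending])
```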
#### When you want one model to use multiple GPU cards
python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 --gpu_ids 0,1,2
#### When you want to serve two models
python3 -m paddle_serving_server.serve --model uci_housing_model_1 uci_housing_model_2 --thread 10 --port 9292
#### When you want to serve two models, each using multiple GPU cards
python3 -m paddle_serving_server.serve --model uci_housing_model_1 uci_housing_model_2 --thread 10 --port 9292 --gpu_ids 0,1 1,2
#### When a service contains two models, each model uses multiple GPU cards, and asynchronous mode is needed with a different concurrency per model
python3 -m paddle_serving_server.serve --model uci_housing_model_1 uci_housing_model_2 --thread 10 --port 9292 --gpu_ids 0,1 1,2 --op_num 4 8
</center>

```python
# … (rest of this code example is collapsed in the diff view)
```
47 changes: 34 additions & 13 deletions README_CN.md
@@ -172,19 +172,40 @@ python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --p
```
<center>

| Argument | Type | Default | Description |
| ---------------------------------------------- | ---- | ------- | ------------------------------------------------------ |
| `thread` | int | `4` | Concurrency of current service |
| `port` | int | `9292` | Exposed port of current service to users |
| `name` | str | `""` | Service name, can be used to generate HTTP request url |
| `model` | str | `""` | Path of paddle model directory to be served |
| `mem_optim_off` | - | - | Disable memory optimization |
| `ir_optim` | bool | False | Enable analysis and optimization of calculation graph |
| `use_mkl` (Only for cpu version) | - | - | Run inference with MKL |
| `use_trt` (Only for Cuda>=10.1 version) | - | - | Run inference with TensorRT |
| `use_lite` (Only for Intel x86 CPU or ARM CPU) | - | - | Run PaddleLite inference |
| `use_xpu` | - | - | Run PaddleLite inference with Baidu Kunlun XPU |
| `precision` | str | FP32 | Precision Mode, support FP32, FP16, INT8 |
| Argument | Type | Default | Description |
| ---------------------------------------------- | ---- | ------- | ----------------------------------------------------- |
| `thread`                                        | int  | `2`     | Number of brpc service threads                         |
| `op_num`                                        | int[]| `0`     | Number of threads for each model in asynchronous mode  |
| `op_max_batch`                                  | int[]| `32`    | Batch size for each model in asynchronous mode         |
| `gpu_ids`                                       | str[]| `"-1"`  | GPU card ids for each model                            |
| `port` | int | `9292` | Exposed port of current service to users |
| `model` | str[]| `""` | Path of paddle model directory to be served |
| `mem_optim_off` | - | - | Disable memory / graphic memory optimization |
| `ir_optim` | bool | False | Enable analysis and optimization of calculation graph |
| `use_mkl` (Only for cpu version) | - | - | Run inference with MKL |
| `use_trt` (Only for trt version) | - | - | Run inference with TensorRT |
| `use_lite` (Only for Intel x86 CPU or ARM CPU) | - | - | Run PaddleLite inference |
| `use_xpu` | - | - | Run PaddleLite inference with Baidu Kunlun XPU |
| `precision` | str | FP32 | Precision Mode, support FP32, FP16, INT8 |
| `use_calib` | bool | False | Only for deployment with TensorRT |
| `gpu_multi_stream`                              | bool | False   | Enable GPU multi-stream mode to get higher QPS         |

#### Description of asynchronous mode
Asynchronous mode is suitable when (1) the number of requests is very large, or (2) multiple models are chained and you want to specify the concurrency of each model individually.
Asynchronous mode helps to improve the throughput (QPS) of the service, but the latency of a single request increases slightly.
In asynchronous mode, each model starts the number of threads you specify, and each thread holds one model instance; in other words, each model is equivalent to a thread pool with N threads, and tasks are taken from the pool's task queue for execution.
In asynchronous mode, each RPC server thread is only responsible for putting requests into the task queue of the model's thread pool; once a task has been executed, it is removed from the queue.
In the command above, `--thread 10` specifies the number of RPC server threads (default 2), and `--op_num` specifies the number of threads N in each model's thread pool (default 0, meaning asynchronous mode is not used).
`--op_max_batch` specifies the batch size for each model (default 32); it only takes effect when `--op_num` is not 0.

#### When one of your models should be deployed on multiple GPU cards
python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 --gpu_ids 0,1,2
#### When one service deploys two models
python3 -m paddle_serving_server.serve --model uci_housing_model_1 uci_housing_model_2 --thread 10 --port 9292
#### When one service contains two models and each model needs multiple GPU cards
python3 -m paddle_serving_server.serve --model uci_housing_model_1 uci_housing_model_2 --thread 10 --port 9292 --gpu_ids 0,1 1,2
#### When one service contains two models, each model needs multiple GPU cards, and asynchronous mode with a different concurrency per model is required (see the sketch below)
python3 -m paddle_serving_server.serve --model uci_housing_model_1 uci_housing_model_2 --thread 10 --port 9292 --gpu_ids 0,1 1,2 --op_num 4 8
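To make the per-model argument layout above concrete, here is a minimal, illustrative parser showing how space-separated per-model values and comma-separated GPU ids line up. It is a hypothetical sketch, not the actual `paddle_serving_server.serve` argument parser.

```python
import argparse

# Hypothetical parser, for illustration only: one value per model for
# --model / --gpu_ids / --op_num, comma-separated GPU ids within each group.
parser = argparse.ArgumentParser()
parser.add_argument("--model", nargs="+", default=[])
parser.add_argument("--gpu_ids", nargs="+", default=["-1"])
parser.add_argument("--op_num", nargs="+", type=int, default=[0])
parser.add_argument("--thread", type=int, default=2)
parser.add_argument("--port", type=int, default=9292)

args = parser.parse_args(
    "--model uci_housing_model_1 uci_housing_model_2 "
    "--thread 10 --port 9292 --gpu_ids 0,1 1,2 --op_num 4 8".split()
)

# Pair each model with its GPU group and its thread-pool size.
for model, gpus, op_num in zip(args.model, args.gpu_ids, args.op_num):
    gpu_list = [int(g) for g in gpus.split(",")]
    print(f"{model}: gpus={gpu_list}, op_num={op_num}")
# uci_housing_model_1: gpus=[0, 1], op_num=4
# uci_housing_model_2: gpus=[1, 2], op_num=8
```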

</center>

32 changes: 17 additions & 15 deletions core/configure/proto/server_configure.proto
@@ -21,11 +21,12 @@ message EngineDesc {
required string reloadable_meta = 3;
required string reloadable_type = 4;
required string model_dir = 5;
required int32 runtime_thread_num = 6;
required int32 batch_infer_size = 7;
required int32 enable_batch_align = 8;
optional string version_file = 9;
optional string version_type = 10;
repeated int32 gpu_ids = 6;
required int32 runtime_thread_num = 7;
required int32 batch_infer_size = 8;
required int32 enable_batch_align = 9;
optional string version_file = 10;
optional string version_type = 11;

/*
* Sparse Parameter Service type. Valid types are:
@@ -38,16 +39,17 @@ message EngineDesc {
LOCAL = 1;
REMOTE = 2;
}
optional SparseParamServiceType sparse_param_service_type = 11;
optional string sparse_param_service_table_name = 12;
optional bool enable_memory_optimization = 13;
optional bool enable_ir_optimization = 14;
optional bool use_trt = 15;
optional bool use_lite = 16;
optional bool use_xpu = 17;
optional bool use_gpu = 18;
optional bool combined_model = 19;
optional bool encrypted_model = 20;
optional SparseParamServiceType sparse_param_service_type = 12;
optional string sparse_param_service_table_name = 13;
optional bool enable_memory_optimization = 14;
optional bool enable_ir_optimization = 15;
optional bool use_trt = 16;
optional bool use_lite = 17;
optional bool use_xpu = 18;
optional bool use_gpu = 19;
optional bool combined_model = 20;
optional bool encrypted_model = 21;
optional bool gpu_multi_stream = 22;
};

// model_toolkit conf
4 changes: 4 additions & 0 deletions core/general-client/src/general_model.cpp
100644 → 100755
@@ -166,6 +166,8 @@ int PredictorClient::numpy_predict(
batch_size = batch_size > string_feed_batch.size() ? batch_size
: string_feed_batch.size();
VLOG(2) << "batch size: " << batch_size;
// batch_size must be 1, because the batch is already carried inside the Tensor.
// Consider removing the outer vector<>.
predict_res_batch.clear();
Timer timeline;
int64_t preprocess_start = timeline.TimeStampUS();
@@ -188,6 +190,8 @@ }
}

int vec_idx = 0;
// batch_size can only be 1, because the batch is already carried inside the Tensor.
// If batch_size is not 1, an error will occur on the C++ side.
for (int bi = 0; bi < batch_size; bi++) {
VLOG(2) << "prepare batch " << bi;
std::vector<Tensor *> tensor_vec;
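The two comments above constrain the Python client: the batch dimension has to travel inside each numpy array rather than as repeated feed instances. Below is a minimal sketch of what that looks like on the client side; the config path, the feed/fetch names, and the `batch=True` keyword are assumptions based on typical `paddle_serving_client` usage, not something verified by this PR.

```python
import numpy as np
from paddle_serving_client import Client

client = Client()
client.load_client_config("uci_housing_client/serving_client_conf.prototxt")  # assumed path
client.connect(["127.0.0.1:9292"])

# A batch of 4 samples is carried inside ONE tensor of shape (4, 13, 1),
# so the outer feed structure holds a single "inst" -- which is why the
# C++ side only reads insts(0) and expects batch_size == 1.
x = np.random.rand(4, 13, 1).astype("float32")
fetch_map = client.predict(feed={"x": x}, fetch=["price"], batch=True)
print(fetch_map)
```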
8 changes: 7 additions & 1 deletion core/general-server/op/general_reader_op.cpp
@@ -93,6 +93,9 @@ int GeneralReaderOp::inference() {
res->SetLogId(log_id);
Timer timeline;
int64_t start = timeline.TimeStampUS();
// Only insts(0) is read, because the batch is already carried inside the Tensor.
// req can therefore contain only 1 inst.
// var_num is the number of feed variables.
int var_num = req->insts(0).tensor_array_size();

VLOG(2) << "(logid=" << log_id << ") var num: " << var_num
@@ -178,7 +181,10 @@ VLOG(2) << "(logid=" << log_id << ") tensor size for var[" << i
VLOG(2) << "(logid=" << log_id << ") tensor size for var[" << i
<< "]: " << data_len;
databuf_size = data_len * elem_size;
out->at(i).data.Resize(databuf_size);
void *databuf_char = MempoolWrapper::instance().malloc(databuf_size);
paddle::PaddleBuf paddleBuf(databuf_char, databuf_size);
out->at(i).data = paddleBuf;
// out->at(i).data.Resize(databuf_size);
if (out->at(i).lod.size() > 0) {
VLOG(2) << "(logid=" << log_id << ") var[" << i
<< "] has lod_tensor and len=" << out->at(i).lod[0].back();