Merge pull request #1264 from TeslaZhao/develop
Provide high-performance MKLDNN inference in python pipeline mode
bjjwwang authored May 27, 2021
2 parents d96e4b2 + 4045bde commit 8707874
Showing 7 changed files with 159 additions and 39 deletions.
8 changes: 4 additions & 4 deletions python/examples/pipeline/ocr/benchmark.sh
@@ -1,5 +1,5 @@
export FLAGS_profile_pipeline=1
alias python3="python3.6"
alias python3="python3.7"
modelname="ocr"

# HTTP
@@ -11,11 +11,11 @@ rm -rf profile_log_$modelname

echo "Starting HTTP Clients..."
# Start a client in each thread, testing the case of multiple threads.
for thread_num in 1 2 4 8 12 16
for thread_num in 1 2 4 6 8 12 16
do
for batch_size in 1
do
echo '----$modelname thread num: $thread_num batch size: $batch_size mode:http ----' >>profile_log_$modelname
echo "----$modelname thread num: $thread_num batch size: $batch_size mode:http ----" >>profile_log_$modelname
# Start one web service. If you have already started the service yourself, you can skip this step.
#python3 web_service.py >web.log 2>&1 &
#sleep 3
@@ -51,7 +51,7 @@ sleep 3

# Create yaml. If you already have the config.yaml, skip this step.
#python3 benchmark.py yaml local_predictor 1 gpu
rm -rf profile_log_$modelname
#rm -rf profile_log_$modelname

# Start a client in each thread, testing the case of multiple threads.
for thread_num in 1 2 4 6 8 12 16
28 changes: 23 additions & 5 deletions python/examples/pipeline/ocr/config.yml
@@ -6,7 +6,7 @@ http_port: 9999

#worker_num, maximum concurrency. When build_dag_each_worker=True, the framework creates worker_num processes, each of which builds a grpcServer and a DAG
##When build_dag_each_worker=False, the framework sets max_workers=worker_num for the grpc thread pool of the main thread
worker_num: 5
worker_num: 20

#build_dag_each_worker, False: the framework creates a single DAG inside the process; True: the framework creates an independent DAG in each worker process
build_dag_each_worker: false
@@ -26,7 +26,7 @@ dag:
op:
det:
#Concurrency; when is_thread_op=True this is thread-level concurrency, otherwise process-level concurrency
concurrency: 2
concurrency: 6

#When the op config has no server_endpoints, the local service config is read from local_service_conf
local_service_conf:
@@ -40,10 +40,19 @@ op:
fetch_list: ["concat_1.tmp_0"]

#Device IDs for computation. When devices is "" or omitted, inference runs on CPU; when devices is "0" or "0,1,2", inference runs on GPU using the listed cards
devices: "0"
devices: ""

#use_mkldnn
#use_mkldnn: True

#thread_num
thread_num: 2

#ir_optim
ir_optim: True
rec:
#Concurrency; when is_thread_op=True this is thread-level concurrency, otherwise process-level concurrency
concurrency: 2
concurrency: 3

#Timeout, in ms
timeout: -1
@@ -64,4 +73,13 @@ op:
fetch_list: ["ctc_greedy_decoder_0.tmp_0", "softmax_0.tmp_0"]

#Device IDs for computation. When devices is "" or omitted, inference runs on CPU; when devices is "0" or "0,1,2", inference runs on GPU using the listed cards
devices: "0"
devices: ""

#use_mkldnn
#use_mkldnn: True

#thread_num
thread_num: 2

#ir_optim
ir_optim: True
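
For quick sanity checks while tuning these knobs, the following is a minimal sketch (assuming PyYAML is installed and config.yml is the file shown above; the key names are exactly the ones in this diff) that loads the pipeline config and prints the CPU/MKLDNN-related settings per op:

import yaml

# Load the pipeline config and print the CPU/MKLDNN knobs for each op.
with open("config.yml") as f:
    conf = yaml.safe_load(f)

for name, op_conf in conf.get("op", {}).items():
    local = op_conf.get("local_service_conf", {})
    print(name,
          "concurrency:", op_conf.get("concurrency"),
          "devices:", repr(local.get("devices", "")),
          "use_mkldnn:", local.get("use_mkldnn", False),
          "thread_num:", local.get("thread_num", 1),
          "ir_optim:", local.get("ir_optim", False))
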
15 changes: 11 additions & 4 deletions python/examples/pipeline/simple_web_service/config.yml
@@ -9,10 +9,14 @@ http_port: 18082
dag:
#Op resource type. True: thread model; False: process model
is_thread_op: False

#tracer
tracer:
interval_s: 10
op:
uci:
#Concurrency; when is_thread_op=True this is thread-level concurrency, otherwise process-level concurrency
concurrency: 2
concurrency: 1

#When the op config has no server_endpoints, the local service config is read from local_service_conf
local_service_conf:
@@ -35,7 +39,10 @@ op:
#precision, inference precision; lowering the precision can speed up inference
#GPU supports: "fp32"(default), "fp16", "int8";
#CPU supports: "fp32"(default), "fp16", "bf16"(mkldnn); "int8" is not supported
precision: "FP16"
precision: "fp32"

#ir_optim switch, False by default
ir_optim: True

#ir_optim switch
ir_optim: False
#use_mkldnn switch, False by default; performance improves only when use_mkldnn and ir_optim are enabled together
use_mkldnn: True
51 changes: 39 additions & 12 deletions python/paddle_serving_app/local_predict.py
@@ -64,6 +64,10 @@ def load_model_config(self,
use_xpu=False,
precision="fp32",
use_calib=False,
use_mkldnn=False,
mkldnn_cache_capacity=0,
mkldnn_op_list=None,
mkldnn_bf16_op_list=None,
use_feed_fetch_ops=False):
"""
Load model configs and create the paddle predictor by Paddle Inference API.
@@ -73,14 +77,18 @@ def load_model_config(self,
use_gpu: calculating with gpu, False default.
gpu_id: gpu id, 0 default.
use_profile: use predictor profiles, False default.
thread_num: thread nums, default 1.
thread_num: thread nums of cpu math library, default 1.
mem_optim: memory optimization, True default.
ir_optim: enable computation graph optimization, False default.
use_trt: use nvidia TensorRT optimization, False default
use_lite: use Paddle-Lite engine, False default
use_xpu: run predict on Baidu Kunlun, False default
precision: precision mode, "fp32" default
use_calib: use TensorRT calibration, False default
use_mkldnn: use MKLDNN, False default.
mkldnn_cache_capacity: cache capacity for input shapes, 0 default.
mkldnn_op_list: op list accelerated using MKLDNN, None default.
mkldnn_bf16_op_list: op list accelerated using MKLDNN bf16, None default.
use_feed_fetch_ops: use feed/fetch ops, False default.
"""
client_config = "{}/serving_server_conf.prototxt".format(model_path)
@@ -96,13 +104,15 @@ def load_model_config(self,
config = paddle_infer.Config(model_path)

logger.info(
"LocalPredictor load_model_config params: model_path:{}, use_gpu:{},\
gpu_id:{}, use_profile:{}, thread_num:{}, mem_optim:{}, ir_optim:{},\
use_trt:{}, use_lite:{}, use_xpu: {}, precision: {}, use_calib: {},\
use_feed_fetch_ops:{}"
.format(model_path, use_gpu, gpu_id, use_profile, thread_num,
mem_optim, ir_optim, use_trt, use_lite, use_xpu, precision,
use_calib, use_feed_fetch_ops))
"LocalPredictor load_model_config params: model_path:{}, use_gpu:{}, "
"gpu_id:{}, use_profile:{}, thread_num:{}, mem_optim:{}, ir_optim:{}, "
"use_trt:{}, use_lite:{}, use_xpu:{}, precision:{}, use_calib:{}, "
"use_mkldnn:{}, mkldnn_cache_capacity:{}, mkldnn_op_list:{}, "
"mkldnn_bf16_op_list:{}, use_feed_fetch_ops:{}, ".format(
model_path, use_gpu, gpu_id, use_profile, thread_num, mem_optim,
ir_optim, use_trt, use_lite, use_xpu, precision, use_calib,
use_mkldnn, mkldnn_cache_capacity, mkldnn_op_list,
mkldnn_bf16_op_list, use_feed_fetch_ops))

self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var]
@@ -118,21 +128,35 @@ def load_model_config(self,
self.fetch_names_to_idx_[var.alias_name] = i
self.fetch_names_to_type_[var.alias_name] = var.fetch_type

# set precision of inference.
precision_type = paddle_infer.PrecisionType.Float32
if precision is not None and precision.lower() in precision_map:
precision_type = precision_map[precision.lower()]
else:
logger.warning("precision error!!! Please check precision:{}".
format(precision))
# set profile
if use_profile:
config.enable_profile()
# set memory optimization
if mem_optim:
config.enable_memory_optim()
# set ir optimization, threads of cpu math library
config.switch_ir_optim(ir_optim)
config.set_cpu_math_library_num_threads(thread_num)
# use feed & fetch ops
config.switch_use_feed_fetch_ops(use_feed_fetch_ops)
# pass optim
config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass")

# set cpu & mkldnn
config.set_cpu_math_library_num_threads(thread_num)
if use_mkldnn:
config.enable_mkldnn()
if mkldnn_cache_capacity > 0:
config.set_mkldnn_cache_capacity(mkldnn_cache_capacity)
if mkldnn_op_list is not None:
config.set_mkldnn_op(mkldnn_op_list)
# set gpu
if not use_gpu:
config.disable_gpu()
else:
@@ -145,18 +169,18 @@
min_subgraph_size=3,
use_static=False,
use_calib_mode=False)

# set lite
if use_lite:
config.enable_lite_engine(
precision_mode=precision_type,
zero_copy=True,
passes_filter=[],
ops_filter=[])

# set xpu
if use_xpu:
# 8MB L3 cache
config.enable_xpu(8 * 1024 * 1024)

# set cpu low precision
if not use_gpu and not use_lite:
if precision_type == paddle_infer.PrecisionType.Int8:
logger.warning(
@@ -165,6 +189,9 @@
#config.enable_quantizer()
if precision is not None and precision.lower() == "bf16":
config.enable_mkldnn_bfloat16()
if mkldnn_bf16_op_list is not None:
config.set_bfloat16_op(mkldnn_bf16_op_list)

self.predictor = paddle_infer.create_predictor(config)

def predict(self, feed=None, fetch=None, batch=False, log_id=0):
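
A minimal sketch of calling the extended API directly (assumptions: an exported detection model sits in ./ocr_det_model and its feed variable is named "image" -- both are placeholders, check serving_server_conf.prototxt for the real names; the fetch name comes from the OCR config above):

import numpy as np
from paddle_serving_app.local_predict import LocalPredictor

predictor = LocalPredictor()
predictor.load_model_config(
    model_path="./ocr_det_model",   # hypothetical model directory
    use_gpu=False,                  # MKLDNN is a CPU-only optimization
    thread_num=2,                   # threads of the CPU math library
    ir_optim=True,                  # graph optimization enables the MKLDNN fusion passes
    use_mkldnn=True,                # new switch introduced by this PR
    mkldnn_cache_capacity=100)      # cache MKLDNN primitives for recent input shapes
# Dummy input only to exercise the predictor; real preprocessing is done by the pipeline op.
img = np.random.rand(1, 3, 640, 640).astype("float32")
result = predictor.predict(feed={"image": img},
                           fetch=["concat_1.tmp_0"],
                           batch=True)
print(list(result.keys()))

Inside the pipeline these arguments are filled in from each op's local_service_conf, so the YAML switches above and this direct call exercise the same code path.
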
38 changes: 31 additions & 7 deletions python/pipeline/local_service_handler.py
@@ -45,7 +45,11 @@ def __init__(self,
ir_optim=False,
available_port_generator=None,
use_profile=False,
precision="fp32"):
precision="fp32",
use_mkldnn=False,
mkldnn_cache_capacity=0,
mkldnn_op_list=None,
mkldnn_bf16_op_list=None):
"""
Initialization of localservicehandler
@@ -64,6 +68,10 @@ def __init__(self,
available_port_generator: generate available ports
use_profile: use profiling, False default.
precision: inference precision, e.g. "fp32", "fp16", "int8"
use_mkldnn: use mkldnn, default False.
mkldnn_cache_capacity: cache capacity of mkldnn, 0 means no limit.
mkldnn_op_list: OP list optimized by mkldnn, None default.
mkldnn_bf16_op_list: OP list optimized by mkldnn bf16, None default.
Returns:
None
@@ -78,6 +86,10 @@ def __init__(self,
self._use_trt = False
self._use_lite = False
self._use_xpu = False
self._use_mkldnn = False
self._mkldnn_cache_capacity = 0
self._mkldnn_op_list = None
self._mkldnn_bf16_op_list = None

if device_type == -1:
# device_type is not set, determined by `devices`,
@@ -140,16 +152,24 @@ def __init__(self,
self._use_profile = use_profile
self._fetch_names = fetch_names
self._precision = precision
self._use_mkldnn = use_mkldnn
self._mkldnn_cache_capacity = mkldnn_cache_capacity
self._mkldnn_op_list = mkldnn_op_list
self._mkldnn_bf16_op_list = mkldnn_bf16_op_list

_LOGGER.info(
"Models({}) will be launched by device {}. use_gpu:{}, "
"use_trt:{}, use_lite:{}, use_xpu:{}, device_type:{}, devices:{}, "
"mem_optim:{}, ir_optim:{}, use_profile:{}, thread_num:{}, "
"client_type:{}, fetch_names:{} precision:{}".format(
"client_type:{}, fetch_names:{}, precision:{}, use_mkldnn:{}, "
"mkldnn_cache_capacity:{}, mkldnn_op_list:{}, "
"mkldnn_bf16_op_list:{}".format(
model_config, self._device_name, self._use_gpu, self._use_trt,
self._use_lite, self._use_xpu, device_type, self._devices, self.
_mem_optim, self._ir_optim, self._use_profile, self._thread_num,
self._client_type, self._fetch_names, self._precision))
self._use_lite, self._use_xpu, device_type, self._devices,
self._mem_optim, self._ir_optim, self._use_profile,
self._thread_num, self._client_type, self._fetch_names,
self._precision, self._use_mkldnn, self._mkldnn_cache_capacity,
self._mkldnn_op_list, self._mkldnn_bf16_op_list))

def get_fetch_list(self):
return self._fetch_names
@@ -189,7 +209,7 @@ def get_client(self, concurrency_idx):
from paddle_serving_app.local_predict import LocalPredictor
if self._local_predictor_client is None:
self._local_predictor_client = LocalPredictor()

# load model config and init predictor
self._local_predictor_client.load_model_config(
model_path=self._model_config,
use_gpu=self._use_gpu,
Expand All @@ -201,7 +221,11 @@ def get_client(self, concurrency_idx):
use_trt=self._use_trt,
use_lite=self._use_lite,
use_xpu=self._use_xpu,
precision=self._precision)
precision=self._precision,
use_mkldnn=self._use_mkldnn,
mkldnn_cache_capacity=self._mkldnn_cache_capacity,
mkldnn_op_list=self._mkldnn_op_list,
mkldnn_bf16_op_list=self._mkldnn_bf16_op_list)
return self._local_predictor_client

def get_client_config(self):
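
And an illustrative construction of the extended handler (assumptions: on an installed wheel the module resolves to paddle_serving_server.pipeline.local_service_handler -- in this source tree it is python/pipeline/local_service_handler.py -- and the model directory and MKLDNN op names are placeholders):

from paddle_serving_server.pipeline.local_service_handler import LocalServiceHandler

# Build a CPU handler with the new MKLDNN options; get_client() loads the model
# through LocalPredictor.load_model_config() exactly as shown in the diff above.
handler = LocalServiceHandler(
    model_config="./ocr_det_model",        # hypothetical model directory
    client_type="local_predictor",
    devices="",                            # "" -> CPU prediction
    thread_num=2,
    ir_optim=True,
    use_mkldnn=True,
    mkldnn_cache_capacity=100,
    mkldnn_op_list=["conv2d", "pool2d"],   # illustrative op names to accelerate
    mkldnn_bf16_op_list=None)
local_predictor = handler.get_client(concurrency_idx=0)
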
