Merge pull request #1264 from TeslaZhao/develop
Provide high-performance MKLDNN inference in python pipeline mode
bjjwwang authored May 27, 2021
2 parents d96e4b2 + 4045bde commit 8707874
Showing 7 changed files with 159 additions and 39 deletions.
8 changes: 4 additions & 4 deletions python/examples/pipeline/ocr/benchmark.sh
@@ -1,5 +1,5 @@
export FLAGS_profile_pipeline=1
alias python3="python3.6"
alias python3="python3.7"
modelname="ocr"

# HTTP
@@ -11,11 +11,11 @@ rm -rf profile_log_$modelname

echo "Starting HTTP Clients..."
# Start a client in each thread, testing the case of multiple threads.
for thread_num in 1 2 4 8 12 16
for thread_num in 1 2 4 6 8 12 16
do
for batch_size in 1
do
echo '----$modelname thread num: $thread_num batch size: $batch_size mode:http ----' >>profile_log_$modelname
echo "----$modelname thread num: $thread_num batch size: $batch_size mode:http ----" >>profile_log_$modelname
# Start one web service. If you have already started the service yourself, you can skip this step.
#python3 web_service.py >web.log 2>&1 &
#sleep 3
@@ -51,7 +51,7 @@ sleep 3

# Create yaml. If you already have the config.yaml, skip this step.
#python3 benchmark.py yaml local_predictor 1 gpu
rm -rf profile_log_$modelname
#rm -rf profile_log_$modelname

# Start a client in each thread, testing the case of multiple threads.
for thread_num in 1 2 4 6 8 12 16
28 changes: 23 additions & 5 deletions python/examples/pipeline/ocr/config.yml
@@ -6,7 +6,7 @@ http_port: 9999

#worker_num, maximum concurrency. When build_dag_each_worker=True, the framework creates worker_num processes, each of which builds a grpcServer and a DAG
##When build_dag_each_worker=False, the framework sets max_workers=worker_num for the grpc thread pool of the main thread
worker_num: 5
worker_num: 20

#build_dag_each_worker, False: the framework creates a single DAG inside the process; True: the framework creates an independent DAG in each worker process
build_dag_each_worker: false
@@ -26,7 +26,7 @@ dag:
op:
det:
#Concurrency; when is_thread_op=True this is thread-level concurrency, otherwise process-level concurrency
concurrency: 2
concurrency: 6

#When the op config has no server_endpoints, the local service config is read from local_service_conf
local_service_conf:
@@ -40,10 +40,19 @@ op:
fetch_list: ["concat_1.tmp_0"]

#Device IDs for computation. When devices is "" or omitted, inference runs on CPU; when devices is "0" or "0,1,2", inference runs on GPU using the listed cards
devices: "0"
devices: ""

#use_mkldnn
#use_mkldnn: True

#thread_num
thread_num: 2

#ir_optim
ir_optim: True
rec:
#Concurrency; when is_thread_op=True this is thread-level concurrency, otherwise process-level concurrency
concurrency: 2
concurrency: 3

#Timeout, in ms
timeout: -1
@@ -64,4 +73,13 @@ op:
fetch_list: ["ctc_greedy_decoder_0.tmp_0", "softmax_0.tmp_0"]

#Device IDs for computation. When devices is "" or omitted, inference runs on CPU; when devices is "0" or "0,1,2", inference runs on GPU using the listed cards
devices: "0"
devices: ""

#use_mkldnn
#use_mkldnn: True

#thread_num
thread_num: 2

#ir_optim
ir_optim: True
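
For quick sanity checks while tuning these knobs, the following is a minimal sketch (assuming PyYAML is installed and config.yml is the file shown above; the key names are exactly the ones in this diff) that loads the pipeline config and prints the CPU/MKLDNN-related settings per op:

import yaml

# Load the pipeline config and print the CPU/MKLDNN knobs for each op.
with open("config.yml") as f:
    conf = yaml.safe_load(f)

for name, op_conf in conf.get("op", {}).items():
    local = op_conf.get("local_service_conf", {})
    print(name,
          "concurrency:", op_conf.get("concurrency"),
          "devices:", repr(local.get("devices", "")),
          "use_mkldnn:", local.get("use_mkldnn", False),
          "thread_num:", local.get("thread_num", 1),
          "ir_optim:", local.get("ir_optim", False))
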
15 changes: 11 additions & 4 deletions python/examples/pipeline/simple_web_service/config.yml
@@ -9,10 +9,14 @@ http_port: 18082
dag:
#Op resource type. True: thread model; False: process model
is_thread_op: False

#tracer
tracer:
interval_s: 10
op:
uci:
#Concurrency; when is_thread_op=True this is thread-level concurrency, otherwise process-level concurrency
concurrency: 2
concurrency: 1

#When the op config has no server_endpoints, the local service config is read from local_service_conf
local_service_conf:
@@ -35,7 +39,10 @@ op:
#precision, inference precision; lowering the precision can speed up inference
#GPU supports: "fp32"(default), "fp16", "int8";
#CPU supports: "fp32"(default), "fp16", "bf16"(mkldnn); "int8" is not supported
precision: "FP16"
precision: "fp32"

#ir_optim switch, False by default
ir_optim: True

#ir_optim switch
ir_optim: False
#use_mkldnn switch, False by default; performance improves only when use_mkldnn and ir_optim are enabled together
use_mkldnn: True
51 changes: 39 additions & 12 deletions python/paddle_serving_app/local_predict.py
@@ -64,6 +64,10 @@ def load_model_config(self,
use_xpu=False,
precision="fp32",
use_calib=False,
use_mkldnn=False,
mkldnn_cache_capacity=0,
mkldnn_op_list=None,
mkldnn_bf16_op_list=None,
use_feed_fetch_ops=False):
"""
Load model configs and create the paddle predictor by Paddle Inference API.
@@ -73,14 +77,18 @@ def load_model_config(self,
use_gpu: calculating with gpu, False default.
gpu_id: gpu id, 0 default.
use_profile: use predictor profiles, False default.
thread_num: thread nums, default 1.
thread_num: thread nums of cpu math library, default 1.
mem_optim: memory optimization, True default.
ir_optim: enable computation graph optimization, False default.
use_trt: use nvidia TensorRT optimization, False default
use_lite: use Paddle-Lite engine, False default
use_xpu: run predict on Baidu Kunlun, False default
precision: precision mode, "fp32" default
use_calib: use TensorRT calibration, False default
use_mkldnn: use MKLDNN, False default.
mkldnn_cache_capacity: cache capacity for input shapes, 0 default.
mkldnn_op_list: op list accelerated using MKLDNN, None default.
mkldnn_bf16_op_list: op list accelerated using MKLDNN bf16, None default.
use_feed_fetch_ops: use feed/fetch ops, False default.
"""
client_config = "{}/serving_server_conf.prototxt".format(model_path)
@@ -96,13 +104,15 @@ def load_model_config(self,
config = paddle_infer.Config(model_path)

logger.info(
"LocalPredictor load_model_config params: model_path:{}, use_gpu:{},\
gpu_id:{}, use_profile:{}, thread_num:{}, mem_optim:{}, ir_optim:{},\
use_trt:{}, use_lite:{}, use_xpu: {}, precision: {}, use_calib: {},\
use_feed_fetch_ops:{}"
.format(model_path, use_gpu, gpu_id, use_profile, thread_num,
mem_optim, ir_optim, use_trt, use_lite, use_xpu, precision,
use_calib, use_feed_fetch_ops))
"LocalPredictor load_model_config params: model_path:{}, use_gpu:{}, "
"gpu_id:{}, use_profile:{}, thread_num:{}, mem_optim:{}, ir_optim:{}, "
"use_trt:{}, use_lite:{}, use_xpu:{}, precision:{}, use_calib:{}, "
"use_mkldnn:{}, mkldnn_cache_capacity:{}, mkldnn_op_list:{}, "
"mkldnn_bf16_op_list:{}, use_feed_fetch_ops:{}, ".format(
model_path, use_gpu, gpu_id, use_profile, thread_num, mem_optim,
ir_optim, use_trt, use_lite, use_xpu, precision, use_calib,
use_mkldnn, mkldnn_cache_capacity, mkldnn_op_list,
mkldnn_bf16_op_list, use_feed_fetch_ops))

self.feed_names_ = [var.alias_name for var in model_conf.feed_var]
self.fetch_names_ = [var.alias_name for var in model_conf.fetch_var]
@@ -118,21 +128,35 @@ def load_model_config(self,
self.fetch_names_to_idx_[var.alias_name] = i
self.fetch_names_to_type_[var.alias_name] = var.fetch_type

# set precision of inference.
precision_type = paddle_infer.PrecisionType.Float32
if precision is not None and precision.lower() in precision_map:
precision_type = precision_map[precision.lower()]
else:
logger.warning("precision error!!! Please check precision:{}".
format(precision))
# set profile
if use_profile:
config.enable_profile()
# set memory optimization
if mem_optim:
config.enable_memory_optim()
# set ir optimization, threads of cpu math library
config.switch_ir_optim(ir_optim)
config.set_cpu_math_library_num_threads(thread_num)
# use feed & fetch ops
config.switch_use_feed_fetch_ops(use_feed_fetch_ops)
# pass optim
config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass")

# set cpu & mkldnn
config.set_cpu_math_library_num_threads(thread_num)
if use_mkldnn:
config.enable_mkldnn()
if mkldnn_cache_capacity > 0:
config.set_mkldnn_cache_capacity(mkldnn_cache_capacity)
if mkldnn_op_list is not None:
config.set_mkldnn_op(mkldnn_op_list)
# set gpu
if not use_gpu:
config.disable_gpu()
else:
@@ -145,18 +169,18 @@
min_subgraph_size=3,
use_static=False,
use_calib_mode=False)

# set lite
if use_lite:
config.enable_lite_engine(
precision_mode=precision_type,
zero_copy=True,
passes_filter=[],
ops_filter=[])

# set xpu
if use_xpu:
# 8MB L3 cache
config.enable_xpu(8 * 1024 * 1024)

# set cpu low precision
if not use_gpu and not use_lite:
if precision_type == paddle_infer.PrecisionType.Int8:
logger.warning(
@@ -165,6 +189,9 @@
#config.enable_quantizer()
if precision is not None and precision.lower() == "bf16":
config.enable_mkldnn_bfloat16()
if mkldnn_bf16_op_list is not None:
config.set_bfloat16_op(mkldnn_bf16_op_list)

self.predictor = paddle_infer.create_predictor(config)

def predict(self, feed=None, fetch=None, batch=False, log_id=0):
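
A minimal sketch of calling the extended API directly (assumptions: an exported detection model sits in ./ocr_det_model and its feed variable is named "image" -- both are placeholders, check serving_server_conf.prototxt for the real names; the fetch name comes from the OCR config above):

import numpy as np
from paddle_serving_app.local_predict import LocalPredictor

predictor = LocalPredictor()
predictor.load_model_config(
    model_path="./ocr_det_model",   # hypothetical model directory
    use_gpu=False,                  # MKLDNN is a CPU-only optimization
    thread_num=2,                   # threads of the CPU math library
    ir_optim=True,                  # graph optimization enables the MKLDNN fusion passes
    use_mkldnn=True,                # new switch introduced by this PR
    mkldnn_cache_capacity=100)      # cache MKLDNN primitives for recent input shapes
# Dummy input only to exercise the predictor; real preprocessing is done by the pipeline op.
img = np.random.rand(1, 3, 640, 640).astype("float32")
result = predictor.predict(feed={"image": img},
                           fetch=["concat_1.tmp_0"],
                           batch=True)
print(list(result.keys()))

Inside the pipeline these arguments are filled in from each op's local_service_conf, so the YAML switches above and this direct call exercise the same code path.
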
38 changes: 31 additions & 7 deletions python/pipeline/local_service_handler.py
@@ -45,7 +45,11 @@ def __init__(self,
ir_optim=False,
available_port_generator=None,
use_profile=False,
precision="fp32"):
precision="fp32",
use_mkldnn=False,
mkldnn_cache_capacity=0,
mkldnn_op_list=None,
mkldnn_bf16_op_list=None):
"""
Initialization of localservicehandler
@@ -64,6 +68,10 @@ def __init__(self,
available_port_generator: generate available ports
use_profile: use profiling, False default.
precision: inference precision, e.g. "fp32", "fp16", "int8"
use_mkldnn: use mkldnn, default False.
mkldnn_cache_capacity: cache capacity of mkldnn, 0 means no limit.
mkldnn_op_list: OP list optimized by mkldnn, None default.
mkldnn_bf16_op_list: OP list optimized by mkldnn bf16, None default.
Returns:
None
@@ -78,6 +86,10 @@ def __init__(self,
self._use_trt = False
self._use_lite = False
self._use_xpu = False
self._use_mkldnn = False
self._mkldnn_cache_capacity = 0
self._mkldnn_op_list = None
self._mkldnn_bf16_op_list = None

if device_type == -1:
# device_type is not set, determined by `devices`,
@@ -140,16 +152,24 @@ def __init__(self,
self._use_profile = use_profile
self._fetch_names = fetch_names
self._precision = precision
self._use_mkldnn = use_mkldnn
self._mkldnn_cache_capacity = mkldnn_cache_capacity
self._mkldnn_op_list = mkldnn_op_list
self._mkldnn_bf16_op_list = mkldnn_bf16_op_list

_LOGGER.info(
"Models({}) will be launched by device {}. use_gpu:{}, "
"use_trt:{}, use_lite:{}, use_xpu:{}, device_type:{}, devices:{}, "
"mem_optim:{}, ir_optim:{}, use_profile:{}, thread_num:{}, "
"client_type:{}, fetch_names:{} precision:{}".format(
"client_type:{}, fetch_names:{}, precision:{}, use_mkldnn:{}, "
"mkldnn_cache_capacity:{}, mkldnn_op_list:{}, "
"mkldnn_bf16_op_list:{}".format(
model_config, self._device_name, self._use_gpu, self._use_trt,
self._use_lite, self._use_xpu, device_type, self._devices, self.
_mem_optim, self._ir_optim, self._use_profile, self._thread_num,
self._client_type, self._fetch_names, self._precision))
self._use_lite, self._use_xpu, device_type, self._devices,
self._mem_optim, self._ir_optim, self._use_profile,
self._thread_num, self._client_type, self._fetch_names,
self._precision, self._use_mkldnn, self._mkldnn_cache_capacity,
self._mkldnn_op_list, self._mkldnn_bf16_op_list))

def get_fetch_list(self):
return self._fetch_names
@@ -189,7 +209,7 @@ def get_client(self, concurrency_idx):
from paddle_serving_app.local_predict import LocalPredictor
if self._local_predictor_client is None:
self._local_predictor_client = LocalPredictor()

# load model config and init predictor
self._local_predictor_client.load_model_config(
model_path=self._model_config,
use_gpu=self._use_gpu,
Expand All @@ -201,7 +221,11 @@ def get_client(self, concurrency_idx):
use_trt=self._use_trt,
use_lite=self._use_lite,
use_xpu=self._use_xpu,
precision=self._precision)
precision=self._precision,
use_mkldnn=self._use_mkldnn,
mkldnn_cache_capacity=self._mkldnn_cache_capacity,
mkldnn_op_list=self._mkldnn_op_list,
mkldnn_bf16_op_list=self._mkldnn_bf16_op_list)
return self._local_predictor_client

def get_client_config(self):
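
And an illustrative construction of the extended handler (assumptions: on an installed wheel the module resolves to paddle_serving_server.pipeline.local_service_handler -- in this source tree it is python/pipeline/local_service_handler.py -- and the model directory and MKLDNN op names are placeholders):

from paddle_serving_server.pipeline.local_service_handler import LocalServiceHandler

# Build a CPU handler with the new MKLDNN options; get_client() loads the model
# through LocalPredictor.load_model_config() exactly as shown in the diff above.
handler = LocalServiceHandler(
    model_config="./ocr_det_model",        # hypothetical model directory
    client_type="local_predictor",
    devices="",                            # "" -> CPU prediction
    thread_num=2,
    ir_optim=True,
    use_mkldnn=True,
    mkldnn_cache_capacity=100,
    mkldnn_op_list=["conv2d", "pool2d"],   # illustrative op names to accelerate
    mkldnn_bf16_op_list=None)
local_predictor = handler.get_client(concurrency_idx=0)
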
