PaddlePaddle · jiweibo · Apr 14, 2023 · Apr 14, 2023 · Apr 14, 2023 · Apr 14, 2023
diff --git a/docs-official/api_reference/cxx_api_doc/Config/GPUConfig.md b/docs-official/api_reference/cxx_api_doc/Config/GPUConfig.md
@@ -21,9 +21,9 @@ void EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id = 0, Precisio
 // 返回：None
 void DisableGpu();
 
-// 判断是否启用 GPU 
+// 判断是否启用 GPU
 // 参数：None
-// 返回：bool - 是否启用 GPU 
+// 返回：bool - 是否启用 GPU
 bool use_gpu() const;
 
 // 获取 GPU 的device id
@@ -142,7 +142,7 @@ int main(int argc, char **argv) {
 
 ## TensorRT 设置
 
-**注意：** 
+**注意：**
 1. 启用 TensorRT 的前提为已经启用 GPU，否则启用 TensorRT 无法生效
 2. 对存在LoD信息的模型，如BERT, ERNIE等NLP模型，必须使用动态 Shape
 3. 启用 TensorRT OSS 可以支持更多 plugin，详细参考 [TensorRT OSS](https://news.developer.nvidia.com/nvidia-open-sources-parsers-and-plugins-in-tensorrt/)。当前开始OSS只对ERNIE/BERT模型加速效果（[示例代码](https://github.com/PaddlePaddle/Paddle-Inference-Demo/tree/master/c%2B%2B/ernie-varlen)）。
@@ -157,7 +157,7 @@ API定义如下：
 //                           行时显存占用。该值设置过小可能会导致选不到最佳kernel，设置过大时会增加初始
 //                           化阶段的显存使用，请根据实际情况调整，建议值256MB
 //      max_batch_size     - 设置最大的 batch 大小，运行时 batch 大小不得超过此限定值
-//      min_subgraph_size  - Paddle 内 TensorRT 是以子图的形式运行，为了避免性能损失，当 TensorRT 
+//      min_subgraph_size  - Paddle 内 TensorRT 是以子图的形式运行，为了避免性能损失，当 TensorRT
 //                           子图内部节点个数大于 min_subgraph_size 的时候，才会使用 TensorRT 运行
 //      precision          - 指定使用 TensorRT 的精度，支持 FP32(kFloat32)，FP16(kHalf)，
 //                           Int8(kInt8)
@@ -172,7 +172,7 @@ void EnableTensorRtEngine(int workspace_size = 1 << 20,
                           bool use_static = false,
                           bool use_calib_mode = true);
 
-// 判断是否启用 TensorRT 
+// 判断是否启用 TensorRT
 // 参数：None
 // 返回：bool - 是否启用 TensorRT
 bool tensorrt_engine_enabled() const;
@@ -202,13 +202,13 @@ void SetTRTDynamicShapeInfo(
       bool disable_trt_plugin_fp16 = false);
 
 //
-// TensorRT 动态 shape 的自动推导，使用示例参考 https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/d6c1aac35fa8a02271c9433b0565ff0054a5a82b/c++/paddle-trt/tuned_dynamic_shape 
-// 参数： shape_range_info_path  - 统计生成的 shape 信息存储文件路径
-//       allow_build_at_runtime - 是否开启运行时重建 TensorRT 引擎功能，当设置为 true 时，输入 shape 
+// TensorRT 动态 shape 的自动推导，使用示例参考 https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/d6c1aac35fa8a02271c9433b0565ff0054a5a82b/c++/paddle-trt/tuned_dynamic_shape
+// 参数： shape_range_info_path  - 统计生成的 shape 信息存储文件路径，当设置为空（默认值）时，shape 信息在运行时自动收集
+//       allow_build_at_runtime - 是否开启运行时重建 TensorRT 引擎功能，当设置为 true 时，输入 shape
 //                                超过 tune 范围时会触发 TensorRT 重建。当设置为 false 时，输入 shape
 //                                超过 tune 范围时会引起推理出错
 // 返回：None
-void EnableTunedTensorRtDynamicShape(const std::string& shape_range_info_path,
+void EnableTunedTensorRtDynamicShape(const std::string& shape_range_info_path = "",
                                      bool allow_build_at_runtime = true);
 
 
@@ -243,7 +243,7 @@ paddle_infer::Config config("./model/mobilenet.pdmodel", "./model/mobilenet.pdip
 config.EnableUseGpu(100, 0);
 
 // 启用 TensorRT 进行预测加速 - FP32
-config.EnableTensorRtEngine(1 << 28, 1, 3, 
+config.EnableTensorRtEngine(1 << 28, 1, 3,
                             paddle_infer::PrecisionType::kFloat32, false, false);
 // 通过 API 获取 TensorRT 启用结果 - true
 std::cout << "Enable TensorRT is: " << config.tensorrt_engine_enabled() << std::endl;
@@ -252,13 +252,13 @@ std::cout << "Enable TensorRT is: " << config.tensorrt_engine_enabled() << std::
 config.EnableTensorRTMemoryOptim();
 
 // 启用 TensorRT 进行预测加速 - FP16
-config.EnableTensorRtEngine(1 << 28, 1, 3, 
+config.EnableTensorRtEngine(1 << 28, 1, 3,
                             paddle_infer::PrecisionType::kHalf, false, false);
 // 通过 API 获取 TensorRT 启用结果 - true
 std::cout << "Enable TensorRT is: " << config.tensorrt_engine_enabled() << std::endl;
 
 // 启用 TensorRT 进行预测加速 - Int8
-config.EnableTensorRtEngine(1 << 28, 1, 3, 
+config.EnableTensorRtEngine(1 << 28, 1, 3,
                             paddle_infer::PrecisionType::kInt8, false, true);
 // 通过 API 获取 TensorRT 启用结果 - true
 std::cout << "Enable TensorRT is: " << config.tensorrt_engine_enabled() << std::endl;

diff --git a/docs-official/api_reference/python_api_doc/Config/GPUConfig.md b/docs-official/api_reference/python_api_doc/Config/GPUConfig.md
@@ -21,9 +21,9 @@ paddle.inference.Config.enable_use_gpu(memory_pool_init_size_mb: int, device_id:
 # 返回：None
 paddle.inference.Config.disable_gpu()
 
-# 判断是否启用 GPU 
+# 判断是否启用 GPU
 # 参数：None
-# 返回：bool - 是否启用 GPU 
+# 返回：bool - 是否启用 GPU
 paddle.inference.Config.use_gpu()
 
 # 获取 GPU 的device id
@@ -67,7 +67,7 @@ print("Use GPU is: {}".format(config.use_gpu())) # False
 
 ## TensorRT 设置
 
-**注意：** 
+**注意：**
 1. 启用 TensorRT 的前提为已经启用 GPU，否则启用 TensorRT 无法生效
 2. 对存在 LoD 信息的模型，如 BERT、ERNIE 等 NLP 模型，必须使用动态 Shape
 3. 启用 TensorRT OSS 可以支持更多 plugin，详细参考 [TensorRT OSS](https://news.developer.nvidia.com/nvidia-open-sources-parsers-and-plugins-in-tensorrt/)
@@ -82,7 +82,7 @@ API定义如下：
 #                           行时显存占用。该值设置过小可能会导致选不到最佳kernel，设置过大时会增加初始
 #                           化阶段的显存使用，请根据实际情况调整，建议值256MB
 #      max_batch_size     - 设置最大的 batch 大小，运行时 batch 大小不得超过此限定值
-#      min_subgraph_size  - Paddle 内 TensorRT 是以子图的形式运行，为了避免性能损失，当 TensorRT 
+#      min_subgraph_size  - Paddle 内 TensorRT 是以子图的形式运行，为了避免性能损失，当 TensorRT
 #                           子图内部节点个数大于 min_subgraph_size 的时候，才会使用 TensorRT 运行
 #      precision          - 指定使用 TensorRT 的精度，支持 FP32(kFloat32)，FP16(kHalf)，Int8(kInt8)
 #      use_static         - 若指定为 true，在初次运行程序退出Predictor析构的时候会将 TensorRT 的优
@@ -97,7 +97,7 @@ paddle.inference.Config.enable_tensorrt_engine(workspace_size: int = 1 << 20,
                                                use_static: bool,
                                                use_calib_mode: bool)
 
-# 判断是否启用 TensorRT 
+# 判断是否启用 TensorRT
 # 参数：None
 # 返回：bool - 是否启用 TensorRT
 paddle.inference.Config.tensorrt_engine_enabled()
@@ -120,20 +120,20 @@ paddle.inference.Config.enable_tensorrt_memory_optim(engine_memory_sharing : boo
 #                                 kernel 阶段以此项配置的 shape 下的性能表现作为选择依据
 #      disable_trt_plugin_fp16  - 设置 TensorRT 的 plugin 不在 fp16 精度下运行
 # 返回：None
-paddle.inference.Config.set_trt_dynamic_shape_info(min_input_shape: Dict[str, List[int]] = {}, 
-                                                   max_input_shape: Dict[str, List[int]] = {}, 
-                                                   optim_input_shape: Dict[str, List[int]] = {}, 
+paddle.inference.Config.set_trt_dynamic_shape_info(min_input_shape: Dict[str, List[int]] = {},
+                                                   max_input_shape: Dict[str, List[int]] = {},
+                                                   optim_input_shape: Dict[str, List[int]] = {},
                                                    disable_trt_plugin_fp16: bool = False)
 
 #
 # TensorRT 动态 shape 的自动推导
-# 参数： shape_range_info_path  - 统计生成的 shape 信息存储文件路径
-#       allow_build_at_runtime - 是否开启运行时重建 TensorRT 引擎功能，当设置为 true 时，输入 shape 
+# 参数： shape_range_info_path  - 统计生成的 shape 信息存储文件路径，当设置为空（默认值）时，shape 信息在运行时自动收集
+#       allow_build_at_runtime - 是否开启运行时重建 TensorRT 引擎功能，当设置为 true 时，输入 shape
 #                                超过 tune 范围时会触发 TensorRT 重建。当设置为 false 时，输入 shape
 #                                超过 tune 范围时会引起推理出错
 # 返回：None
 paddle.inference.Config.enable_tuned_tensorrt_dynamic_shape(
-                                     shape_range_info_path: str,
+                                     shape_range_info_path: str = "",
                                      allow_build_at_runtime: bool = True)
 
 # 启用 TensorRT OSS 进行 ERNIE / BERT 预测加速（原理介绍 https://github.com/PaddlePaddle/Paddle-Inference-Demo/tree/master/c%2B%2B/ernie-varlen ）
@@ -170,29 +170,29 @@ config = paddle_infer.Config("./mobilenet.pdmodel", "./mobilenet.pdiparams")
 config.enable_use_gpu(100, 0)
 
 # 启用 TensorRT 进行预测加速 - FP32
-config.enable_tensorrt_engine(workspace_size = 1 << 28, 
-                              max_batch_size = 1, 
-                              min_subgraph_size = 3, 
-                              precision_mode = paddle_infer.PrecisionType.Float32, 
+config.enable_tensorrt_engine(workspace_size = 1 << 28,
+                              max_batch_size = 1,
+                              min_subgraph_size = 3,
+                              precision_mode = paddle_infer.PrecisionType.Float32,
                               use_static = False, use_calib_mode = False)
 # 通过 API 获取 TensorRT 启用结果 - true
 print("Enable TensorRT is: {}".format(config.tensorrt_engine_enabled()))
 
 
 # 启用 TensorRT 进行预测加速 - FP16
-config.enable_tensorrt_engine(workspace_size = 1 << 28, 
-                              max_batch_size = 1, 
-                              min_subgraph_size = 3, 
-                              precision_mode = paddle_infer.PrecisionType.Half, 
+config.enable_tensorrt_engine(workspace_size = 1 << 28,
+                              max_batch_size = 1,
+                              min_subgraph_size = 3,
+                              precision_mode = paddle_infer.PrecisionType.Half,
                               use_static = False, use_calib_mode = False)
 # 通过 API 获取 TensorRT 启用结果 - true
 print("Enable TensorRT is: {}".format(config.tensorrt_engine_enabled()))
 
 # 启用 TensorRT 进行预测加速 - Int8
-config.enable_tensorrt_engine(workspace_size = 1 << 28, 
-                              max_batch_size = 1, 
-                              min_subgraph_size = 3, 
-                              precision_mode = paddle_infer.PrecisionType.Int8, 
+config.enable_tensorrt_engine(workspace_size = 1 << 28,
+                              max_batch_size = 1,
+                              min_subgraph_size = 3,
+                              precision_mode = paddle_infer.PrecisionType.Int8,
                               use_static = False, use_calib_mode = False)
 
 # 开启 TensorRT 显存优化
@@ -215,10 +215,10 @@ config = paddle_infer.Config("./mobilenet.pdmodel", "./mobilenet.pdiparams")
 config.enable_use_gpu(100, 0)
 
 # 启用 TensorRT 进行预测加速 - Int8
-config.enable_tensorrt_engine(workspace_size = 1 << 29, 
-                              max_batch_size = 1, 
-                              min_subgraph_size = 1, 
-                              precision_mode=paddle_infer.PrecisionType.Int8, 
+config.enable_tensorrt_engine(workspace_size = 1 << 29,
+                              max_batch_size = 1,
+                              min_subgraph_size = 1,
+                              precision_mode=paddle_infer.PrecisionType.Int8,
                               use_static = False, use_calib_mode = True)
 
 # 开启 TensorRT 显存优化

diff --git a/python/gpu/tuned_dynamic_shape/README.md b/python/gpu/tuned_dynamic_shape/README.md
@@ -12,19 +12,31 @@
 
 ### 三：运行 TunedDynamicShape 样例
 
-**1、首先您需要针对业务数据进行离线 tune，来获取计算图中所有中间 tensor 的 shape 范围，并将其存储在 config 中配置的 shape_range_info.pbtxt 文件中**
+#### 方式一：Auto_Tune
+
+Auto_Tune 方式会在运行时构建 trt engine 并收集 shape 信息，使用时不需要预先收集 shape 信息；收集好的 shape 信息保存在 _opt_cache 目录下的 shape_range_info.pbtxt 文件中。
+
+```
+python infer_tune.py --model_file ./resnet50/inference.pdmodel --params_file ./resnet50/inference.pdiparams --use_trt 1 --tuned_dynamic_shape 1 --auto_tune 1
+```
+
+#### 方式二：离线 Tune
+
+离线 Tune 的方式需要以下两个步骤完成：
+
+1、首先您需要针对业务数据进行离线 tune，来获取计算图中所有中间 tensor 的 shape 范围，并将其存储在 config 中配置的 shape_range_info.pbtxt 文件中。
 
 ```
 python infer_tune.py --model_file ./resnet50/inference.pdmodel --params_file ./resnet50/inference.pdiparams --tune 1
 ```
 
-**2、有了离线 tune 得到的 shape 范围信息后，您可以使用该文件自动对所有的 trt 子图设置其输入的 shape 范围。**
+2、有了离线 tune 得到的 shape 范围信息后，您可以使用该文件自动对所有的 trt 子图设置其输入的 shape 范围。
 
 ```
 python infer_tune.py --model_file ./resnet50/inference.pdmodel --params_file ./resnet50/inference.pdiparams --use_gpu 1 --use_trt 1 --tuned_dynamic_shape 1
 ```
 
 ## 更多链接
-- [Paddle Inference使用Quick Start！](https://paddle-inference.readthedocs.io/en/latest/introduction/quick_start.html)
-- [Paddle Inference C++ Api使用](https://paddle-inference.readthedocs.io/en/latest/user_guides/cxx_api.html)
-- [Paddle Inference Python Api使用](https://paddle-inference.readthedocs.io/en/latest/user_guides/inference_python_api.html)
+- [Paddle Inference使用Quick Start！](https://www.paddlepaddle.org.cn/inference/master/guides/quick_start/index_quick_start.html)
+- [Paddle Inference C++ Api使用](https://www.paddlepaddle.org.cn/inference/master/api_reference/cxx_api_doc/cxx_api_index.html)
+- [Paddle Inference Python Api使用](https://www.paddlepaddle.org.cn/inference/master/api_reference/python_api_doc/python_api_index.html)
diff --git a/python/gpu/tuned_dynamic_shape/infer_tune.py b/python/gpu/tuned_dynamic_shape/infer_tune.py
@@ -15,26 +15,23 @@ def init_predictor(args):
         config = Config(args.model_file, args.params_file)
 
     config.enable_memory_optim()
+    config.enable_use_gpu(1000, 0)
     if args.tune:
         config.collect_shape_range_info(shape_file)
-    if args.use_gpu:
-        config.enable_use_gpu(1000, 0)
-        if args.use_trt:
-            # using dynamic shpae mode, the max_batch_size will be ignored.
-            config.enable_tensorrt_engine(
-                workspace_size=1 << 30,
-                max_batch_size=1,
-                min_subgraph_size=5,
-                precision_mode=PrecisionType.Float32,
-                use_static=False,
-                use_calib_mode=False)
-            if args.tuned_dynamic_shape:
+    if args.use_trt:
+        # using dynamic shape mode, the max_batch_size will be ignored.
+        config.enable_tensorrt_engine(
+            workspace_size=1 << 30,
+            max_batch_size=1,
+            min_subgraph_size=5,
+            precision_mode=PrecisionType.Float32,
+            use_static=False,
+            use_calib_mode=False)
+        if args.tuned_dynamic_shape:
+            if args.auto_tune:
+                config.enable_tuned_tensorrt_dynamic_shape()
+            else:
                 config.enable_tuned_tensorrt_dynamic_shape(shape_file, True)
-    else:
-        # If not specific mkldnn, you can set the blas thread.
-        # The thread num should not be greater than the number of cores in the CPU.
-        config.set_cpu_math_library_num_threads(4)
-        config.enable_mkldnn()
 
     predictor = create_predictor(config)
     return predictor
@@ -91,6 +88,11 @@ def parse_args():
         type=int,
         default=0,
         help="Whether use tune to get shape range.")
+    parser.add_argument(
+        "--auto_tune",
+        type=int,
+        default=0,
+        help="Whether use auto tune to get shape range.")
     parser.add_argument(
         "--tuned_dynamic_shape",
         type=int,