From c3ea8ecac89b5f7ddce333fe9bdca05147f5da18 Mon Sep 17 00:00:00 2001 From: cryoco Date: Mon, 14 Dec 2020 14:45:57 +0800 Subject: [PATCH] add gpu inference demo for linux windows and jetson --- c++/cuda_linux_demo/CMakeLists.txt | 192 +++++++++++++++++++ c++/cuda_linux_demo/README.md | 105 +++++++++++ c++/cuda_linux_demo/model_test.cc | 60 ++++++ c++/cuda_linux_demo/run_impl.sh | 26 +++ docs/demo_tutorial/cuda_jetson_demo.md | 213 +++++++++++++++++++++ docs/demo_tutorial/cuda_linux_demo.md | 213 +++++++++++++++++++++ docs/demo_tutorial/cuda_windows_demo.md | 229 +++++++++++++++++++++++ python/cuda_linux_demo/README.md | 99 ++++++++++ python/cuda_linux_demo/img_preprocess.py | 41 ++++ python/cuda_linux_demo/model_test.py | 74 ++++++++ 10 files changed, 1252 insertions(+) create mode 100644 c++/cuda_linux_demo/CMakeLists.txt create mode 100644 c++/cuda_linux_demo/README.md create mode 100644 c++/cuda_linux_demo/model_test.cc create mode 100755 c++/cuda_linux_demo/run_impl.sh create mode 100644 docs/demo_tutorial/cuda_jetson_demo.md create mode 100644 docs/demo_tutorial/cuda_linux_demo.md create mode 100644 docs/demo_tutorial/cuda_windows_demo.md create mode 100644 python/cuda_linux_demo/README.md create mode 100644 python/cuda_linux_demo/img_preprocess.py create mode 100644 python/cuda_linux_demo/model_test.py diff --git a/c++/cuda_linux_demo/CMakeLists.txt b/c++/cuda_linux_demo/CMakeLists.txt new file mode 100644 index 0000000000000..41100b892a094 --- /dev/null +++ b/c++/cuda_linux_demo/CMakeLists.txt @@ -0,0 +1,192 @@ +cmake_minimum_required(VERSION 3.0) +project(cpp_inference_demo CXX C) +option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL." ON) +option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." ON) +option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." OFF) +option(USE_TENSORRT "Compile demo with TensorRT." ON) + +if(NOT WITH_STATIC_LIB) + add_definitions("-DPADDLE_WITH_SHARED_LIB") +else() + # PD_INFER_DECL is mainly used to set the dllimport/dllexport attribute in dynamic library mode. + # Set it to empty in static library mode to avoid compilation issues. 
+ add_definitions("/DPD_INFER_DECL=") +endif() + +macro(safe_set_static_flag) + foreach(flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + endif(${flag_var} MATCHES "/MD") + endforeach(flag_var) +endmacro() + +if(NOT DEFINED PADDLE_LIB) + message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib") +endif() +if(NOT DEFINED DEMO_NAME) + message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name") +endif() + +include_directories("${PADDLE_LIB}/") +set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/install/") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/include") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/include") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include") + +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") +link_directories("${PADDLE_LIB}/paddle/lib") + +if (WIN32) + add_definitions("/DGOOGLE_GLOG_DLL_DECL=") + option(MSVC_STATIC_CRT "use static C Runtime library by default" ON) + if (MSVC_STATIC_CRT) + if (WITH_MKL) + set(FLAG_OPENMP "/openmp") + endif() + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}") + safe_set_static_flag() + if (WITH_STATIC_LIB) + add_definitions(-DSTATIC_LIB) + endif() + endif() +else() + if(WITH_MKL) + set(FLAG_OPENMP "-fopenmp") + endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 ${FLAG_OPENMP}") +endif() + +if(WITH_GPU) + if(NOT WIN32) + set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library") + else() + if(CUDA_LIB STREQUAL "") + set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64") + endif() + endif(NOT WIN32) +endif() + +if (USE_TENSORRT AND WITH_GPU) + set(TENSORRT_ROOT "" CACHE STRING "The root directory of TensorRT library") + if("${TENSORRT_ROOT}" STREQUAL "") + message(FATAL_ERROR "The TENSORRT_ROOT is empty, you must assign it a value with CMake command. 
Such as: -DTENSORRT_ROOT=TENSORRT_ROOT_PATH ") + endif() + set(TENSORRT_INCLUDE_DIR ${TENSORRT_ROOT}/include) + set(TENSORRT_LIB_DIR ${TENSORRT_ROOT}/lib) +endif() + +if (NOT WIN32) + if (USE_TENSORRT AND WITH_GPU) + include_directories("${TENSORRT_INCLUDE_DIR}") + link_directories("${TENSORRT_LIB_DIR}") + endif() +endif(NOT WIN32) + +if(WITH_MKL) + set(MATH_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mklml") + include_directories("${MATH_LIB_PATH}/include") + if(WIN32) + set(MATH_LIB ${MATH_LIB_PATH}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX} + ${MATH_LIB_PATH}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX}) + else() + set(MATH_LIB ${MATH_LIB_PATH}/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} + ${MATH_LIB_PATH}/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif() + set(MKLDNN_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn") + if(EXISTS ${MKLDNN_PATH}) + include_directories("${MKLDNN_PATH}/include") + if(WIN32) + set(MKLDNN_LIB ${MKLDNN_PATH}/lib/mkldnn.lib) + else(WIN32) + set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) + endif(WIN32) + endif() +else() + set(OPENBLAS_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}openblas") + include_directories("${OPENBLAS_LIB_PATH}/include/openblas") + if(WIN32) + set(MATH_LIB ${OPENBLAS_LIB_PATH}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX}) + else() + set(MATH_LIB ${OPENBLAS_LIB_PATH}/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX}) + endif() +endif() + +if(WITH_STATIC_LIB) + set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX}) +else() + if(WIN32) + set(DEPS ${PADDLE_LIB}/paddle/lib/paddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX}) + else() + set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif() +endif() + +if (NOT WIN32) + set(EXTERNAL_LIB "-lrt -ldl -lpthread") + set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} + glog gflags protobuf xxhash + ${EXTERNAL_LIB}) +else() + set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} + glog gflags_static libprotobuf xxhash ${EXTERNAL_LIB}) + set(DEPS ${DEPS} shlwapi.lib) +endif(NOT WIN32) + +if(WITH_GPU) + if(NOT WIN32) + if (USE_TENSORRT) + set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX}) + endif() + set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX}) + else() + if(USE_TENSORRT) + set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX}) + endif() + set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX} ) + endif() +endif() + +add_executable(${DEMO_NAME} ${DEMO_NAME}.cc) +target_link_libraries(${DEMO_NAME} ${DEPS}) +if(WIN32) + if(USE_TENSORRT) + add_custom_command(TARGET ${DEMO_NAME} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_SHARED_LIBRARY_SUFFIX} + ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} + COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX} + ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} + ) + endif() + if(WITH_MKL) + add_custom_command(TARGET ${DEMO_NAME} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/mklml.dll ${CMAKE_BINARY_DIR}/Release + COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/libiomp5md.dll ${CMAKE_BINARY_DIR}/Release + COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_PATH}/lib/mkldnn.dll 
${CMAKE_BINARY_DIR}/Release
+    )
+  else()
+    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll ${CMAKE_BINARY_DIR}/Release
+    )
+  endif()
+  if(NOT WITH_STATIC_LIB)
+    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy "${PADDLE_LIB}/paddle/lib/paddle_fluid.dll" ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
+    )
+  endif()
+endif()
diff --git a/c++/cuda_linux_demo/README.md b/c++/cuda_linux_demo/README.md
new file mode 100644
index 0000000000000..5b01f5c0a06e8
--- /dev/null
+++ b/c++/cuda_linux_demo/README.md
@@ -0,0 +1,105 @@
+# C++ Inference Deployment Example on GPU
+
+## 1 Workflow
+
+1.1 Prepare the inference library
+
+Please refer to the [inference library download page](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/05_inference_deployment/inference/build_and_install_lib_cn.html) to download the Paddle inference library.
+
+1.2 Prepare the inference model
+
+After training with Paddle you obtain an inference model that can be used for deployment.
+
+This example uses a mobilenet_v1 inference model, which can be downloaded from this [link](https://paddle-inference-dist.cdn.bcebos.com/PaddleInference/mobilenetv1_fp32.tar.gz) or fetched with wget.
+
+```
+wget https://paddle-inference-dist.cdn.bcebos.com/PaddleInference/mobilenetv1_fp32.tar.gz
+```
+
+1.3 Include the header
+
+To use the Paddle inference library, only the `paddle_inference_api.h` header needs to be included.
+
+```
+#include "paddle/include/paddle_inference_api.h"
+```
+
+1.4 Set up the Config
+
+Set up the Config according to your deployment scenario; it is used to create the Predictor in the next step.
+
+By default the Config runs inference on CPU. To run on GPU you must enable it explicitly and choose the GPU card id and the initial amount of GPU memory to allocate. You can also enable TensorRT acceleration, IR optimization and memory optimization. For more details and examples of Paddle-TensorRT, see the [documentation](https://paddle-inference.readthedocs.io/en/master/optimize/paddle_trt.html).
+
+```
+paddle_infer::Config config;
+if (FLAGS_model_dir == "") {
+  config.SetModel(FLAGS_model_file, FLAGS_params_file); // Load combined model
+} else {
+  config.SetModel(FLAGS_model_dir); // Load no-combined model
+}
+config.EnableUseGpu(500, 0);
+config.SwitchIrOptim(true);
+config.EnableMemoryOptim();
+config.EnableTensorRtEngine(1 << 30, FLAGS_batch_size, 10,
+                            paddle_infer::PrecisionType::kFloat32, false, false);
+```
+
+1.5 Create the Predictor
+
+```
+std::shared_ptr<paddle_infer::Predictor> predictor = paddle_infer::CreatePredictor(config);
+```
+
+1.6 Set the input
+
+Get the input names and handles from the Predictor, then set the input data.
+
+```
+auto input_names = predictor->GetInputNames();
+auto input_t = predictor->GetInputHandle(input_names[0]);
+std::vector<int> input_shape = {1, 3, 224, 224};
+std::vector<float> input_data(1 * 3 * 224 * 224, 1);
+input_t->Reshape(input_shape);
+input_t->CopyFromCpu(input_data.data());
+```
+
+1.7 Run the Predictor
+
+```
+predictor->Run();
+```
+
+1.8 Get the output
+
+```
+auto output_names = predictor->GetOutputNames();
+auto output_t = predictor->GetOutputHandle(output_names[0]);
+std::vector<int> output_shape = output_t->shape();
+int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
+                              std::multiplies<int>());
+std::vector<float> out_data;
+out_data.resize(out_num);
+output_t->CopyToCpu(out_data.data());
+```
+
+## 2 Build and run the example
+
+2.1 Build the example
+
+`model_test.cc` is the sample inference program (its input is a fixed value; if you want to read real data with OpenCV or another method, modify the program accordingly).
+`CMakeLists.txt` is the build script.
+`run_impl.sh` configures the paths of the third-party and prebuilt libraries.
+
+Open `run_impl.sh` and set LIB_DIR to the inference library you prepared, e.g. `LIB_DIR=/work/Paddle/build/paddle_inference_install_dir`.
+
+Run `sh run_impl.sh`; a build directory will be generated in the current directory.
+
+2.2 Run the example
+
+Enter the build directory and run the sample:
+
+```shell
+cd build
+./model_test --model_dir=mobilenetv1_fp32_dir
+```
+
+When the run finishes, the program prints the model output to the screen, which means it ran successfully.
diff --git a/c++/cuda_linux_demo/model_test.cc b/c++/cuda_linux_demo/model_test.cc
new file mode 100644
index 0000000000000..8cd479b130fbe
--- /dev/null
+++ b/c++/cuda_linux_demo/model_test.cc
@@ -0,0 +1,60 @@
+#include <algorithm>
+#include <chrono>
+#include <functional>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+#include "paddle/include/paddle_inference_api.h"
+
+DEFINE_string(model_dir, "", "Directory of the inference model.");
+DEFINE_string(model_file, "", "Path of the inference model file.");
+DEFINE_string(params_file, "", "Path of the inference params file.");
+DEFINE_int32(batch_size, 1, "Batch size.");
+
+int main(int argc, char *argv[]) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+
+  // Init config
+  paddle_infer::Config config;
+  if (FLAGS_model_dir == "") {
+    config.SetModel(FLAGS_model_file, FLAGS_params_file);  // Load combined model
+  } else {
+    config.SetModel(FLAGS_model_dir);  // Load no-combined model
+  }
+  config.EnableUseGpu(500, 0);  // 500 MB initial GPU memory pool, device id 0
+  config.SwitchIrOptim(true);
+  config.EnableMemoryOptim();
+  config.EnableTensorRtEngine(1 << 30, FLAGS_batch_size, 10,
+                              paddle_infer::PrecisionType::kFloat32, false, false);
+
+  // Create predictor
+  auto predictor = paddle_infer::CreatePredictor(config);
+
+  // Set input
+  auto input_names = predictor->GetInputNames();
+  auto input_t = predictor->GetInputHandle(input_names[0]);
+  std::vector<int> input_shape = {1, 3, 224, 224};
+  std::vector<float> input_data(1 * 3 * 224 * 224, 1);
+  input_t->Reshape(input_shape);
+  input_t->CopyFromCpu(input_data.data());
+
+  // Run
+  predictor->Run();
+
+  // Get output
+  auto output_names = predictor->GetOutputNames();
+  auto output_t = predictor->GetOutputHandle(output_names[0]);
+  std::vector<int> output_shape = output_t->shape();
+  int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
+                                std::multiplies<int>());
+  std::vector<float> out_data;
+  out_data.resize(out_num);
+  output_t->CopyToCpu(out_data.data());
+
+  auto max_iter = std::max_element(out_data.begin(), out_data.end());
+  LOG(INFO) << "Output max_arg_index:" << max_iter - out_data.begin()
+            << ", max_value:" << *max_iter;
+  return 0;
+}
diff --git a/c++/cuda_linux_demo/run_impl.sh b/c++/cuda_linux_demo/run_impl.sh
new file mode 100755
index 0000000000000..96112a72e8039
--- /dev/null
+++ b/c++/cuda_linux_demo/run_impl.sh
@@ -0,0 +1,26 @@
+mkdir -p build
+cd build
+rm -rf *
+
+DEMO_NAME=model_test
+
+WITH_MKL=ON
+WITH_GPU=ON
+USE_TENSORRT=ON
+
+LIB_DIR=/work/Paddle/build/paddle_inference_install_dir
+CUDNN_LIB=/path/to/cudnn/lib
+CUDA_LIB=/path/to/cuda/lib
+TENSORRT_ROOT=/path/to/trt/root/dir
+
+cmake .. \
-DPADDLE_LIB=${LIB_DIR} \ + -DWITH_MKL=${WITH_MKL} \ + -DDEMO_NAME=${DEMO_NAME} \ + -DWITH_GPU=${WITH_GPU} \ + -DWITH_STATIC_LIB=OFF \ + -DUSE_TENSORRT=${USE_TENSORRT} \ + -DCUDNN_LIB=${CUDNN_LIB} \ + -DCUDA_LIB=${CUDA_LIB} \ + -DTENSORRT_ROOT=${TENSORRT_ROOT} + +make -j diff --git a/docs/demo_tutorial/cuda_jetson_demo.md b/docs/demo_tutorial/cuda_jetson_demo.md new file mode 100644 index 0000000000000..1dbed701b62be --- /dev/null +++ b/docs/demo_tutorial/cuda_jetson_demo.md @@ -0,0 +1,213 @@ +# NV Jetson上预测部署示例 + +## 1 C++预测部署示例 + +C++示例代码在[链接](https://github.com/PaddlePaddle/Paddle-Inference-Demo/tree/master/c%2B%2B/cuda_linux_demo),下面从`流程解析`和`编译运行示例`两方面介绍。 + +### 1.1 流程解析 + +#### 1.1.1 准备预测库 + +请参考[推理库下载文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/05_inference_deployment/inference/build_and_install_lib_cn.html)下载Paddle C++预测库,名称前缀包含 `nv_jetson` 的为用于NV Jetson平台的预测库。 + +#### 1.1.2 准备预测模型 + +使用Paddle训练结束后,得到预测模型,可以用于预测部署。 + +本示例准备了mobilenet_v1预测模型,可以从[链接](https://paddle-inference-dist.cdn.bcebos.com/PaddleInference/mobilenetv1_fp32.tar.gz)下载,或者wget下载。 + +```shell +wget https://paddle-inference-dist.cdn.bcebos.com/PaddleInference/mobilenetv1_fp32.tar.gz +``` + +#### 1.1.3 包含头文件 + +使用Paddle预测库,只需要包含 `paddle_inference_api.h` 头文件。 + +```cpp +#include "paddle/include/paddle_inference_api.h" +``` + +#### 1.1.4 设置Config + +根据预测部署的实际情况,设置Config,用于后续创建Predictor。 + +Config默认是使用CPU预测,若要使用GPU预测,需要手动开启,设置运行的GPU卡号和分配的初始显存。可以设置开启TensorRT加速、开启IR优化、开启内存优化。使用Paddle-TensorRT相关说明和示例可以参考[文档](https://paddle-inference.readthedocs.io/en/master/optimize/paddle_trt.html)。 + +```cpp +paddle_infer::Config config; +if (FLAGS_model_dir == "") { +config.SetModel(FLAGS_model_file, FLAGS_params_file); // Load combined model +} else { +config.SetModel(FLAGS_model_dir); // Load no-combined model +} +config.EnableUseGpu(500, 0); +config.SwitchIrOptim(true); +config.EnableMemoryOptim(); +config.EnableTensorRtEngine(1 << 30, FLAGS_batch_size, 10, PrecisionType::kFloat32, false, false); +``` + +#### 1.1.5 创建Predictor + +```cpp +std::shared_ptr predictor = paddle_infer::CreatePredictor(config); +``` + +#### 1.1.6 设置输入 + +从Predictor中获取输入的names和handle,然后设置输入数据。 + +```cpp +auto input_names = predictor->GetInputNames(); +auto input_t = predictor->GetInputHandle(input_names[0]); +std::vector input_shape = {1, 3, 224, 224}; +std::vector input_data(1 * 3 * 224 * 224, 1); +input_t->Reshape(input_shape); +input_t->CopyFromCpu(input_data.data()); +``` + +#### 1.1.7 执行Predictor + +```cpp +predictor->Run(); +``` + +#### 1.1.8 获取输出 + +```cpp +auto output_names = predictor->GetOutputNames(); +auto output_t = predictor->GetOutputHandle(output_names[0]); +std::vector output_shape = output_t->shape(); +int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, + std::multiplies()); +std::vector out_data; +out_data.resize(out_num); +output_t->CopyToCpu(out_data.data()); +``` + +### 1.2 编译运行示例 + +#### 1.2.1 编译示例 + +文件`model_test.cc` 为预测的样例程序(程序中的输入为固定值,如果您有opencv或其他方式进行数据读取的需求,需要对程序进行一定的修改)。 +文件`CMakeLists.txt` 为编译构建文件。 +脚本`run_impl.sh` 包含了第三方库、预编译库的信息配置。 + +根据前面步骤下载Paddle预测库和mobilenetv1模型。 + +打开 `run_impl.sh` 文件,设置 LIB_DIR 为下载的预测库路径,比如 `LIB_DIR=/work/Paddle/build/paddle_inference_install_dir`。 + +运行 `sh run_impl.sh`, 会在当前目录下编译产生build目录。 + +#### 1.2.2 运行示例 + +进入build目录,运行样例。 + +```shell +cd build +./model_test --model_dir=mobilenetv1_fp32_dir +``` + +运行结束后,程序会将模型结果打印到屏幕,说明运行成功。 + +## 2 Python预测部署示例 + 
+Python预测部署示例代码在[链接](https://github.com/PaddlePaddle/Paddle-Inference-Demo/tree/master/python/cuda_linux_demo),下面从`流程解析`和`编译运行示例`两方面介绍。 + +### 2.1 流程解析 + +#### 2.1.1 准备环境 + +请参考[飞桨官网](https://www.paddlepaddle.org.cn/)安装2.0及以上版本的paddlepaddle-gpu。 + +Python安装opencv:`pip install opencv-python`。 + +#### 2.1.2 准备预测模型 + +使用Paddle训练结束后,得到预测模型,可以用于预测部署。 + +本示例准备了mobilenet_v1预测模型,可以从[链接](https://paddle-inference-dist.cdn.bcebos.com/PaddleInference/mobilenetv1_fp32.tar.gz)下载,或者wget下载。 + +```shell +wget https://paddle-inference-dist.cdn.bcebos.com/PaddleInference/mobilenetv1_fp32.tar.gz +tar zxf mobilenetv1_fp32.tar.gz +``` + +#### 2.1.3 Python导入 + +``` +from paddle.inference import Config +from paddle.inference import create_predictor +``` + +#### 2.1.4 设置Config + +根据预测部署的实际情况,设置Config,用于后续创建Predictor。 + +Config默认是使用CPU预测,若要使用GPU预测,需要手动开启,设置运行的GPU卡号和分配的初始显存。可以设置开启TensorRT加速、开启IR优化、开启内存优化。使用Paddle-TensorRT相关说明和示例可以参考[文档](https://paddle-inference.readthedocs.io/en/master/optimize/paddle_trt.html)。 + +```python +# args 是解析的输入参数 +if args.model_dir == "": + config = Config(args.model_file, args.params_file) +else: + config = Config(args.model_dir) +config.enable_use_gpu(500, 0) +config.switch_ir_optim() +config.enable_memory_optim() +config.enable_tensorrt_engine(workspace_size=1 << 30, precision_mode=AnalysisConfig.Precision.Float32,max_batch_size=1, min_subgraph_size=5, use_static=False, use_calib_mode=False) +``` + +#### 2.1.5 创建Predictor + +```python +predictor = create_predictor(config) +``` + +#### 2.1.6 设置输入 + +从Predictor中获取输入的names和handle,然后设置输入数据。 + +```python +img = cv2.imread(args.img_path) +img = preprocess(img) +input_names = predictor.get_input_names() +input_tensor = predictor.get_input_handle(input_names[0]) +input_tensor.reshape(img.shape) +input_tensor.copy_from_cpu(img.copy()) +``` + +#### 2.1.7 执行Predictor + +```python +predictor.run(); +``` + +#### 2.1.8 获取输出 + +```python +output_names = predictor.get_output_names() +output_tensor = predictor.get_output_handle(output_names[0]) +output_data = output_tensor.copy_to_cpu() +``` + +### 2.2 编译运行示例 + +文件`img_preprocess.py`是对图像进行预处理。 +文件`model_test.py`是示例程序。 + +参考前面步骤准备环境、下载预测模型。 + +下载预测图片。 + +```shell +wget https://paddle-inference-dist.bj.bcebos.com/inference_demo/python/resnet50/ILSVRC2012_val_00000247.jpeg +``` + +执行预测命令。 + +``` +python model_test.py --model_dir mobilenetv1_fp32 --img_path ILSVRC2012_val_00000247.jpeg +``` + +运行结束后,程序会将模型结果打印到屏幕,说明运行成功。 diff --git a/docs/demo_tutorial/cuda_linux_demo.md b/docs/demo_tutorial/cuda_linux_demo.md new file mode 100644 index 0000000000000..e38cb4e67d393 --- /dev/null +++ b/docs/demo_tutorial/cuda_linux_demo.md @@ -0,0 +1,213 @@ +# GPU上预测部署示例 + +## 1 C++预测部署示例 + +C++示例代码在[链接](https://github.com/PaddlePaddle/Paddle-Inference-Demo/tree/master/c%2B%2B/cuda_linux_demo),下面从`流程解析`和`编译运行示例`两方面介绍。 + +### 1.1 流程解析 + +#### 1.1.1 准备预测库 + +请参考[推理库下载文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/05_inference_deployment/inference/build_and_install_lib_cn.html)下载Paddle C++预测库,名称中带有 `cuda` 的为用于GPU的预测库。 + +#### 1.1.2 准备预测模型 + +使用Paddle训练结束后,得到预测模型,可以用于预测部署。 + +本示例准备了mobilenet_v1预测模型,可以从[链接](https://paddle-inference-dist.cdn.bcebos.com/PaddleInference/mobilenetv1_fp32.tar.gz)下载,或者wget下载。 + +```shell +wget https://paddle-inference-dist.cdn.bcebos.com/PaddleInference/mobilenetv1_fp32.tar.gz +``` + +#### 1.1.3 包含头文件 + +使用Paddle预测库,只需要包含 `paddle_inference_api.h` 头文件。 + +```cpp +#include "paddle/include/paddle_inference_api.h" +``` + +#### 1.1.4 设置Config + 
+根据预测部署的实际情况,设置Config,用于后续创建Predictor。 + +Config默认是使用CPU预测,若要使用GPU预测,需要手动开启,设置运行的GPU卡号和分配的初始显存。可以设置开启TensorRT加速、开启IR优化、开启内存优化。使用Paddle-TensorRT相关说明和示例可以参考[文档](https://paddle-inference.readthedocs.io/en/master/optimize/paddle_trt.html)。 + +```cpp +paddle_infer::Config config; +if (FLAGS_model_dir == "") { +config.SetModel(FLAGS_model_file, FLAGS_params_file); // Load combined model +} else { +config.SetModel(FLAGS_model_dir); // Load no-combined model +} +config.EnableUseGpu(500, 0); +config.SwitchIrOptim(true); +config.EnableMemoryOptim(); +config.EnableTensorRtEngine(1 << 30, FLAGS_batch_size, 10, PrecisionType::kFloat32, false, false); +``` + +#### 1.1.5 创建Predictor + +```cpp +std::shared_ptr predictor = paddle_infer::CreatePredictor(config); +``` + +#### 1.1.6 设置输入 + +从Predictor中获取输入的names和handle,然后设置输入数据。 + +```cpp +auto input_names = predictor->GetInputNames(); +auto input_t = predictor->GetInputHandle(input_names[0]); +std::vector input_shape = {1, 3, 224, 224}; +std::vector input_data(1 * 3 * 224 * 224, 1); +input_t->Reshape(input_shape); +input_t->CopyFromCpu(input_data.data()); +``` + +#### 1.1.7 执行Predictor + +```cpp +predictor->Run(); +``` + +#### 1.1.8 获取输出 + +```cpp +auto output_names = predictor->GetOutputNames(); +auto output_t = predictor->GetOutputHandle(output_names[0]); +std::vector output_shape = output_t->shape(); +int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, + std::multiplies()); +std::vector out_data; +out_data.resize(out_num); +output_t->CopyToCpu(out_data.data()); +``` + +### 1.2 编译运行示例 + +#### 1.2.1 编译示例 + +文件`model_test.cc` 为预测的样例程序(程序中的输入为固定值,如果您有opencv或其他方式进行数据读取的需求,需要对程序进行一定的修改)。 +文件`CMakeLists.txt` 为编译构建文件。 +脚本`run_impl.sh` 包含了第三方库、预编译库的信息配置。 + +根据前面步骤下载Paddle预测库和mobilenetv1模型。 + +打开 `run_impl.sh` 文件,设置 LIB_DIR 为下载的预测库路径,比如 `LIB_DIR=/work/Paddle/build/paddle_inference_install_dir`。 + +运行 `sh run_impl.sh`, 会在当前目录下编译产生build目录。 + +#### 1.2.2 运行示例 + +进入build目录,运行样例。 + +```shell +cd build +./model_test --model_dir=mobilenetv1_fp32_dir +``` + +运行结束后,程序会将模型结果打印到屏幕,说明运行成功。 + +## 2 Python预测部署示例 + +Python预测部署示例代码在[链接](https://github.com/PaddlePaddle/Paddle-Inference-Demo/tree/master/python/cuda_linux_demo),下面从`流程解析`和`编译运行示例`两方面介绍。 + +### 2.1 流程解析 + +#### 2.1.1 准备环境 + +请参考[飞桨官网](https://www.paddlepaddle.org.cn/)安装2.0及以上版本的paddlepaddle-gpu。 + +Python安装opencv:`pip install opencv-python`。 + +#### 2.1.2 准备预测模型 + +使用Paddle训练结束后,得到预测模型,可以用于预测部署。 + +本示例准备了mobilenet_v1预测模型,可以从[链接](https://paddle-inference-dist.cdn.bcebos.com/PaddleInference/mobilenetv1_fp32.tar.gz)下载,或者wget下载。 + +```shell +wget https://paddle-inference-dist.cdn.bcebos.com/PaddleInference/mobilenetv1_fp32.tar.gz +tar zxf mobilenetv1_fp32.tar.gz +``` + +#### 2.1.3 Python导入 + +``` +from paddle.inference import Config +from paddle.inference import create_predictor +``` + +#### 2.1.4 设置Config + +根据预测部署的实际情况,设置Config,用于后续创建Predictor。 + +Config默认是使用CPU预测,若要使用GPU预测,需要手动开启,设置运行的GPU卡号和分配的初始显存。可以设置开启TensorRT加速、开启IR优化、开启内存优化。使用Paddle-TensorRT相关说明和示例可以参考[文档](https://paddle-inference.readthedocs.io/en/master/optimize/paddle_trt.html)。 + +```python +# args 是解析的输入参数 +if args.model_dir == "": + config = Config(args.model_file, args.params_file) +else: + config = Config(args.model_dir) +config.enable_use_gpu(500, 0) +config.switch_ir_optim() +config.enable_memory_optim() +config.enable_tensorrt_engine(workspace_size=1 << 30, precision_mode=AnalysisConfig.Precision.Float32,max_batch_size=1, min_subgraph_size=5, use_static=False, use_calib_mode=False) +``` + +#### 2.1.5 创建Predictor + +```python 
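+# Note: every Config option must be set before the call below; changing the
+# config afterwards does not affect a predictor that has already been created.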
+predictor = create_predictor(config) +``` + +#### 2.1.6 设置输入 + +从Predictor中获取输入的names和handle,然后设置输入数据。 + +```python +img = cv2.imread(args.img_path) +img = preprocess(img) +input_names = predictor.get_input_names() +input_tensor = predictor.get_input_handle(input_names[0]) +input_tensor.reshape(img.shape) +input_tensor.copy_from_cpu(img.copy()) +``` + +#### 2.1.7 执行Predictor + +```python +predictor.run(); +``` + +#### 2.1.8 获取输出 + +```python +output_names = predictor.get_output_names() +output_tensor = predictor.get_output_handle(output_names[0]) +output_data = output_tensor.copy_to_cpu() +``` + +### 2.2 编译运行示例 + +文件`img_preprocess.py`是对图像进行预处理。 +文件`model_test.py`是示例程序。 + +参考前面步骤准备环境、下载预测模型。 + +下载预测图片。 + +```shell +wget https://paddle-inference-dist.bj.bcebos.com/inference_demo/python/resnet50/ILSVRC2012_val_00000247.jpeg +``` + +执行预测命令。 + +``` +python model_test.py --model_dir mobilenetv1_fp32 --img_path ILSVRC2012_val_00000247.jpeg +``` + +运行结束后,程序会将模型结果打印到屏幕,说明运行成功。 diff --git a/docs/demo_tutorial/cuda_windows_demo.md b/docs/demo_tutorial/cuda_windows_demo.md new file mode 100644 index 0000000000000..db9d0814c07fb --- /dev/null +++ b/docs/demo_tutorial/cuda_windows_demo.md @@ -0,0 +1,229 @@ +# Windows上GPU预测部署示例 + +## 1 C++预测部署示例 + +C++示例代码在[链接](https://github.com/PaddlePaddle/Paddle-Inference-Demo/tree/master/c%2B%2B/cuda_linux_demo),下面从`流程解析`和`编译运行示例`两方面介绍。 + +### 1.1 流程解析 + +#### 1.1.1 准备预测库 + +请参考[推理库下载文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/05_inference_deployment/inference/windows_cpp_inference.html)下载windows平台的Paddle GPU C++预测库。 + +#### 1.1.2 准备预测模型 + +使用Paddle训练结束后,得到预测模型,可以用于预测部署。 + +本示例准备了mobilenet_v1预测模型,可以从[链接](https://paddle-inference-dist.cdn.bcebos.com/PaddleInference/mobilenetv1_fp32.tar.gz)下载,或者wget下载。 + +```shell +wget https://paddle-inference-dist.cdn.bcebos.com/PaddleInference/mobilenetv1_fp32.tar.gz +``` + +#### 1.1.3 包含头文件 + +使用Paddle预测库,只需要包含 `paddle_inference_api.h` 头文件。 + +```cpp +#include "paddle/include/paddle_inference_api.h" +``` + +#### 1.1.4 设置Config + +根据预测部署的实际情况,设置Config,用于后续创建Predictor。 + +Config默认是使用CPU预测,若要使用GPU预测,需要手动开启,设置运行的GPU卡号和分配的初始显存。可以设置开启IR优化、开启内存优化。 + +```cpp +paddle_infer::Config config; +if (FLAGS_model_dir == "") { +config.SetModel(FLAGS_model_file, FLAGS_params_file); // Load combined model +} else { +config.SetModel(FLAGS_model_dir); // Load no-combined model +} +config.EnableUseGpu(500, 0); +config.SwitchIrOptim(true); +config.EnableMemoryOptim(); +``` + +#### 1.1.5 创建Predictor + +```cpp +std::shared_ptr predictor = paddle_infer::CreatePredictor(config); +``` + +#### 1.1.6 设置输入 + +从Predictor中获取输入的names和handle,然后设置输入数据。 + +```cpp +auto input_names = predictor->GetInputNames(); +auto input_t = predictor->GetInputHandle(input_names[0]); +std::vector input_shape = {1, 3, 224, 224}; +std::vector input_data(1 * 3 * 224 * 224, 1); +input_t->Reshape(input_shape); +input_t->CopyFromCpu(input_data.data()); +``` + +#### 1.1.7 执行Predictor + +```cpp +predictor->Run(); +``` + +#### 1.1.8 获取输出 + +```cpp +auto output_names = predictor->GetOutputNames(); +auto output_t = predictor->GetOutputHandle(output_names[0]); +std::vector output_shape = output_t->shape(); +int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, + std::multiplies()); +std::vector out_data; +out_data.resize(out_num); +output_t->CopyToCpu(out_data.data()); +``` + +### 1.2 编译运行示例 + +#### 1.2.1 编译示例 + +文件`model_test.cc` 为预测的样例程序(程序中的输入为固定值,如果您有opencv或其他方式进行数据读取的需求,需要对程序进行一定的修改)。 +文件`CMakeLists.txt` 为编译构建文件。 + 
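+
+> Note: as an alternative to the cmake-gui workflow described below, the project can also be configured and built from an "x64 Native Tools Command Prompt for VS". The commands below are only a sketch; the generator name and every path are placeholders that must be adapted to your environment:
+
+```shell
+REM Configure the Visual Studio solution (generator and paths are examples).
+cmake .. -G "Visual Studio 15 2017 Win64" ^
+  -DWITH_MKL=ON -DWITH_GPU=ON -DUSE_TENSORRT=OFF ^
+  -DDEMO_NAME=model_test ^
+  -DPADDLE_LIB=D:\paddle_inference_install_dir ^
+  -DCUDA_LIB="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\lib\x64"
+
+REM Build the Release configuration.
+cmake --build . --config Release
+```
+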
+根据前面步骤下载Paddle预测库和mobilenetv1模型。 + +使用cmake-gui程序生成vs工程: + +- 选择源代码路径,及编译产物路径,如图所示 + +![win_x86_cpu_cmake_1](./images/win_x86_cpu_cmake_1.png) + +- 点击Configure,选择Visual Studio且选择x64版本如图所示,点击Finish,由于我们没有加入必要的CMake Options,会导致configure失败,请继续下一步。 + +![win_x86_cpu_cmake_2](./images/win_x86_cpu_cmake_2.png) + +- 设置CMake Options,点击Add Entry,新增PADDLE_LIB,CMAKE_BUILD_TYPE,DEMO_NAME等选项。具体配置项如下图所示,其中PADDLE_LIB为您下载的预测库路径。 + +![win_x86_cpu_cmake_3](./images/win_x86_cpu_cmake_3.png) + +- 点击Configure,log信息显示Configure done代表配置成功,接下来点击Generate生成vs工程,log信息显示Generate done,代表生成成功,最后点击Open Project打开Visual Studio. + +- 设置为Release/x64,编译,编译产物在build/Release目录下。 + +![win_x86_cpu_vs_1](./images/win_x86_cpu_vs_1.png) + +#### 1.2.2 运行示例 + +首先设置model_test工程为启动首选项。 + +![win_x86_cpu_vs_2](./images/win_x86_cpu_vs_2.png) + +配置输入flags,即设置您之前下载的模型路径。点击Debug选项卡的`model_test Properities..` + +![win_x86_cpu_vs_3](./images/win_x86_cpu_vs_3.png) + +点击Debug选项卡下的Start Without Debugging选项开始执行程序。 + +![win_x86_cpu_vs_4](./images/win_x86_cpu_vs_4.png) + +## 2 Python预测部署示例 + +Python预测部署示例代码在[链接](https://github.com/PaddlePaddle/Paddle-Inference-Demo/tree/master/python/x86_linux_demo),下面从`流程解析`和`编译运行示例`两方面介绍。 + +### 2.1 流程解析 + +#### 2.1.1 准备环境 + +请参考[飞桨官网](https://www.paddlepaddle.org.cn/)安装2.0及以上版本的paddlepaddle-gpu。 + +Python安装opencv:`pip install opencv-python`。 + +#### 2.1.2 准备预测模型 + +使用Paddle训练结束后,得到预测模型,可以用于预测部署。 + +本示例准备了mobilenet_v1预测模型,可以从[链接](https://paddle-inference-dist.cdn.bcebos.com/PaddleInference/mobilenetv1_fp32.tar.gz)下载,或者wget下载。 + +```shell +wget https://paddle-inference-dist.cdn.bcebos.com/PaddleInference/mobilenetv1_fp32.tar.gz +tar zxf mobilenetv1_fp32.tar.gz +``` + +#### 2.1.3 Python导入 + +``` +from paddle.inference import Config +from paddle.inference import create_predictor +``` + +#### 2.1.4 设置Config + +根据预测部署的实际情况,设置Config,用于后续创建Predictor。 + +Config默认是使用CPU预测,若要使用GPU预测,需要手动开启,设置运行的GPU卡号和分配的初始显存。可以设置开启IR优化、开启内存优化。 + +```python +# args 是解析的输入参数 +if args.model_dir == "": + config = Config(args.model_file, args.params_file) +else: + config = Config(args.model_dir) +config.enable_use_gpu(500, 0) +config.switch_ir_optim() +config.enable_memory_optim() +``` + +#### 2.1.5 创建Predictor + +```python +predictor = create_predictor(config) +``` + +#### 2.1.6 设置输入 + +从Predictor中获取输入的names和handle,然后设置输入数据。 + +```python +img = cv2.imread(args.img_path) +img = preprocess(img) +input_names = predictor.get_input_names() +input_tensor = predictor.get_input_handle(input_names[0]) +input_tensor.reshape(img.shape) +input_tensor.copy_from_cpu(img.copy()) +``` + +#### 2.1.7 执行Predictor + +```python +predictor.run(); +``` + +#### 2.1.8 获取输出 + +```python +output_names = predictor.get_output_names() +output_tensor = predictor.get_output_handle(output_names[0]) +output_data = output_tensor.copy_to_cpu() +``` + +### 2.2 编译运行示例 + +文件`img_preprocess.py`是对图像进行预处理。 +文件`model_test.py`是示例程序。 + +参考前面步骤准备环境、下载预测模型。 + +下载预测图片。 + +```shell +wget https://paddle-inference-dist.bj.bcebos.com/inference_demo/python/resnet50/ILSVRC2012_val_00000247.jpeg +``` + +执行预测命令。 + +``` +python model_test.py --model_dir mobilenetv1_fp32 --img_path ILSVRC2012_val_00000247.jpeg +``` + +运行结束后,程序会将模型结果打印到屏幕,说明运行成功。 diff --git a/python/cuda_linux_demo/README.md b/python/cuda_linux_demo/README.md new file mode 100644 index 0000000000000..2fdf9a5db03b5 --- /dev/null +++ b/python/cuda_linux_demo/README.md @@ -0,0 +1,99 @@ +# GPU上Python预测部署示例 + +## 1.1 流程解析 + +1) 准备环境 + +请参考[飞桨官网](https://www.paddlepaddle.org.cn/)安装2.0及以上版本的paddlepaddle-gpu。 + +Python安装opencv:`pip install 
opencv-python`。
+
+2)准备预测模型
+
+使用Paddle训练结束后,得到预测模型,可以用于预测部署。
+
+本示例准备了mobilenet_v1预测模型,可以从[链接](https://paddle-inference-dist.cdn.bcebos.com/PaddleInference/mobilenetv1_fp32.tar.gz)下载,或者wget下载。
+
+```shell
+wget https://paddle-inference-dist.cdn.bcebos.com/PaddleInference/mobilenetv1_fp32.tar.gz
+tar zxf mobilenetv1_fp32.tar.gz
+```
+
+3)Python导入
+
+```
+from paddle.inference import Config
+from paddle.inference import PrecisionType
+from paddle.inference import create_predictor
+```
+
+4) 设置Config
+
+根据预测部署的实际情况,设置Config,用于后续创建Predictor。
+
+Config默认是使用CPU预测,若要使用GPU预测,需要手动开启,设置运行的GPU卡号和分配的初始显存。可以设置开启TensorRT加速、开启IR优化、开启内存优化。使用Paddle-TensorRT相关说明和示例可以参考[文档](https://paddle-inference.readthedocs.io/en/master/optimize/paddle_trt.html)。
+
+```python
+# args 是解析的输入参数
+if args.model_dir == "":
+    config = Config(args.model_file, args.params_file)
+else:
+    config = Config(args.model_dir)
+config.enable_use_gpu(500, 0)
+config.switch_ir_optim()
+config.enable_memory_optim()
+config.enable_tensorrt_engine(workspace_size=1 << 30,
+                              precision_mode=PrecisionType.Float32,
+                              max_batch_size=1,
+                              min_subgraph_size=5,
+                              use_static=False,
+                              use_calib_mode=False)
+```
+
+5) 创建Predictor
+
+```python
+predictor = create_predictor(config)
+```
+
+6) 设置输入
+
+从Predictor中获取输入的names和handle,然后设置输入数据。
+
+```python
+img = cv2.imread(args.img_path)
+img = preprocess(img)
+input_names = predictor.get_input_names()
+input_tensor = predictor.get_input_handle(input_names[0])
+input_tensor.reshape(img.shape)
+input_tensor.copy_from_cpu(img.copy())
+```
+
+7) 执行Predictor
+
+```python
+predictor.run()
+```
+
+8) 获取输出
+
+```python
+output_names = predictor.get_output_names()
+output_tensor = predictor.get_output_handle(output_names[0])
+output_data = output_tensor.copy_to_cpu()
+```
+
+## 1.2 编译运行示例
+
+文件`img_preprocess.py`是对图像进行预处理。
+文件`model_test.py`是示例程序。
+
+参考前面步骤准备环境、下载预测模型。
+
+下载预测图片。
+
+```shell
+wget https://paddle-inference-dist.bj.bcebos.com/inference_demo/python/resnet50/ILSVRC2012_val_00000247.jpeg
+```
+
+执行预测命令。
+
+```
+python model_test.py --model_dir mobilenetv1_fp32 --img_path ILSVRC2012_val_00000247.jpeg
+```
+
+运行结束后,程序会将模型结果打印到屏幕,说明运行成功。
diff --git a/python/cuda_linux_demo/img_preprocess.py b/python/cuda_linux_demo/img_preprocess.py
new file mode 100644
index 0000000000000..34321de022caa
--- /dev/null
+++ b/python/cuda_linux_demo/img_preprocess.py
@@ -0,0 +1,41 @@
+import cv2
+import numpy as np
+
+
+def resize_short(img, target_size):
+    """Resize the short side of img to target_size, keeping the aspect ratio."""
+    percent = float(target_size) / min(img.shape[0], img.shape[1])
+    resized_width = int(round(img.shape[1] * percent))
+    resized_height = int(round(img.shape[0] * percent))
+    resized = cv2.resize(img, (resized_width, resized_height))
+    return resized
+
+
+def crop_image(img, target_size, center):
+    """Crop a target_size x target_size patch, centered or at a random position."""
+    height, width = img.shape[:2]
+    size = target_size
+    if center:
+        w_start = (width - size) / 2
+        h_start = (height - size) / 2
+    else:
+        w_start = np.random.randint(0, width - size + 1)
+        h_start = np.random.randint(0, height - size + 1)
+    w_end = w_start + size
+    h_end = h_start + size
+    img = img[int(h_start):int(h_end), int(w_start):int(w_end), :]
+    return img
+
+
+def preprocess(img):
+    """Turn a BGR uint8 image into a normalized NCHW float32 batch of size 1."""
+    mean = [0.485, 0.456, 0.406]
+    std = [0.229, 0.224, 0.225]
+    img = resize_short(img, 224)
+    img = crop_image(img, 224, True)
+    # bgr -> rgb && hwc -> chw
+    img = img[:, :, ::-1].astype('float32').transpose((2, 0, 1)) / 255
+    img_mean = np.array(mean).reshape((3, 1, 1))
+    img_std = np.array(std).reshape((3, 1, 1))
+    img -= img_mean
+    img /= img_std
+    return img[np.newaxis, :]
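+
+
+if __name__ == '__main__':
+    # Optional smoke test, not part of the original demo: read an image given
+    # on the command line and check the shape of the preprocessed tensor,
+    # which should be (1, 3, 224, 224) for the mobilenet_v1 example.
+    import sys
+    data = preprocess(cv2.imread(sys.argv[1]))
+    print('preprocessed tensor:', data.shape, data.dtype)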
diff --git a/python/cuda_linux_demo/model_test.py b/python/cuda_linux_demo/model_test.py
new file mode 100644
index 0000000000000..d2dad951fee62
--- /dev/null
+++ b/python/cuda_linux_demo/model_test.py
@@ -0,0 +1,74 @@
+import numpy as np
+import argparse
+import cv2
+
+from paddle.inference import Config
+from paddle.inference import PrecisionType
+from paddle.inference import create_predictor
+from img_preprocess import preprocess
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model_dir",
+        type=str,
+        default="",
+        help="Model dir. If you load a non-combined model, specify the directory of the model."
+    )
+    parser.add_argument(
+        "--model_file",
+        type=str,
+        default="",
+        help="Model filename. Specify this when your model is a combined model."
+    )
+    parser.add_argument(
+        "--params_file",
+        type=str,
+        default="",
+        help="Parameter filename. Specify this when your model is a combined model."
+    )
+    parser.add_argument(
+        "--img_path", type=str, default="", help="Input image path.")
+    parser.add_argument(
+        "--threads", type=int, default=1, help="Number of threads.")
+    return parser.parse_args()
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    assert (args.model_dir != "") or \
+        (args.model_file != "" and args.params_file != ""), \
+        "Set model path error."
+    assert args.img_path != "", "Set img_path error."
+
+    # Init config
+    if args.model_dir == "":
+        config = Config(args.model_file, args.params_file)
+    else:
+        config = Config(args.model_dir)
+    config.enable_use_gpu(500, 0)
+    config.switch_ir_optim()
+    config.enable_memory_optim()
+    config.enable_tensorrt_engine(workspace_size=1 << 30,
+                                  precision_mode=PrecisionType.Float32,
+                                  max_batch_size=1,
+                                  min_subgraph_size=5,
+                                  use_static=False,
+                                  use_calib_mode=False)
+
+    # Create predictor
+    predictor = create_predictor(config)
+
+    # Set input
+    img = cv2.imread(args.img_path)
+    img = preprocess(img)
+    input_names = predictor.get_input_names()
+    input_tensor = predictor.get_input_handle(input_names[0])
+    input_tensor.reshape(img.shape)
+    input_tensor.copy_from_cpu(img.copy())
+
+    # Run
+    predictor.run()
+
+    # Get output
+    output_names = predictor.get_output_names()
+    output_tensor = predictor.get_output_handle(output_names[0])
+    output_data = output_tensor.copy_to_cpu()
+
+    print("Predict class index: ", np.argmax(output_data))
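+
+    # Optional extension, not part of the original demo: convert the raw
+    # scores to softmax probabilities and print the top-5 class indices.
+    # If the exported model already ends with a softmax layer, this does not
+    # change the ranking.
+    probs = np.exp(output_data - np.max(output_data))
+    probs /= np.sum(probs)
+    top5 = np.argsort(probs.flatten())[-5:][::-1]
+    for idx in top5:
+        print("class index: {}, prob: {:.4f}".format(idx, probs.flatten()[idx]))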